Compare commits

...

185 Commits

Author SHA1 Message Date
J-B Orfila
d0937aae20 pbs count 2024-03-20 18:28:02 +01:00
Arthur Meyre
e81152a630 chore(tfhe): remove last remaining modular_std_dev
- some places were not updated, remove the last non modular std_dev
- the ones to dump parameters are modular so are kept
2024-03-12 11:12:40 +01:00
Pedro Alves
8c4675dc3e fix(gpu): fix a bug in integer multiplication 2024-03-12 09:57:39 +01:00
Pedro Alves
29fb4fbe77 chore(gpu): refactor low-latency and multi-bit PBSs so the buffer is a structured object 2024-03-12 09:57:39 +01:00
Agnes Leroy
f84c34c903 feat(gpu): signed scalar add 2024-03-11 14:49:39 +01:00
dependabot[bot]
cc905a04c7 chore(deps): bump tj-actions/changed-files from 42.0.5 to 42.1.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.5 to 42.1.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](800a282599...aa08304bd4)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-11 09:30:41 +01:00
Arthur Meyre
3fc791e813 chore(ci): to avoid stack overflow crashes increase thread stack size
- Default Linux thread stack size seems to be 8 MB, rust limits it to 2 MB
by default, change that to avoid tests failing because of overflowed stacks
2024-03-08 19:49:22 +01:00
Pedro Alves
d4f8fb8f57 feat(gpu): Implement benchmark for low latency and amortized PBS in all variants and the FFT 2024-03-08 14:04:53 -03:00
Pedro Alves
68ce43d2f0 feat(gpu): implement custom benchmarks 2024-03-08 14:04:53 -03:00
Arthur Meyre
c5b9e5400a chore(ci): make sure dev_bench is checked by clippy
- removed the experimental feature requirement
2024-03-08 10:56:36 +01:00
David Testé
8167c85764 chore(bench): reduce measurement duration to 60 for pbs benchmarks
This is done to speed-up benchmark duration.
2024-03-08 09:16:17 +01:00
tmontaigu
98bd45503c chore(hlapi): add some GPU test for FheUint
Tests are not complete yet, but it's the first step to get there
2024-03-07 20:08:11 +01:00
Agnes Leroy
ed50042719 feat(gpu): signed mul with tests and benchmarks 2024-03-07 15:37:52 +01:00
David Testé
053d56a3d6 chore(ci): format benchmark results parser with black 2024-03-07 13:33:46 +01:00
David Testé
e5b117ca29 chore(ci): handle new name format to get pbs throughput values
core_crypto benchmark name format has been changed to reflect
what's used in other layers. Benchmark result parser was no longer
able to compute the right value for the PBS throughput.
2024-03-07 13:33:46 +01:00
tmontaigu
9de486f33c chore(integer): move & harden sub/neg tests
Also start making non parallel test use test cases
2024-03-07 10:38:27 +01:00
Arthur Meyre
ccf879c9ae refactor(tfhe): plug NoiseDistribution in the various APIs 2024-03-07 10:24:15 +01:00
Mayeul@Zama
d3c1f91948 test(shortint): add oprf deterministic test 2024-03-06 17:19:05 +01:00
Arthur Meyre
273dbe1b85 chore(core): make torus_modular_diff safer to use 2024-03-06 15:54:06 +01:00
Agnes Leroy
7ac061266f feat(gpu): signed sub and neg with tests and benchmarks
Refactor tests in the meanwhile to avoid huge tests files.
2024-03-06 15:53:51 +01:00
Agnes Leroy
c1c56ab770 fix(gpu): fix memory bug in multi-bit PBS 2024-03-06 14:18:29 +01:00
Pedro Alves
00dad37812 chore(gpu): replace recommended lwe_chunk_size for NVIDIA Tesla H100 GPUs 2024-03-06 07:10:22 -03:00
Arthur Meyre
f94533d70d chore(ci): fix CUDA_PATH bin not being exported in GITHUB_PATH 2024-03-06 09:22:45 +01:00
David Testé
b7d7e68d0c chore(ci): run static linter on workflows 2024-03-05 15:00:09 +01:00
David Testé
e8135c207d chore(ci): fix lint errors in workflows 2024-03-05 15:00:09 +01:00
Arthur Meyre
601b200351 chore(ci): fix workflows, missing leading $, skipped does not exist
- avoid spamming if cancelled
2024-03-04 18:19:46 +01:00
Arthur Meyre
a0d5bf2fc2 feat(core): switch GLWE primitives to the new noise distribution system 2024-03-04 15:01:25 +01:00
dependabot[bot]
58223dea09 chore(deps): bump tj-actions/changed-files from 42.0.4 to 42.0.5
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.4 to 42.0.5.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](3f54ebb830...800a282599)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-04 09:59:53 +01:00
dependabot[bot]
1f3096b743 chore(deps): bump codecov/codecov-action from 3.1.5 to 4.1.0
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.5 to 4.1.0.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/codecov/codecov-action/compare/v3.1.5...54bcd8715eee62d40e33596ef5e8f0f48dbbccab)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-03-04 09:59:15 +01:00
Agnes Leroy
4a3d5d432a chore(gpu): fix integer bench workflows 2024-03-04 09:30:10 +01:00
Agnes Leroy
c6bfcd75a6 chore(gpu): add pbs throughput benchmarks 2024-03-04 09:30:10 +01:00
Agnes Leroy
85dfd70c6b chore(bench): fully load the cpu for throughput benches 2024-03-04 09:30:10 +01:00
Agnes Leroy
c720656340 chore(gpu): bench signed add on gpu 2024-03-04 09:26:34 +01:00
Agnes Leroy
1c209403a6 feat(gpu): signed addition 2024-03-04 09:26:34 +01:00
tmontaigu
347fc9aaa7 chore(hlapi): add cuda tests for FheBool 2024-03-01 17:17:37 +01:00
tmontaigu
198485b5fb feat(hlapi): bin cuda scalar_eq/ne on FheBool 2024-03-01 17:17:37 +01:00
David Testé
bd7547c93d chore(bench): benchmark 4 bits integer operations 2024-03-01 14:39:58 +01:00
Arthur Meyre
955495d714 refactor(core): change layout of compact public key encryption for LWE list
- this makes sure the product computed for the first ciphertext matches the
product computed for a single ciphertext in the non list case

BREAKING CHANGE:
all previous compact public key list encryptions are not compatible with
the new layout
2024-03-01 11:05:04 +01:00
David Testé
902755c33c feat(core_crypto): add parallelized pfpks with lwe ciphertext list 2024-02-29 18:05:57 +01:00
Arthur Meyre
89f845fa4f refactor(tfhe): use dynamic noise distributions for LWE primitives 2024-02-29 18:05:12 +01:00
Arthur Meyre
9f89d2c09d chore(core): lighten the bound to be generable from a Gaussian distribution 2024-02-29 18:05:12 +01:00
Arthur Meyre
ea0d146ed0 chore(core): add missing unsigned integer slice add noise primitives 2024-02-29 18:05:12 +01:00
tmontaigu
943ccdf450 chore(integer): harden unsigned add tests
This adds degrees and noise levels checks as well as comparing
individual decrypted block values with their degrees.
2024-02-29 17:24:25 +01:00
tmontaigu
f39896ac63 refactor(integer): start refactoring tests
This starts splitting the long test radix tests files into
smaller ones, starting with the add family of function.
2024-02-29 17:24:25 +01:00
Pedro Alves
46a87c6f89 fix(gpu): fix scalar eq for booleans 2024-02-29 11:51:49 +01:00
David Testé
a5579532be chore(ci): add product cost for rtx4090 to compute throughput
RTX4090 we're using here is owned by Zama. So we don't pay an
hourly rate to AWS per se. But in order to compute throughput on
benchmarks results, the parser needs a numeric value corresponding
to the hardware used. Ops-per-dollar metric is not really used
today conversely ops-per-seconds is.
In the end we use an approximation of the cost for electrical
consumption.
2024-02-28 15:53:08 +01:00
Agnes Leroy
41e1781226 chore(gpu): move ciphertext info to dedicated file 2024-02-28 09:02:36 +01:00
Agnes Leroy
697ce94ee2 chore(gpu): remove duplicated test params 2024-02-28 09:02:21 +01:00
Arthur Meyre
a667b654ef chore(tfhe): use div_ceil now that MSRV is 1.73 2024-02-27 18:35:54 +01:00
Arthur Meyre
1bff07b6eb chore(tfhe): update rust MSRV to 1.73 2024-02-27 18:35:54 +01:00
David Testé
59664e84c8 chore(bench): format core_crypto benchmark names to ease parsing 2024-02-27 18:05:35 +01:00
Agnes Leroy
79dc101728 chore(gpu): fix 4090 bench workflow 2024-02-27 17:46:20 +01:00
Arthur Meyre
6828438898 chore(tfhe): bump version to 0.6.0 2024-02-27 13:24:10 +01:00
Arthur Meyre
a8f4cf7c29 chore(cuda): bump backend version to 0.2.0 2024-02-27 13:24:10 +01:00
David Testé
30d2f5f66d chore(ci): add coverage build make recipe 2024-02-27 09:29:03 +01:00
David Testé
112cc6f6c9 chore(ci): remove private feature __coverage to use tarpaulin cfg 2024-02-27 09:29:03 +01:00
David Testé
93581f7ee1 chore(ci): add integer layer to code coverage
Special cryptographic parameter sets have been created to
speed up test execution in coverage mode. These parameter sets are
*NOT* guaranteed to be secure nor to yield correct results.
2024-02-27 09:29:03 +01:00
David Testé
6e08e91109 chore(ci): checkout repo with fetch-depth 0 to get commit hash
The COMMIT_HASH computed variable needs fetch-depth=0 to be able
to get the versions of the repository.
2024-02-27 08:50:38 +01:00
Agnes Leroy
75f0ad1d4b chore(gpu): add core crypto benches to 4090 bench workflow 2024-02-27 08:50:38 +01:00
Arthur Meyre
618758bd95 fix(core): fix unsigned noise addition for custom modulus 2024-02-26 22:19:01 +01:00
Arthur Meyre
d770a271b3 chore(core): add custom power of 2 support for u128 2024-02-26 22:19:01 +01:00
David Testé
80468494b2 chore(ci): lock version of lattice-estimator in workflow
The latest version of lattice-estimator produces overflow errors.
We force the checkout to the last working version to avoid a red
CI.
2024-02-26 22:18:06 +01:00
Pedro Alves
26e5af542f feat(gpu): Reintroduce a tool to independently test PBS, Keyswitch, and fft at C++ side. 2024-02-26 13:44:32 -03:00
Arthur Meyre
f23b4f21dc chore(core): remove the possibility to seed the NoiseRandomGenerator
- to further avoid misuse, now the NoiseRandomGenerator itself requires a
seeder
- removed the possibility to re-seed the noise generator, even in tests, we
now have access to deterministic seeders which did not use to be the case
2024-02-26 13:28:17 +01:00
Arthur Meyre
b394da3dbb chore(tfhe): remove unused distributions 2024-02-26 13:28:17 +01:00
Arthur Meyre
6007cd2c81 chore(core): refactor byte counts for runtime noise distribution choice
- we will want to be able to choose a noise distribution at runtime and not
keep a hard coded gaussian, we therefore need to be able to adapt to the
number of bytes a distribution may require to properly generate a sample
2024-02-26 13:28:17 +01:00
Arthur Meyre
a6fdc46794 chore(core): rename Encryption RNG primitives to match noise distribution
- we are shifting to non hardcoded noise distributions for encryption,
rename functions for mask and noise generation to indicate which hard coded
distribution was used initially
2024-02-26 13:28:17 +01:00
dependabot[bot]
0134a4a0f2 chore(deps): bump codecov/codecov-action from 4.0.1 to 4.0.2
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 4.0.1 to 4.0.2.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](e0b68c6749...0cfda1dd0a)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-26 13:27:24 +01:00
dependabot[bot]
68dfd96993 chore(deps): bump tj-actions/changed-files from 42.0.3 to 42.0.4
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.3 to 42.0.4.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](ec75ae5ab7...3f54ebb830)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-26 13:27:07 +01:00
David Testé
6811177178 chore(ci): fix missing backslash in rtx benchmark workflow
This missing backslash causes the Python command to fail since
some input arguments are missing.
2024-02-26 09:32:02 +01:00
Pedro Alves
753c7aa0d2 chore(gpu): minor improvement on the LUT generation function and in
are_all_comparisons_block_true()
2024-02-24 08:49:59 +01:00
tmontaigu
f38a9a9b4c feat(integer): add ilog2 and checked_ilog2 2024-02-23 18:55:34 +01:00
Agnes Leroy
c7f6eb0119 chore(gpu): change cudavec and cudastream 2024-02-23 15:04:12 +01:00
David Testé
85da12c00f chore(ci): run gpu benchmarks on rtx4090 every friday
Also increase timeout value to ensure benchmarks could last more
than 6 hours to execute.
2024-02-23 12:33:14 +01:00
Arthur Meyre
20b1427f72 chore(ci): fix cuda clippy targets
- missing feature meant some benchmarks were not linted
- add all targets for the cuda backend
2024-02-22 17:02:04 +01:00
tmontaigu
716677f383 feat(capi): allow control of threading 2024-02-22 12:08:46 +01:00
Arthur Meyre
d09e5ab066 feat(core): add TUniform distribution to core_crypto
- mutualize the distribution test between Uniform and TUniform, as both are
distributions with finite support (unlike the gaussian which needs its own
distribution test)
- the distribution test requires that the values can be mapped to/from
usize to be able to accumulate the statistics of each value being generated
- the tests make use of a DistributionTestHelper which genericizes the
construction of a distribution, mapping the value to/from usize and the
computation of the theoretical cumulative distribution function (on which
the test depends to test the validity of the distribution via the DKW
inequality, see Wikipedia link in the code)
2024-02-22 10:13:59 +01:00
Arthur Meyre
f8bfeb8927 feat(core): add a way to generate values from a semi-dynamic distribution
- semi dynamic as the distribution is a generic parameter, as the Scalar
type needs to be generable by that distribution but it also is configurable
at runtime
2024-02-22 10:13:59 +01:00
Arthur Meyre
67b543b6e7 chore(core): rename some tests whose names made little sense 2024-02-22 10:13:59 +01:00
Agnes Leroy
b2cfe2765c chore(gpu): add benchmarks for scalar eq/ne and cast 2024-02-21 10:04:01 +01:00
David Testé
8397637b24 chore(ci): use aws-region as input to stop ec2 instances
This is done to handle case where a PR is merged before AWS EC2
instance teardown. If we use profile input in this case, Slab will
try to fetch ci/slab.toml on a git reference that doesn't exist
anymore thus sending back an error without being able to terminate
the instance. By using aws-region Slab won't fetch slab.toml file.
2024-02-20 18:07:31 +01:00
tmontaigu
42b7c2f403 fix(integer): correct degree in small comparisons 2024-02-20 14:13:13 +01:00
J-B Orfila
b708abb10b feat(core): allow switching moduli during an LWE Keyswitch 2024-02-20 10:34:39 +01:00
Arthur Meyre
e62808b2b4 chore(core): fix CiphertextModulus::new error message
- the returned error from try_new was ignored
- use an enum with a const_panic and hardcoded error messages to keep new
const
- impl Debug manually to have nice error messages still when unwrapping on
the try_new result if it's an Err

BREAKING CHANGE:
try_new and try_new_power_of_2 return type has changed for
CiphertextModulus
2024-02-20 10:34:39 +01:00
Agnes Leroy
62135791bf chore(gpu): panic when polynomial size is not supported 2024-02-20 09:29:04 +01:00
Agnes Leroy
41c38d127b chore(gpu): fix GPU PBS benchmark parameters 2024-02-20 09:29:04 +01:00
tmontaigu
d55d68ec52 fix(capi): add missing function on FheBool
- safe ser/de
- classical ser/de
- comparisons
- scalar binary fn/comparisons
- compact & compressed fhe bool encryption
2024-02-19 19:20:00 +01:00
Arthur Meyre
9faab7b9a6 chore(ci): increase timeout for M1 mac CI 2024-02-19 18:29:57 +01:00
Agnes Leroy
ff539aab6b chore(gpu): activate all targets for clippy_gpu 2024-02-19 16:47:19 +01:00
Agnes Leroy
799829eab4 feat(gpu): cast between unsigned cuda radix ciphertexts 2024-02-19 14:46:10 +01:00
Agnes Leroy
c30395daef chore(gpu): add workflow for 4090 integer bench 2024-02-19 14:02:17 +01:00
tmontaigu
ebce4fcfd4 chore(hlapi): add tests for fhe_bool 2024-02-19 10:11:40 +01:00
tmontaigu
85a428bb43 fix(integer): make encrypt_bool specify the degree
encrypt_one_block does not leak information
on the message.
BooleanBlocks are meant for when we want to
be explicit that the value is a boolean
and are ok for this to be public.

Thus it needs to correctly set the degree to 1
for other operations to properly take advantage of that
2024-02-19 10:11:40 +01:00
tmontaigu
c4266bd610 fix(shortint): fix bitwise opts degree
We used `after_bitand/or/xor` on the ct_left
**after** the lut had changed its degree.
So the `after_bit` function computed the
resulting using a wrong degree for the left
ct.
2024-02-19 10:11:40 +01:00
tmontaigu
76a7cd9b24 fix(hlapi): bind missing cuda bitnot 2024-02-19 10:11:40 +01:00
dependabot[bot]
9baa54b636 chore(deps): bump tj-actions/changed-files from 42.0.2 to 42.0.3
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.2 to 42.0.3.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](90a06d6ba9...ec75ae5ab7)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-19 09:29:56 +01:00
tmontaigu
863e0c275b feat(integer): add [trailing/leading]_[zeros/ones] 2024-02-16 15:56:15 +01:00
Arthur Meyre
cd13b40dbb chore(ci): fix GPU tests to run only with core crypto features 2024-02-16 13:43:28 +01:00
Arthur Meyre
1c8e88ebfd chore(ci): add -e flag to gpu fmt check script 2024-02-16 13:43:28 +01:00
Arthur Meyre
02bac34f1b chore(ci): set-up a workflow to use the RTX 4090 2024-02-16 13:43:28 +01:00
Arthur Meyre
4576508ccb chore(ci): update macOS runner for cargo builds 2024-02-15 19:01:15 +01:00
sarah el kazdadi
7190dad1e3 chore(ci): update toolchain, fix clippy warnings 2024-02-15 19:01:15 +01:00
sarah el kazdadi
18b9458401 fix(tfhe): update pulp and bytemuck to fix nightly breakage 2024-02-15 19:01:15 +01:00
David Testé
747ade0a54 chore(ci): update gpu aws ec2 ami
Done to get clang-format-15 and cmake-format and latest packages
updates.
2024-02-15 14:45:45 +01:00
David Testé
ada460b429 chore(bench): fix array declaration for multi-bit gpu 2024-02-15 13:32:11 +01:00
Beka Barbakadze
56f9b221eb feat(gpu): scalar shifts with one wave of pbs 2024-02-15 14:35:08 +04:00
Arthur Meyre
52f3babde5 feat(shortint): add an atomic counter to keep track of the number of PBSes 2024-02-15 10:47:12 +01:00
Arthur Meyre
3ff5d551a9 chore(ci): make avx512 enabled by default for benchmarks
- was too error prone when used by other people in the company, no more
doubts and we are generally not interested in non avx512 results
2024-02-15 10:37:14 +01:00
David Testé
0b1ea3b7dc chore(deps): update npm packages for wasm interface
NPM package `ip` had a critical security flaw thus packages needed
an upgrade to fix the issue.
2024-02-13 17:46:13 +01:00
dependabot[bot]
e0fddc8ea7 chore(deps): bump actions/upload-artifact from 4.3.0 to 4.3.1
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.0 to 4.3.1.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](26f96dfa69...5d5d22a312)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-12 13:22:21 +01:00
yuxizama
5354cffd8e chore(doc): Update README.md structure 2024-02-12 09:38:34 +01:00
David Testé
d258d1fcf4 chore(ci): notify ec2 teardown failure on slack channel 2024-02-12 09:29:26 +01:00
David Testé
7cecbb30b2 chore(bench): run keyswitch benchmarks on multi-bit parameters 2024-02-12 09:29:11 +01:00
tmontaigu
ece82c51a5 feat(capi): add Cuda support
- This adds GPU support in the C API
- Also make ctest (cmake test launcher) print
  test output when it fails
2024-02-09 14:45:21 +01:00
Pedro Alves
8c54c8200b feat(gpu): implement scalar eq and ne 2024-02-09 13:04:31 +01:00
Arthur Meyre
b6bfe30065 chore(tfhe): remove some outdated concrete branding 2024-02-09 10:48:38 +01:00
Agnes Leroy
d5c0c0242c chore(bench): fix ks benchmark 2024-02-08 22:02:11 +01:00
Arthur Meyre
6826b6b638 chore(tfhe): pin bytemuck temporarily as the 1.14.2 is broken
- this follows the nightly update about stdsimd vs stdarch, the change
on bytemuck's side is not properly stable compatible
2024-02-08 14:39:39 +01:00
aquint-zama
f0b4749aca chore(doc): fix docs snippet comments 2024-02-07 09:25:26 +01:00
David Testé
eb4785001d chore(ci): add checks on params before running lattice estimation 2024-02-06 18:11:16 +01:00
tmontaigu
16d6b2f75d feat(capi): allow cbindgen to generate docs
with `documentation=true` cbindgen now properly
generates/copies any rust documentation that
are on `#[no_mangle] pub extern "C" fn` into
their corresponding declaration in the header file.

This will finally allow us to start adding some documentation
on the CAPI (tfhe.h)
2024-02-06 12:09:43 +01:00
David Testé
9bdeb697ad chore(bench): implement integer casting benchmarks 2024-02-06 09:41:17 +01:00
Agnes Leroy
b5615bb3ad fix(gpu): fix 40 bit integer multiplication
Return in cuda memcpy and memset if size is 0 instead of aborting.
2024-02-06 09:08:48 +01:00
Agnes Leroy
37b94780b2 chore(bench): modify PBS bench names 2024-02-06 09:08:41 +01:00
Agnes Leroy
035a70d81f chore(gpu): add a benchmark for keyswitch on GPU 2024-02-06 09:08:41 +01:00
David Testé
f5c971652d refactor(boolean): put all parameters into constant array 2024-02-05 18:01:07 +01:00
Arthur Meyre
a0b75d9a37 chore(doc): rename acc->lut to better match shortint API naming in doctest 2024-02-05 17:39:13 +01:00
Arthur Meyre
90da50dc53 feat(shortint): many lut construction using MSB leftover space 2024-02-05 17:39:13 +01:00
Arthur Meyre
473a6a0f40 test(core): add a many LUT test in core crypto
- it does not require any new primitive so was made into a test at the core
crypto level
- shortint will have a more user friendly API, using the MSBs for selecting
the function means it should not require too much design as deltas are
always the same
2024-02-05 17:39:13 +01:00
dependabot[bot]
8f1a1da4e1 chore(deps): bump codecov/codecov-action from 3.1.5 to 4.0.1
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.5 to 4.0.1.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](4fe8c5f003...e0b68c6749)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-02-05 17:04:11 +01:00
David Testé
a3c07dedad chore(bench): add 4_4 parameters set to pbs throughput benchmarks 2024-02-05 17:03:38 +01:00
Agnes Leroy
5513d3a894 chore(gpu): abort when trying to launch 32 bit multi-bit PBS 2024-02-05 14:58:48 +01:00
David Testé
6ef0a2b4ef chore(ci): run ec2 teardown only if setup is not skipped 2024-02-04 09:54:37 +01:00
Arthur Meyre
bba4bcee88 chore(doc): update installation to match README
- added more clearly the information for x86 windows machines
2024-02-02 18:27:46 +01:00
David Testé
45befcaf40 chore(ci): switch to slab action for pull-request workflows 2024-02-02 16:46:59 +01:00
Agnes Leroy
97feefe2ed chore(gpu): reuse memory for the single carry propagation 2024-02-02 08:27:23 +01:00
Agnes Leroy
4ef8045a67 fix(gpu): fix cuda_memset with size 0 2024-02-02 08:27:10 +01:00
tmontaigu
48f67fb427 refactor(hlapi): split long files of hlapi
This splits the long base.rs files into multiple ones,
to make it easier to navigate.

There are no code changes apart from moving stuff.
2024-02-01 15:43:44 +01:00
Agnes Leroy
bce3bf1733 chore(gpu): add fmt and clippy checks in tfhe-cuda-backend 2024-02-01 15:23:49 +01:00
Agnes Leroy
253062c5aa chore(gpu): add tfhe-cuda-backend to the workspace 2024-02-01 15:23:49 +01:00
Arthur Meyre
b44ed91519 feat(integer): add smart_neg_assign 2024-02-01 10:04:22 +01:00
Arthur Meyre
ddb010d8f1 chore(integer): plug keyswitching tests with the ci_run_filter for nextest 2024-02-01 10:04:10 +01:00
Arthur Meyre
8a9559c4d1 chore(doc): fix modulus struct docstrings in shortint
- it stated it represented a number of bits while it represents the actual
modulus
2024-02-01 10:03:58 +01:00
Sexosexosexo
02265705fc docs(tfhe): add fhe_strings example 2024-01-31 17:07:11 +01:00
Mayeul@Zama
7b4bb6ad55 feat(c_api): add oprf 2024-01-31 16:53:33 +01:00
Mayeul@Zama
fd084d50c5 style(c_api): reformat c test 2024-01-31 16:53:33 +01:00
Mayeul@Zama
f59cb6c632 feat(c_api): add oprf 2024-01-31 16:53:33 +01:00
Mayeul@Zama
c594734fcf fix(integer): make oprf test more strict 2024-01-31 16:53:33 +01:00
Arthur Meyre
f9669c3294 chore(ci): update scripts and Makefile for future forward compatibility 2024-01-31 16:24:39 +01:00
Arthur Meyre
76665ab478 chore(ci): convert some make targets to be semver trick compatible 2024-01-31 16:24:39 +01:00
tmontaigu
9b454abe2a feat(integer): add checked_div
A division that returns a flag to know if the divisor was 0
2024-01-31 13:44:49 +01:00
tmontaigu
da08115c10 chore(integer): add notes in docs regarding division by 0 2024-01-31 13:44:49 +01:00
Arthur Meyre
8aec783dd9 chore(integer): remove deprecated parameter set 2024-01-31 09:35:22 +01:00
Ben
4bf28b836a chore(docs): fix typo 2024-01-31 09:34:51 +01:00
David Testé
9df529bc59 chore(ci): use slab action on fast cpu tests workflow 2024-01-31 09:09:24 +01:00
Agnes Leroy
71bff0963c chore(gpu): check for all cuda errors and abort in device.cu/.h
Remove some legacy compilation warnings
2024-01-31 08:54:48 +01:00
Arthur Meyre
eeaf45dbc7 docs(bench): add scalar benchmarks for integer 2024-01-30 10:50:37 +01:00
tmontaigu
353f279a9e feat(integer): fuse two PBS in comparisons
In comparisons, we were reducing a vec of orderings
(inferior, equal, superior) into one final ordering,
and then we would do one final PBS to transform that
into a boolean value (0 or 1) depending what was wanted
(<=, <, >, >=).

This fuses the last PBS (ordering -> boolean value) with
the last round of reduction, when there are only two blocks left
to be reduced.

This allows to gain one PBS. Meaning for ciphertext/ciphertext
comparisons we get back the performance lost introduced by
the fix in f4c220c1. And comparisons between a clear and
ciphertext get an improvement.
2024-01-29 14:42:56 +01:00
tmontaigu
8355ed5c10 fix(integer): add noise cleaning pbs in comparisons
In comparisons we were packing blocks to then do a subtraction
between them. However this goes above the noise limit
that would guarantee the advertised error probability.

To fix that we add a pbs to clean the noise. This pbs only needs
to be added in the ciphertext/ciphertext comparisons. Making them slower
by 1 PBS.
2024-01-29 14:42:56 +01:00
dependabot[bot]
84844bb4eb chore(deps): bump codecov/codecov-action from 3.1.4 to 3.1.5
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 3.1.4 to 3.1.5.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](eaaf4bedf3...4fe8c5f003)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-29 09:45:08 +01:00
dependabot[bot]
9b6e861f9b chore(deps): bump tj-actions/changed-files from 42.0.0 to 42.0.2
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 42.0.0 to 42.0.2.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](ae82ed4ae0...90a06d6ba9)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-29 09:44:57 +01:00
dependabot[bot]
b73f24057d chore(deps): bump actions/upload-artifact from 4.1.0 to 4.3.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.1.0 to 4.3.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v4.1.0...26f96dfa697d77e81fd5907df203aa23a56210a8)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-29 09:44:40 +01:00
David Testé
43c0799655 chore(bench): add ciphertexts sum to integer benchmarks 2024-01-26 17:56:49 +01:00
David Testé
f94a63eedc chore(bench): add pbs benchmarks on gpu 2024-01-26 17:42:31 +01:00
tmontaigu
35d65bcde7 docs(hlapi): document trivial encryption to debug 2024-01-25 10:36:07 +01:00
tmontaigu
0bfe59a656 docs(hlapi): document how to use rayon 2024-01-25 10:36:07 +01:00
yuxizama
ffe4c7135a chore(docs): update readme links and badges 2024-01-25 10:10:12 +01:00
yuxizama
a8f329fc75 chore(docs): update README.md
Change support banner
2024-01-25 10:10:12 +01:00
Agnes Leroy
11db96d394 fix(gpu): make all async functions unsafe, fix cuda_drop binding, add missing sync 2024-01-24 21:34:15 +01:00
David Testé
ae8d48138c chore(ci): add gpu tests from user documentation 2024-01-24 16:27:12 +01:00
Agnes Leroy
e912394b52 chore(gpu): fix formatting command 2024-01-24 15:46:23 +01:00
Agnes Leroy
8958b6df98 chore(gpu): fix compilation when no nvidia gpu is available 2024-01-24 15:46:23 +01:00
tmontaigu
aeb36ee14f fix(integer): is_scalar_out_of_bounds handles bigger ct
Fix a bug where in is_scalar_out_of_bounds, if the scalar was
negative and the ciphertext a signed one with more blocks than
the decomposed scalar, we would do an out of bound access
(i.e a panic).

This fixes that, this will fix doing signed_overflowing_mul on 256 bits
where the bug first appeared
2024-01-24 10:06:39 +01:00
David Testé
b3976f2963 chore(ci): fix inputs for gpu full benchmark workflow 2024-01-24 10:02:28 +01:00
Arthur Meyre
0d6e0c7224 fix(core): ignore value in the body when doing LWE encryption 2024-01-23 18:37:24 +01:00
Agnes Leroy
bd26d0ecd6 chore(gpu): rename "test vector" -> "luts" and "tvi" -> "lut_indexes" 2024-01-23 16:02:45 +01:00
Agnes Leroy
16f457b57c chore(gpu): move around code in integer.h for better readability 2024-01-23 16:02:45 +01:00
tmontaigu
6060882a7a fix(integer): fix cast in scalar_shift/rotate
In scalar_shift/rotate, we get the number of bits to shift/rotate
as a generic type, that can be cast to u64.

We compute the total number of bits the ciphertext has, cast that number
to the same type as the scalar, and do "shift % num_bits".

However, if the number of bits computed exceeds the max value the scalar
type can hold, we could end up doing a remainder with 0.

e.g. a 256-bit ciphertext and scalar type u8 => 256u64 cast to u8 results
in 0.

Fix that by casting the scalar value to u64.
2024-01-23 15:10:04 +01:00
tmontaigu
3e2833ac64 chore(hlapi): remove leftover file
This file was not correctly removed during the refactor
2024-01-23 14:54:01 +01:00
David Testé
bc85163c23 chore(ci): change rust-toolchain action
The third-party GitHub Action actions-rs/toolchain is not maintained
anymore. We switch to dtolnay/rust-toolchain.
2024-01-23 14:20:31 +01:00
David Testé
45b2548b17 chore(ci): set rustbacktrace var to full to ease debug on failure 2024-01-23 14:20:14 +01:00
Arthur Meyre
0476ee0c3c chore(docs): fix link to 0.4 semver doc 2024-01-23 10:50:34 +01:00
dependabot[bot]
8d77ea0a57 chore(deps): bump actions/upload-artifact from 3.1.2 to 4.2.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 3.1.2 to 4.2.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v3.1.2...694cdabd8bdb0f10b2cea11669e1bf5453eed0a6)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-23 10:32:16 +01:00
dependabot[bot]
f10fa3f13c chore(deps): bump tj-actions/changed-files from 41.1.1 to 42.0.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 41.1.1 to 42.0.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](62f4729b5d...ae82ed4ae0)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-23 10:31:59 +01:00
dependabot[bot]
fd4e2059f4 chore(deps): bump actions/checkout from 3.5.3 to 4.1.1
Bumps [actions/checkout](https://github.com/actions/checkout) from 3.5.3 to 4.1.1.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3.5.3...b4ffde65f46336ab88eb53be808477a3936bae11)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-01-23 10:31:52 +01:00
402 changed files with 45140 additions and 21998 deletions

View File

@@ -1,6 +1,6 @@
---
name: Bug report
about: Report a problem with concrete
about: Report a problem with TFHE-rs
title: ''
labels: triage_required
assignees: ''

View File

@@ -1,6 +1,6 @@
---
name: Feature request
about: Suggest an idea for concrete
about: Suggest an idea for TFHE-rs
title: ''
labels: feature_request
assignees: ''

9
.github/actionlint.yaml vendored Normal file
View File

@@ -0,0 +1,9 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- m1mac
- 4090-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.
config-variables: null

34
.github/workflows/approve_label.yml vendored Normal file
View File

@@ -0,0 +1,34 @@
# Manage approved label in pull request
name: PR approved label manager
on:
pull_request:
pull_request_review:
types: [submitted]
jobs:
trigger-tests:
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- name: Get current labels
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
# Remove label if a push is performed after an approval
- name: Remove approved label
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
# We use a PAT to have the same user (zama-bot) for label deletion as for creation.
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
labels: approved
# Add label only if the review is approved and if the label doesn't already exist
- name: Add approved label
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
with:
# We need to use a PAT to be able to trigger `labeled` event for the other workflow.
github_token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
labels: approved

View File

@@ -5,66 +5,56 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
pull_request:
jobs:
fast-tests:
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
setup-ec2:
name: Setup EC2 instance (fast-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-big
fast-tests:
name: Fast CPU tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run concrete-csprng tests
run: |
@@ -120,8 +110,29 @@ jobs:
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Fast AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-ec2:
name: Teardown EC2 instance (fast-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, fast-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (fast-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -0,0 +1,75 @@
# Compile and test tfhe-cuda-backend on an RTX 4090 machine
name: TFHE Cuda Backend - 4090 full tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
jobs:
cuda-tests-linux:
name: CUDA tests (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, '4090_test') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install latest stable
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
- name: Run fmt checks
run: |
make check_fmt_gpu
- name: Run clippy checks
run: |
make pcc_gpu
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
if: ${{ github.event_name == 'pull_request' }}
with:
labels: 4090_test
github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA RTX 4090 tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -1,45 +1,48 @@
# Compile and test Concrete-cuda on an AWS instance
name: Concrete Cuda - Full tests
# Compile and test tfhe-cuda-backend on an AWS instance
name: TFHE Cuda Backend - Full tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
pull_request:
jobs:
run-cuda-tests-linux:
setup-ec2:
name: Setup EC2 instance (cuda-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: gpu-test
cuda-tests-linux:
name: CUDA tests
needs: setup-ec2
concurrency:
group: tfhe_cuda_backend_test-${{ github.ref }}
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
name: Test code in EC2
runs-on: ${{ inputs.runner_name }}
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
@@ -52,31 +55,17 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Checkout tfhe-rs
uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
@@ -90,15 +79,66 @@ jobs:
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Run fmt checks
run: |
make check_fmt_gpu
- name: Run clippy checks
run: |
make clippy_gpu
make pcc_gpu
- name: Run all tests
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
- name: Slack Notification
if: ${{ always() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "CUDA AWS tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (cuda-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (cuda-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -4,66 +4,58 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
fork_repo:
description: "Name of forked repo as user/repo"
type: string
fork_git_sha:
description: "Git SHA to checkout from fork"
type: string
pull_request:
types: [ labeled ]
jobs:
integer-tests:
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
setup-ec2:
name: Setup EC2 instance (unsigned-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-big
unsigned-integer-tests:
name: Unsigned integer tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Gen Keys if required
run: |
@@ -87,8 +79,29 @@ jobs:
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_MESSAGE: "Unsigned Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, unsigned-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (unsigned-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -4,66 +4,58 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
fork_repo:
description: "Name of forked repo as user/repo"
type: string
fork_git_sha:
description: "Git SHA to checkout from fork"
type: string
pull_request:
types: [ labeled ]
jobs:
multi-bit-tests:
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
setup-ec2:
name: Setup EC2 instance (signed-integer-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-big
signed-integer-tests:
name: Signed integer tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Gen Keys if required
run: |
@@ -91,8 +83,29 @@ jobs:
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_MESSAGE: "Signed Integer tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (signed-integer-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, signed-integer-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (signed-integer-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -4,66 +4,58 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
pull_request:
types: [ labeled ]
jobs:
shortint-tests:
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
setup-ec2:
name: Setup EC2 instance (cpu-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-big
cpu-tests:
name: CPU tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run concrete-csprng tests
run: |
@@ -113,8 +105,29 @@ jobs:
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "Shortint tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_MESSAGE: "CPU tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (cpu-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, cpu-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (cpu-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -4,66 +4,58 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
pull_request:
types: [ labeled ]
jobs:
wasm-tests:
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
setup-ec2:
name: Setup EC2 instance (wasm-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
# Step used for log purpose.
- name: Instance configuration used
run: |
echo "ID: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
echo "Fork repo: ${{ inputs.fork_repo }}"
echo "Fork git sha: ${{ inputs.fork_git_sha }}"
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-small
wasm-tests:
name: WASM tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run js on wasm API tests
run: |
@@ -80,8 +72,29 @@ jobs:
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "WASM tests finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
teardown-ec2:
name: Teardown EC2 instance (wasm-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, wasm-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (wasm-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -32,6 +32,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-boolean-benchmarks:
@@ -61,14 +63,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON bench_boolean
make bench_boolean
- name: Parse results
run: |
@@ -96,7 +97,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -6,6 +6,8 @@ on:
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
@@ -17,7 +19,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest, macos-latest-large, windows-latest]
fail-fast: false
steps:
@@ -66,5 +68,9 @@ jobs:
run: |
make build_c_api
- name: Build coverage tests
run: |
make build_tfhe_coverage
# The wasm build check is a bit annoying to set up here, so it is done during the tests in
# aws_tfhe_tests.yml

27
.github/workflows/ci_lint.yml vendored Normal file
View File

@@ -0,0 +1,27 @@
# Lint and check CI
name: CI Lint and Checks
on:
pull_request:
env:
ACTIONLINT_VERSION: 1.6.27
jobs:
lint-check:
name: Lint and checks
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Get actionlint
run: |
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/main/scripts/download-actionlint.bash) ${{ env.ACTIONLINT_VERSION }}
echo "f2ee6d561ce00fa93aab62a7791c1a0396ec7e8876b2a8f2057475816c550782 actionlint" > checksum
sha256sum -c checksum
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows
run: |
make lint_workflow

View File

@@ -4,6 +4,8 @@ env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
@@ -38,7 +40,7 @@ jobs:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
timeout-minutes: 1080
timeout-minutes: 11520 # 8 days
steps:
# Step used for log purpose.
- name: Instance configuration used
@@ -61,14 +63,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
with:
files_yaml: |
tfhe:
@@ -98,7 +99,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,6 +107,20 @@ jobs:
fail_ci_if_error: true
files: shortint/cobertura.xml,boolean/cobertura.xml,core_crypto/cobertura.xml,core_crypto_avx512/cobertura.xml
- name: Run integer coverage
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
run: |
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@54bcd8715eee62d40e33596ef5e8f0f48dbbccab
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage/
fail_ci_if_error: true
files: integer/cobertura.xml
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true

View File

@@ -1,5 +1,5 @@
# Run PBS benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: PBS benchmarks
# Run core crypto benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: Core crypto benchmarks
on:
workflow_dispatch:
@@ -32,10 +32,12 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-pbs-benchmarks:
name: Execute PBS benchmarks in EC2
run-core-crypto-benchmarks:
name: Execute core crypto benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
steps:
@@ -61,14 +63,14 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON bench_pbs
make bench_pbs
make bench_ks
- name: Parse results
run: |
@@ -86,9 +88,9 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_pbs
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo

View File

@@ -0,0 +1,157 @@
# Run core crypto benchmarks on an AWS instance with CUDA and return parsed results to Slab CI bot.
name: Core crypto GPU benchmarks
on:
workflow_dispatch:
inputs:
instance_id:
description: "Instance ID"
type: string
instance_image_id:
description: "Instance AMI ID"
type: string
instance_type:
description: "Instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but is still mandatory since a calling workflow could
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
# must declare this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml for an example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
run-core-crypto-benchmarks:
name: Execute GPU core crypto benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ !cancelled() }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Instance configuration used
run: |
echo "IDs: ${{ inputs.instance_id }}"
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Get benchmark date
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Set up home
# The "Install rust" step requires the root user to have a HOME directory, which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
# Export CC, CXX and CUDAHOSTCXX pointing at the gcc/g++ version selected by the matrix.
# HOME is not exported here: the earlier "Set up home" step already writes it to GITHUB_ENV.
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
# Runs the bench_pbs_gpu and bench_ks_gpu make targets one after the other.
- name: Run core crypto benchmarks on GPU
run: |
make bench_pbs_gpu
make bench_ks_gpu
- name: Parse results
run: |
COMMIT_DATE="$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})"
COMMIT_HASH="$(git describe --tags --dirty)"
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware ${{ inputs.instance_type }} \
--backend gpu \
--project-version "${COMMIT_HASH}" \
--branch ${{ github.ref_name }} \
--commit-date "${COMMIT_DATE}" \
--bench-date "${{ env.BENCH_DATE }}" \
--name-suffix avx512 \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
# Computes a signature of the results file with slab/scripts/hmac_calculator.sh and the job
# secret, then uploads the file to the Slab URL with that signature in X-Hub-Signature-256.
# NOTE(review): curl's -k flag disables TLS certificate verification and -v prints the request
# headers (including the signature) to the job log; confirm both are still required.
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
# Report a failure of this job to Slack; a failure of the notification itself is tolerated.
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
# The message names the workflow ("Core crypto GPU benchmarks"), which runs both PBS and KS.
SLACK_MESSAGE: "Core crypto GPU benchmarks failed. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -1,74 +0,0 @@
name: CSPRNG randomness testing Workflow
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
# All the inputs are provided by Slab
inputs:
instance_id:
description: "AWS instance ID"
type: string
instance_image_id:
description: "AWS instance AMI ID"
type: string
instance_type:
description: "AWS instance product type"
type: string
runner_name:
description: "Action runner name"
type: string
request_id:
description: 'Slab request ID'
type: string
fork_repo:
description: 'Name of forked repo as user/repo'
type: string
fork_git_sha:
description: 'Git SHA to checkout from fork'
type: string
jobs:
csprng-randomness-teting:
name: CSPRNG randomness testing
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_${{ inputs.instance_image_id }}_${{ inputs.instance_type }}
cancel-in-progress: true
runs-on: ${{ inputs.runner_name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: ${{ inputs.fork_repo }}
ref: ${{ inputs.fork_git_sha }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: stable
default: true
- name: Dieharder randomness test suite
run: |
make dieharder_csprng
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,96 @@
name: CSPRNG randomness testing Workflow
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
jobs:
setup-ec2:
name: Setup EC2 instance (csprng-randomness-tests)
if: ${{ github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'approved') }}
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
instance-id: ${{ steps.start-instance.outputs.ec2-instance-id }}
aws-region: ${{ steps.start-instance.outputs.aws-region }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
profile: cpu-small
csprng-randomness-tests:
name: CSPRNG randomness tests
needs: setup-ec2
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: true
runs-on: ${{ needs.setup-ec2.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
- name: Dieharder randomness test suite
run: |
make dieharder_csprng
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "concrete-csprng randomness check finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-ec2:
name: Teardown EC2 instance (csprng-randomness-tests)
if: ${{ always() && needs.setup-ec2.result != 'skipped' }}
needs: [ setup-ec2, csprng-randomness-tests ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@8562abbdc96b3619bd5debe1fb934db298f9a044
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
region: ${{ needs.setup-ec2.outputs.aws-region }}
label: ${{ needs.setup-ec2.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "EC2 teardown (csprng-randomness-tests) failed. (${{ env.ACTION_RUN_URL }})"

View File

@@ -0,0 +1,202 @@
# Run all benchmarks on an RTX 4090 machine and return parsed results to Slab CI bot.
name: TFHE Cuda Backend - 4090 full benchmarks
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [labeled]
schedule:
# Weekly benchmarks are triggered every Friday at 9 p.m. (GitHub evaluates cron schedules in UTC).
- cron: "0 21 * * 5"
jobs:
cuda-integer-benchmarks:
name: Cuda integer benchmarks for all operations flavor (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_integer_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Run integer benchmarks
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
# Computes a signature of the results file with slab/scripts/hmac_calculator.sh and the job
# secret, then uploads the file to the Slab URL with that signature in X-Hub-Signature-256.
# NOTE(review): curl's -k flag disables TLS certificate verification and -v prints the request
# headers (including the signature) to the job log; confirm both are still required.
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
# Notify only when the job did not succeed and was not cancelled: the message reports a
# failure, so it must not be sent for successful runs.
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Integer RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
cuda-core-crypto-benchmarks:
name: Cuda core crypto benchmarks (RTX 4090)
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'schedule' || contains(github.event.label.name, '4090_bench') }}
needs: cuda-integer-benchmarks
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}_cuda_core_crypto_bench
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
# Runs the bench_pbs_gpu and bench_ks_gpu make targets one after the other.
- name: Run core crypto benchmarks
run: |
make bench_pbs_gpu
make bench_ks_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "rtx4090" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
# Computes a signature of the results file with slab/scripts/hmac_calculator.sh and the job
# secret, then uploads the file to the Slab URL with that signature in X-Hub-Signature-256.
# NOTE(review): curl's -k flag disables TLS certificate verification and -v prints the request
# headers (including the signature) to the job log; confirm both are still required.
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Core crypto RTX 4090 full benchmarks failed. (${{ env.ACTION_RUN_URL }})"
remove_github_label:
name: Remove 4090 bench label
if: ${{ github.event_name == 'pull_request' }}
needs: [cuda-integer-benchmarks, cuda-core-crypto-benchmarks]
runs-on: ["self-hosted", "4090-desktop"]
steps:
- uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
labels: 4090_bench
github_token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -25,6 +25,8 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -54,14 +56,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer
make FAST_BENCH=TRUE bench_integer
- name: Parse benchmarks to csv
run: |
@@ -69,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +91,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -28,6 +28,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
prepare-matrix:
@@ -39,17 +41,17 @@ jobs:
- name: Weekly benchmarks
if: ${{ github.event.inputs.user_inputs == 'weekly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\"]" >> ${GITHUB_ENV}
echo "OP_FLAVOR=[\"default\"]" >> "${GITHUB_ENV}"
- name: Quarterly benchmarks
if: ${{ github.event.inputs.user_inputs == 'quarterly_benchmarks' }}
run: |
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> ${GITHUB_ENV}
echo "OP_FLAVOR=[\"default\", \"smart\", \"unchecked\", \"misc\"]" >> "${GITHUB_ENV}"
- name: Set operation flavor output
id: set_op_flavor
run: |
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> ${GITHUB_OUTPUT}
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
@@ -78,9 +80,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -88,10 +92,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -102,7 +105,7 @@ jobs:
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
- name: Parse results
run: |
@@ -118,7 +121,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,8 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -54,7 +56,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -64,31 +66,33 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_gpu
- name: Parse benchmarks to csv
run: |
@@ -96,7 +100,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -118,13 +122,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
@@ -145,7 +149,7 @@ jobs:
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:

View File

@@ -19,24 +19,35 @@ on:
request_id:
description: "Slab request ID"
type: string
# This input is not used in this workflow but is still mandatory since a calling workflow could
# use it. If a triggering command includes a user_inputs field, then the triggered workflow
# must declare this very input, otherwise the workflow won't be called.
# See start_full_benchmarks.yml for an example.
user_inputs:
description: "Type of benchmarks to run"
type: string
default: "weekly_benchmarks"
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
name: Execute integer benchmarks for all operations flavor
runs-on: ${{ github.event.inputs.runner_name }}
timeout-minutes: 1440 # 24 hours
if: ${{ !cancelled() }}
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [ integer, integer_multi_bit]
op_flavor: [ default, unchecked ]
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
@@ -53,15 +64,17 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
@@ -69,30 +82,32 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
@@ -100,7 +115,7 @@ jobs:
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
@@ -117,7 +132,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
@@ -139,7 +154,7 @@ jobs:
slack-notification:
name: Slack Notification
runs-on: ${{ github.event.inputs.runner_name }}
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
needs: integer-benchmarks
steps:
- name: Notify

View File

@@ -25,6 +25,8 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -54,14 +56,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run multi-bit benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_integer_multi_bit
make FAST_BENCH=TRUE bench_integer_multi_bit
- name: Parse benchmarks to csv
run: |
@@ -69,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +91,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -1,5 +1,5 @@
# Run integer benchmarks with multi-bit cryptographic parameters on an AWS instance and return parsed results to Slab CI bot.
name: Integer Multi-bit benchmarks
name: Integer GPU Multi-bit benchmarks
on:
workflow_dispatch:
@@ -25,11 +25,14 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
cuda-integer-benchmarks:
name: Execute integer multi-bit benchmarks in EC2
runs-on: ${{ github.event.inputs.runner_name }}
timeout-minutes: 1440 # 24 hours
if: ${{ !cancelled() }}
strategy:
fail-fast: false
@@ -37,8 +40,7 @@ jobs:
matrix:
include:
- os: ubuntu-22.04
cuda: "11.8"
cuda_arch: "70"
cuda: "12.2"
gcc: 9
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
@@ -55,7 +57,7 @@ jobs:
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
fetch-depth: 0
@@ -65,31 +67,33 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}" >> "${GITHUB_ENV}"
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Run multi-bit benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
make FAST_BENCH=TRUE BENCH_OP_FLAVOR=default bench_integer_multi_bit_gpu
- name: Parse benchmarks to csv
run: |
@@ -97,7 +101,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -119,13 +123,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
with:
repository: zama-ai/slab
path: slab
@@ -146,7 +150,7 @@ jobs:
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() }}
if: ${{ !success() && !cancelled() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@b24d75fe0e728a4bf9fc42ee217caa686d141ee8
env:

View File

@@ -14,6 +14,8 @@ on:
env:
CARGO_TERM_COLOR: always
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
FAST_TESTS: "TRUE"
@@ -25,15 +27,16 @@ jobs:
cargo-builds:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' || contains(github.event.label.name, 'm1_test') }}
runs-on: ["self-hosted", "m1mac"]
# 12 hours, default is 6 hours, hopefully this is more than enough
timeout-minutes: 720
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
- name: Install latest stable
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: stable
default: true
- name: Run pcc checks
run: |

View File

@@ -24,6 +24,7 @@ jobs:
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
- name: Install Sage
run: |

View File

@@ -24,6 +24,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-shortint-benchmarks:
@@ -53,14 +55,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON bench_shortint
make bench_shortint
- name: Parse results
run: |
@@ -88,7 +89,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -32,6 +32,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
shortint-benchmarks:
@@ -57,9 +59,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step requires root user to have a HOME directory, which is not set.
@@ -67,10 +71,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -81,7 +84,7 @@ jobs:
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_shortint
- name: Parse results
run: |
@@ -112,7 +115,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,8 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -54,14 +56,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer
make FAST_BENCH=TRUE bench_signed_integer
- name: Parse benchmarks to csv
run: |
@@ -69,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +91,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -28,6 +28,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
integer-benchmarks:
@@ -56,9 +58,11 @@ jobs:
- name: Get benchmark details
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})" >> "${GITHUB_ENV}"
echo "COMMIT_HASH=$(git describe --tags --dirty)" >> "${GITHUB_ENV}"
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step requires root user to have a HOME directory, which is not set.
@@ -66,10 +70,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Checkout Slab repo
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
@@ -80,7 +83,7 @@ jobs:
- name: Run benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_signed_${{ matrix.command }}
- name: Parse results
run: |
@@ -96,7 +99,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -25,6 +25,8 @@ env:
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-integer-benchmarks:
@@ -54,14 +56,13 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run multi-bit benchmarks with AVX512
run: |
make AVX512_SUPPORT=ON FAST_BENCH=TRUE bench_signed_integer_multi_bit
make FAST_BENCH=TRUE bench_signed_integer_multi_bit
- name: Parse benchmarks to csv
run: |
@@ -69,7 +70,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -90,7 +91,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -32,8 +32,12 @@ on:
description: "Run signed integer multi bit benches"
type: boolean
default: true
pbs_bench:
description: "Run PBS benches"
core_crypto_bench:
description: "Run core crypto benches"
type: boolean
default: true
core_crypto_gpu_bench:
description: "Run core crypto benches on GPU"
type: boolean
default: true
wasm_client_bench:
@@ -50,7 +54,7 @@ jobs:
integer_bench, integer_multi_bit_bench,
signed_integer_bench, signed_integer_multi_bit_bench,
integer_gpu_bench, integer_multi_bit_gpu_bench,
pbs_bench, wasm_client_bench ]
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
@@ -60,7 +64,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@62f4729b5df35e6e0e01265fa70a82ccaf196b4b
uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11
with:
files_yaml: |
common_benches:
@@ -98,10 +102,10 @@ jobs:
- tfhe/src/integer/**
- tfhe/benches/integer/signed_bench.rs
- .github/workflows/signed_integer_multi_bit_benchmark.yml
pbs_bench:
core_crypto_bench:
- tfhe/src/core_crypto/**
- tfhe/benches/core_crypto/**
- .github/workflows/pbs_benchmark.yml
- .github/workflows/core_crypto_benchmark.yml
wasm_client_bench:
- tfhe/web_wasm_parallel_tests/**
- .github/workflows/wasm_client_benchmark.yml

View File

@@ -24,8 +24,9 @@ jobs:
if: ${{ (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
command: [ boolean_bench, shortint_full_bench, integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
pbs_bench, wasm_client_bench ]
command: [ boolean_bench, shortint_full_bench,
integer_full_bench, signed_integer_full_bench, integer_gpu_full_bench,
core_crypto_bench, core_crypto_gpu_bench, wasm_client_bench ]
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs

View File

@@ -17,7 +17,7 @@ jobs:
with:
fetch-depth: 0
- name: Save repo
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: repo-archive
path: '.'

View File

@@ -1,55 +0,0 @@
# Trigger an AWS build each time commits are pushed to a pull request.
name: PR AWS build trigger
on:
pull_request:
pull_request_review:
types: [submitted]
jobs:
trigger-tests:
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- name: Get current labels
uses: snnaplab/get-labels-action@f426df40304808ace3b5282d4f036515f7609576
- name: Remove approved label
if: ${{ github.event_name == 'pull_request' && contains(fromJSON(env.LABELS), 'approved') }}
uses: actions-ecosystem/action-remove-labels@2ce5d41b4b6aa8503e285553f75ed56e0a40bae0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
labels: approved
- name: Launch fast tests
if: ${{ github.event_name == 'pull_request' }}
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
with:
allow-repeats: true
message: |
@slab-ci cpu_fast_test
@slab-ci gpu_test
- name: Add approved label
uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
labels: approved
# PR label 'approved' presence is checked to avoid running the full test suite several times
# in case of multiple approvals without new commits in between.
- name: Launch full tests suite
if: ${{ github.event_name == 'pull_request_review' && github.event.review.state == 'approved' && !contains(fromJSON(env.LABELS), 'approved') }}
uses: mshick/add-pr-comment@a65df5f64fc741e91c59b8359a4bc56e57aaf5b1
with:
allow-repeats: true
message: |
Pull Request has been approved :tada:
Launching full test suite...
@slab-ci cpu_test
@slab-ci cpu_unsigned_integer_test
@slab-ci cpu_signed_integer_test
@slab-ci cpu_wasm_test
@slab-ci csprng_randomness_testing

View File

@@ -32,6 +32,8 @@ env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
jobs:
run-wasm-client-benchmarks:
@@ -61,10 +63,9 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
uses: dtolnay/rust-toolchain@be73d7920c329f220ce78e0234b8f96b7ae60248
with:
toolchain: nightly
override: true
- name: Run benchmarks
run: |
@@ -97,7 +98,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@1eb3cb2b3e0f29609092a73eb033bb759a334595
uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}

3
.gitignore vendored
View File

@@ -19,3 +19,6 @@ dieharder_run.log
# Coverage reports
/coverage/
# Cuda local build
backends/tfhe-cuda-backend/cuda/cmake-build-debug/

View File

@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng"]
members = ["tfhe", "tasks", "apps/trivium", "concrete-csprng", "backends/tfhe-cuda-backend"]
[profile.bench]
lto = "fat"

196
Makefile
View File

@@ -17,6 +17,7 @@ FAST_TESTS?=FALSE
FAST_BENCH?=FALSE
BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=20
FORWARD_COMPAT?=OFF
# sed: -n, do not print input stream, -e means a script/expression
# 1,/version/ indicates from the first line, to the line matching version at the start of the line
# p indicates to print, so we keep only the start of the Cargo.toml until we hit the first version
@@ -49,12 +50,18 @@ else
COVERAGE_ONLY=
endif
ifeq ($(FORWARD_COMPAT),ON)
FORWARD_COMPAT_FEATURE=forward_compatibility
else
FORWARD_COMPAT_FEATURE=
endif
# Variables used only for regex_engine example
REGEX_STRING?=''
REGEX_PATTERN?=''
# tfhe-cuda-backend
TFHECUDA_SRC="backends/tfhe-cuda-backend/implementation"
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# Exclude these files from coverage reports
@@ -137,6 +144,11 @@ check_linelint_installed:
@printf "\n" | linelint - > /dev/null 2>&1 || \
( echo "Unable to locate linelint. Try installing it: https://github.com/fernandrone/linelint/releases" && exit 1 )
.PHONY: check_actionlint_installed # Check if actionlint workflow linter is installed
check_actionlint_installed:
@actionlint --version > /dev/null 2>&1 || \
( echo "Unable to locate actionlint. Try installing it: https://github.com/rhysd/actionlint/releases" && exit 1 )
.PHONY: fmt # Format rust code
fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt
@@ -150,11 +162,17 @@ fmt_gpu: install_rs_check_toolchain
check_fmt: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
.PHONY: clippy_gpu # Run clippy lints on the gpu backend
.PHONY: check_fmt_gpu # Check rust and cuda code format
check_fmt_gpu: install_rs_check_toolchain
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" fmt --check
cd "$(TFHECUDA_SRC)" && ./format_tfhe_cuda_backend.sh -c
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer,shortint,gpu \
-p tfhe -- --no-deps -D warnings
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
@@ -164,6 +182,10 @@ fix_newline: check_linelint_installed
check_newline: check_linelint_installed
linelint .
.PHONY: lint_workflow # Run static linter on GitHub workflows
lint_workflow: check_actionlint_installed
actionlint
.PHONY: clippy_core # Run clippy lints on core_crypto with and without experimental features
clippy_core: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -172,6 +194,12 @@ clippy_core: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),nightly-avx512 \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),experimental,nightly-avx512 \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_boolean # Run clippy lints enabling the boolean features
clippy_boolean: install_rs_check_toolchain
@@ -239,6 +267,11 @@ clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_triviu
clippy_fast: clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core \
clippy_concrete_csprng
.PHONY: clippy_cuda_backend # Run clippy lints on the tfhe-cuda-backend
clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
@@ -277,6 +310,11 @@ build_tfhe_full: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --all-targets
.PHONY: build_tfhe_coverage # Build with test coverage enabled
build_tfhe_coverage: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) --tests
.PHONY: symlink_c_libs_without_fingerprint # Link the .a and .so files without the changing hash part in target
symlink_c_libs_without_fingerprint:
@./scripts/symlink_c_libs_without_fingerprint.sh \
@@ -286,15 +324,23 @@ symlink_c_libs_without_fingerprint:
.PHONY: build_c_api # Build the C API for boolean, shortint and integer
build_c_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_gpu # Build the C API for boolean, shortint and integer
build_c_api_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,gpu \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_c_api_experimental_deterministic_fft # Build the C API for boolean, shortint and integer with experimental deterministic FFT
build_c_api_experimental_deterministic_fft: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) build --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4 \
--features=$(TARGET_ARCH_FEATURE),boolean-c-api,shortint-c-api,high-level-c-api,experimental-force_fft_algo_dif4,$(FORWARD_COMPAT_FEATURE) \
-p $(TFHE_SPEC)
@"$(MAKE)" symlink_c_libs_without_fingerprint
.PHONY: build_web_js_api # Build the js API targeting the web browser
build_web_js_api: install_rs_build_toolchain install_wasm_pack
@@ -338,32 +384,40 @@ test_core_crypto_cov: install_rs_build_toolchain install_rs_check_toolchain inst
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache \
-p $(TFHE_SPEC) -- core_crypto::
@if [[ "$(AVX512_SUPPORT)" == "ON" ]]; then \
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/core_crypto_avx512 --line --engine llvm --timeout 500 \
--implicit-test-threads $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,__coverage,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- core_crypto::; \
--features=$(TARGET_ARCH_FEATURE),experimental,internal-keycache,$(AVX512_FEATURE) \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time core_crypto::; \
fi
.PHONY: test_cuda_backend # Run the internal tests of the CUDA backend
test_cuda_backend:
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Release -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON && \
make -j && \
make test
.PHONY: test_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_gpu: test_core_crypto_gpu test_integer_gpu
test_gpu: test_core_crypto_gpu test_integer_gpu test_cuda_backend
.PHONY: test_core_crypto_gpu # Run the tests of the core_crypto module including experimental on the gpu backend
test_core_crypto_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- core_crypto::gpu::
--features=$(TARGET_ARCH_FEATURE),gpu -p $(TFHE_SPEC) -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_rs_build_toolchain install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p tfhe -- integer::gpu::server_key::
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
@@ -375,8 +429,8 @@ test_boolean_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/boolean --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- boolean::
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time boolean::
.PHONY: test_c_api_rs # Run the rust tests for the C API
test_c_api_rs: install_rs_check_toolchain
@@ -392,19 +446,23 @@ test_c_api_c: build_c_api
.PHONY: test_c_api # Run all the tests for the C API
test_c_api: test_c_api_rs test_c_api_c
.PHONY: test_c_api_gpu # Run the C tests for the C API
test_c_api_gpu: build_c_api_gpu
./scripts/c_api_tests.sh --gpu
.PHONY: test_shortint_ci # Run the tests for shortint ci
test_shortint_ci: install_rs_build_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)"
--cargo-profile "$(CARGO_PROFILE)" --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_shortint_multi_bit_ci # Run the tests for shortint ci running only multibit tests
test_shortint_multi_bit_ci: install_rs_build_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/shortint-tests.sh --rust-toolchain $(CARGO_RS_BUILD_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_shortint # Run all the tests for shortint
test_shortint: install_rs_build_toolchain
@@ -416,15 +474,16 @@ test_shortint_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/shortint --line --engine llvm --timeout 500 \
$(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,__coverage \
-p $(TFHE_SPEC) -- shortint::
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time shortint::
.PHONY: test_integer_ci # Run the tests for integer ci
test_integer_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)"
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_ci # Run the tests for unsigned integer ci
test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -432,7 +491,7 @@ test_unsigned_integer_ci: install_rs_check_toolchain install_cargo_nextest
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_ci # Run the tests for signed integer ci
test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
@@ -440,14 +499,15 @@ test_signed_integer_ci: install_rs_check_toolchain install_cargo_nextest
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --avx512-support "$(AVX512_SUPPORT)" \
--signed-only
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_multi_bit_ci # Run the tests for integer ci running only multibit tests
test_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)"
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--tfhe-package "$(TFHE_SPEC)"
.PHONY: test_unsigned_integer_multi_bit_ci # Run the tests for unsigned integer ci running only multibit tests
test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -455,7 +515,7 @@ test_unsigned_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nex
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--unsigned-only
--unsigned-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_signed_integer_multi_bit_ci # Run the tests for signed integer ci running only multibit tests
test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nextest
@@ -463,7 +523,7 @@ test_signed_integer_multi_bit_ci: install_rs_check_toolchain install_cargo_nexte
FAST_TESTS="$(FAST_TESTS)" \
./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --avx512-support "$(AVX512_SUPPORT)" \
--signed-only
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_safe_deserialization # Run the tests for safe deserialization
test_safe_deserialization: install_rs_build_toolchain install_cargo_nextest
@@ -475,18 +535,44 @@ test_integer: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache -p $(TFHE_SPEC) -- integer::
.PHONY: test_integer_cov # Run the tests of the integer module with code coverage
test_integer_cov: install_rs_check_toolchain install_tarpaulin
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) tarpaulin --profile $(CARGO_PROFILE) \
--out xml --output-dir coverage/integer --line --engine llvm --timeout 500 \
--implicit-test-threads \
--exclude-files $(COVERAGE_EXCLUDED_FILES) \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache \
-p $(TFHE_SPEC) -- -Z unstable-options --report-time integer::
.PHONY: test_high_level_api # Run all the tests for high_level_api
test_high_level_api: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- high_level_api::
# Uses cargo-nextest with a filter expression so that only tests whose name
# matches high_level_api::.*gpu.* are run, with the gpu feature enabled.
.PHONY: test_high_level_api_gpu # Run the GPU tests for high_level_api
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
		--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) \
		-E "test(/high_level_api::.*gpu.*/)"
.PHONY: test_user_doc # Run tests from the .md documentation
test_user_doc: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_fhe_strings # Run tests for fhe_strings example
test_fhe_strings: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--example fhe_strings \
--features=$(TARGET_ARCH_FEATURE),integer
.PHONY: test_regex_engine # Run tests for regex_engine example
test_regex_engine: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -530,7 +616,7 @@ docs: doc
lint_doc: install_rs_check_toolchain
RUSTDOCFLAGS="--html-in-header katex-header.html -Dwarnings" \
cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" doc \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p tfhe --no-deps
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -p $(TFHE_SPEC) --no-deps
.PHONY: lint_docs # Build rust doc with linting enabled alias for lint_doc
lint_docs: lint_doc
@@ -555,6 +641,16 @@ check_compile_tests:
./scripts/c_api_tests.sh --build-only; \
fi
.PHONY: check_compile_tests_benches_gpu # Build GPU tests and benchmarks in debug without running them
check_compile_tests_benches_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=$(TARGET_ARCH_FEATURE),experimental,boolean,shortint,integer,internal-keycache,gpu \
-p $(TFHE_SPEC)
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
cmake .. -DCMAKE_BUILD_TYPE=Debug -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON -DTFHE_CUDA_BACKEND_BUILD_BENCHMARKS=ON && \
make -j
.PHONY: build_nodejs_test_docker # Build a docker image with tools to run nodejs tests for wasm API
build_nodejs_test_docker:
DOCKER_BUILDKIT=1 docker build --build-arg RUST_TOOLCHAIN="$(RS_BUILD_TOOLCHAIN)" \
@@ -607,21 +703,21 @@ bench_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer # Run benchmarks for signed integer
bench_signed_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
@@ -629,7 +725,7 @@ bench_integer_multi_bit: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_signed_integer_multi_bit # Run benchmarks for signed integer using multi-bit parameters
bench_signed_integer_multi_bit: install_rs_check_toolchain
@@ -637,7 +733,7 @@ bench_signed_integer_multi_bit: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit_gpu # Run benchmarks for integer on GPU backend using multi-bit parameters
bench_integer_multi_bit_gpu: install_rs_check_toolchain
@@ -645,25 +741,25 @@ bench_integer_multi_bit_gpu: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,$(AVX512_FEATURE) -p tfhe --
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_shortint # Run benchmarks for shortint
bench_shortint: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_oprf # Run benchmarks for OPRF on shortint and integer
bench_oprf: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p tfhe
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench oprf-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,$(AVX512_FEATURE) -p tfhe
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
@@ -673,20 +769,38 @@ bench_shortint_multi_bit: install_rs_check_toolchain
__TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench shortint-bench \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC) --
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_boolean # Run benchmarks for boolean
bench_boolean: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench boolean-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
--features=$(TARGET_ARCH_FEATURE),boolean,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_pbs # Run benchmarks for PBS
bench_pbs: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,$(AVX512_FEATURE) -p $(TFHE_SPEC)
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_pbs_gpu # Run benchmarks for PBS on GPU backend
bench_pbs_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench pbs-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_ks # Run benchmarks for keyswitch
bench_ks: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench ks-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_ks_gpu # Run benchmarks for keyswitch on GPU backend
bench_ks_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench ks-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
bench_web_js_api_parallel: build_web_js_api_parallel
@@ -703,7 +817,7 @@ ci_bench_web_js_api_parallel: build_web_js_api_parallel
#
.PHONY: gen_key_cache # Run the script to generate keys and cache them for shortint tests
gen_key_cache: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
RUSTFLAGS="$(RUSTFLAGS) --cfg tarpaulin" cargo $(CARGO_RS_BUILD_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
--example generates_test_keys \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,internal-keycache -- \
$(MULTI_BIT_ONLY) $(COVERAGE_ONLY)
@@ -779,7 +893,7 @@ sha256_bool: install_rs_check_toolchain
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_all check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: pcc clippy_gpu
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc clippy_fast check_compile_tests

185
README.md
View File

@@ -2,36 +2,66 @@
<!-- product name logo -->
<img width=600 src="https://user-images.githubusercontent.com/5758427/231206749-8f146b97-3c5a-4201-8388-3ffa88580415.png">
</p>
<hr/>
<p align="center">
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Read documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a>
</p>
<p align="center">
<!-- Version badge using shields.io -->
<a href="https://github.com/zama-ai/tfhe-rs/releases">
<img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square">
</a>
<!-- Zama Bounty Program -->
<a href="https://github.com/zama-ai/bounty-program">
<img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-yellow?style=flat-square">
</a>
</p>
<hr/>
<p align="center">
<a href="https://docs.zama.ai/tfhe-rs"> 📒 Documentation</a> | <a href="https://zama.ai/community"> 💛 Community support</a> | <a href="https://github.com/zama-ai/awesome-zama"> 📚 FHE resources by Zama</a>
</p>
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer
arithmetics over encrypted data. It includes:
- a **Rust** API
- a **C** API
- and a **client-side WASM** API
**TFHE-rs** is meant for developers and researchers who want full control over
what they can do with TFHE, while not having to worry about the low level
<p align="center">
<a href="https://github.com/zama-ai/tfhe-rs/releases"><img src="https://img.shields.io/github/v/release/zama-ai/tfhe-rs?style=flat-square"></a>
<a href="LICENSE"><img src="https://img.shields.io/badge/License-BSD--3--Clause--Clear-%23ffb243?style=flat-square"></a>
<a href="https://github.com/zama-ai/bounty-program"><img src="https://img.shields.io/badge/Contribute-Zama%20Bounty%20Program-%23ffd208?style=flat-square"></a>
</p>
## About
### What is TFHE-rs
**TFHE-rs** is a pure Rust implementation of TFHE for boolean and integer arithmetics over encrypted data.
It includes:
- a **Rust** API
- a **C** API
- and a **client-side WASM** API
TFHE-rs is designed for developers and researchers who want full control over
what they can do with TFHE, while not having to worry about the low-level
implementation. The goal is to have a stable, simple, high-performance, and
production-ready library for all the advanced features of TFHE.
<br></br>
### Main features
- **Low-level cryptographic library** that implements Zama's variant of TFHE, including programmable bootstrapping
- **Implementation of the original TFHE boolean API** that can be used as a drop-in replacement for other TFHE libraries
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
<br></br>
## Table of Contents
- **[Getting Started](#getting-started)**
- [Cargo.toml configuration](#cargotoml-configuration)
- [A simple example](#a-simple-example)
- **[Resources](#resources)**
- [TFHE deep dive](#tfhe-deep-dive)
- [Tutorials](#tutorials)
- [Documentation](#documentation)
- **[Working with TFHE-rs](#working-with-tfhe-rs)**
- [Disclaimers](#disclaimers)
- [Citations](#citations)
- [Contributing](#contributing)
- [License](#license)
- **[Support](#support)**
<br></br>
## Getting Started
The steps to run a first example are described below.
### Cargo.toml configuration
To use the latest version of `TFHE-rs` in your project, you first need to add it as a dependency in your `Cargo.toml`:
@@ -47,20 +77,24 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64-un
```toml
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "aarch64-unix"] }
```
Note: users with ARM devices must compile `TFHE-rs` using a stable toolchain with version >= 1.72.
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND)
running Windows:
+ For x86_64-based machines with the [`rdseed instruction`](https://en.wikipedia.org/wiki/RDRAND) running Windows:
```toml
tfhe = { version = "*", features = ["boolean", "shortint", "integer", "x86_64"] }
```
Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
> [!Note]
> Note: You need to use a Rust version >= 1.73 to compile TFHE-rs.
> [!Note]
> Note: aarch64-based machines are not yet supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
## A simple example
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
### A simple example
Here is a full example:
@@ -117,32 +151,64 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
To run this code, use the following command:
<p align="center"> <code> cargo run --release </code> </p>
Note that when running code that uses `tfhe-rs`, it is highly recommended
> [!Note]
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performance possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
## Contributing
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
There are two ways to contribute to TFHE-rs:
- you can open issues to report bugs or typos, or to suggest new ideas
- you can ask to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
(becoming an approved contributor involves signing our Contributor License Agreement (CLA))
Only approved contributors can send pull requests, so please make sure to get in touch before you do!
## Resources
## Credits
### TFHE deep dive
- [TFHE Deep Dive - Part I - Ciphertext types](https://www.zama.ai/post/tfhe-deep-dive-part-1)
- [TFHE Deep Dive - Part II - Encodings and linear leveled operations](https://www.zama.ai/post/tfhe-deep-dive-part-2)
- [TFHE Deep Dive - Part III - Key switching and leveled multiplications](https://www.zama.ai/post/tfhe-deep-dive-part-3)
- [TFHE Deep Dive - Part IV - Programmable Bootstrapping](https://www.zama.ai/post/tfhe-deep-dive-part-4)
<br></br>
This library uses several dependencies and we would like to thank the contributors of those
libraries.
### Tutorials
- [Homomorphic Parity Bit](https://docs.zama.ai/tfhe-rs/tutorials/parity_bit)
- [Homomorphic Case Changing on Ascii String](https://docs.zama.ai/tfhe-rs/tutorials/ascii_fhe_string)
- [Boolean SHA256 with TFHE-rs](https://www.zama.ai/post/boolean-sha256-tfhe-rs)
- [Dark Market with TFHE-rs](https://www.zama.ai/post/dark-market-tfhe-rs)
- [Regular Expression Engine with TFHE-rs](https://www.zama.ai/post/regex-engine-tfhe-rs)
## Need support?
<a target="_blank" href="https://community.zama.ai">
<img src="https://user-images.githubusercontent.com/5758427/231115030-21195b55-2629-4c01-9809-be5059243999.png">
</a>
## Citing TFHE-rs
*Explore more useful resources in [TFHE-rs tutorials](https://docs.zama.ai/tfhe-rs/tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama)*
<br></br>
### Documentation
Full, comprehensive documentation is available here: [https://docs.zama.ai/tfhe-rs](https://docs.zama.ai/tfhe-rs).
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
## Working with TFHE-rs
### Disclaimers
#### Security Estimation
Security estimations are done using the
[Lattice Estimator](https://github.com/malb/lattice-estimator)
with `red_cost_model = reduction.RC.BDGL16`.
When a new update is published in the Lattice Estimator, we update parameters accordingly.
#### Side-Channel Attacks
Mitigation for side-channel attacks has not yet been implemented in TFHE-rs,
and will be released in upcoming versions.
<br></br>
### Citations
To cite TFHE-rs in academic papers, please use the following entry:
```text
@@ -154,22 +220,31 @@ To cite TFHE-rs in academic papers, please use the following entry:
}
```
## License
### Contributing
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.ai`.
There are two ways to contribute to TFHE-rs:
## Disclaimers
- [Open issues](https://github.com/zama-ai/tfhe-rs/issues/new/choose) to report bugs and typos, or to suggest new ideas
- Request to become an official contributor by emailing [hello@zama.ai](mailto:hello@zama.ai).
### Security Estimation
Becoming an approved contributor involves signing our Contributor License Agreement (CLA). Only approved contributors can send pull requests, so please make sure to get in touch before you do!
<br></br>
Security estimations are done using the
[Lattice Estimator](https://github.com/malb/lattice-estimator)
with `red_cost_model = reduction.RC.BDGL16`.
### License
This software is distributed under the **BSD-3-Clause-Clear** license. If you have any questions, please contact us at hello@zama.ai.
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>
When a new update is published in the Lattice Estimator, we update parameters accordingly.
### Side-Channel Attacks
## Support
Mitigation for side channel attacks have not yet been implemented in TFHE-rs,
and will be released in upcoming versions.
<a target="_blank" href="https://community.zama.ai">
<img src="https://github.com/zama-ai/tfhe-rs/assets/157474013/8da6cf5b-51a0-4c86-9e75-fd0e4a4c64a4">
</a>
🌟 If you find this project helpful or interesting, please consider giving it a star on GitHub! Your support helps to grow the community and motivates further development.
<p align="right">
<a href="#about" > ↑ Back to top </a>
</p>

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.1.2"
version = "0.2.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -30,17 +30,17 @@ The cryptographic operations it provides are:
## Build
The Cuda project held in `tfhe-cuda-backend` can be compiled independently from Concrete in the
following way:
The Cuda project held in `tfhe-cuda-backend` can be compiled independently from TFHE-rs in the following way:
```
git clone git@github.com:zama-ai/tfhe-rs
cd backends/tfhe-cuda-backend/implementation
cd backends/tfhe-cuda-backend/cuda
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (with the first GPU information) and set accordingly.
If your machine does not have an available Nvidia GPU, the compilation will work if you have the nvcc compiler installed. The generated executable will target a 7.0 compute capability (sm_70).
## Links

View File

@@ -0,0 +1 @@
/build/

View File

@@ -1,5 +1,5 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(tfhe_cuda_backend LANGUAGES CXX CUDA)
project(tfhe_cuda_backend LANGUAGES CXX)
# See if the minimum CUDA version is available. If not, only enable documentation building.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
@@ -56,9 +56,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}")
set(CMAKE_CUDA_ARCHITECTURES native)
if(NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
if(${CUDA_SUCCESS})
set(CMAKE_CUDA_ARCHITECTURES native)
else()
set(CMAKE_CUDA_ARCHITECTURES 70)
endif()
# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
@@ -70,10 +71,13 @@ set(CMAKE_CUDA_FLAGS
set(INCLUDE_DIR include)
add_subdirectory(src)
enable_testing()
add_subdirectory(tests_and_benchmarks)
target_include_directories(tfhe_cuda_backend PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS tfhe_cuda_backend DESTINATION .)
install(TARGETS tfhe_cuda_backend DESTINATION lib)
# Define a function to add a lint target.
@@ -85,5 +89,3 @@ if(CPPLINT)
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()

View File

@@ -1,6 +1,19 @@
#!/bin/bash
find ./{include,src} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
set -e
find ./{include,src} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
while getopts ":c" option; do
case $option in
c)
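# -c (check mode): clang-format runs as a dry run that errors on any diff, cmake-format is applied, then `git diff --exit-code` fails if anything changed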
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file' --dry-run --Werror
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'
git diff --exit-code
exit
;;
esac
done
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-15 -i -style='file'
cmake-format -i CMakeLists.txt -c .cmake-format-config.py
find ./{include,src,tests_and_benchmarks/tests,tests_and_benchmarks/benchmarks} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c .cmake-format-config.py'

View File

@@ -5,6 +5,7 @@
#include <cstdint>
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
enum PBS_VARIANT { DEFAULT = 0, FAST = 1 };
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
@@ -40,7 +41,7 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
@@ -48,19 +49,19 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
@@ -68,21 +69,24 @@ void scratch_cuda_bootstrap_low_latency_64(
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_bootstrap_low_latency_32(cuda_stream_t *stream,
int8_t **pbs_buffer);
void cleanup_cuda_bootstrap_low_latency_64(cuda_stream_t *stream,
int8_t **pbs_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -93,6 +97,212 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
// Number of bytes requested for step one of the low latency PBS when the
// whole working set is counted: one polynomial of Torus coefficients
// (accumulator_rotated) plus polynomial_size / 2 double2 values
// (accumulator fft).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
    uint32_t polynomial_size) {
  const auto accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  const auto accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_fft_bytes;
}
// Number of bytes requested for step two of the low latency PBS when the
// whole working set is counted: one polynomial of Torus coefficients
// (accumulator) plus polynomial_size / 2 double2 values (accumulator fft).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
    uint32_t polynomial_size) {
  const auto accumulator_bytes = sizeof(Torus) * polynomial_size;
  const auto accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_bytes + accumulator_fft_bytes;
}
// Number of bytes of the reduced ("partial") working set of the low latency
// PBS: only the accumulator fft, i.e. polynomial_size / 2 double2 values.
// The Torus template parameter does not influence the result.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
  const auto accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
// Number of bytes of the full working set of the fast low latency PBS: two
// polynomials of Torus coefficients (accumulator_rotated and accumulator)
// plus polynomial_size / 2 double2 values (accumulator fft).
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
  const auto accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  const auto accumulator_bytes = sizeof(Torus) * polynomial_size;
  const auto accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_bytes + accumulator_fft_bytes;
}
// Number of bytes of the reduced ("partial") working set of the fast low
// latency PBS: only the accumulator fft (mask & body), i.e.
// polynomial_size / 2 double2 values. The Torus template parameter does not
// influence the result.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
    uint32_t polynomial_size) {
  const auto accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
// Device-side scratch buffers for the low latency PBS.
//
// The constructor sizes and allocates the buffers on `stream` according to
// the selected PBS_VARIANT; release() frees whatever was allocated.
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::LOW_LAT> {
  // Every pointer starts as nullptr so release() is well defined even when
  // the constructor allocated nothing (allocate_gpu_memory == false) or when
  // the selected variant does not use a given buffer (the FAST variant never
  // allocates global_accumulator).
  int8_t *d_mem = nullptr;
  Torus *global_accumulator = nullptr;
  double2 *global_accumulator_fft = nullptr;

  PBS_VARIANT pbs_variant;

  // stream                     - stream used for the asynchronous allocations
  // glwe_dimension             - allocations are scaled by glwe_dimension + 1
  // polynomial_size            - number of coefficients per polynomial
  // level_count                - number of decomposition levels
  // input_lwe_ciphertext_count - number of inputs handled per call
  // pbs_variant                - DEFAULT or FAST; anything else panics
  // allocate_gpu_memory        - when false, nothing is allocated and all
  //                              pointers stay nullptr
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
             bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;

    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);

    if (allocate_gpu_memory) {
      // Promote to 64 bits before multiplying: the original expressions
      // multiplied several uint32_t values together and only widened at the
      // final `* sizeof(...)`, so the element count could wrap around for
      // large batches.
      const uint64_t glwe_size = glwe_dimension + 1;

      switch (pbs_variant) {
      case PBS_VARIANT::DEFAULT: {
        uint64_t full_sm_step_one =
            get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
                polynomial_size);
        uint64_t full_sm_step_two =
            get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(
                polynomial_size);

        uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
        uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
        uint64_t full_dm = full_sm_step_one;

        // Size of the extra device memory needed when the working set does
        // not fit in max_shared_memory.
        uint64_t device_mem = 0;
        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm_step_two) {
          device_mem =
              (partial_dm_step_two + partial_dm_step_one * level_count) *
              input_lwe_ciphertext_count * (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm_step_one) {
          device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
                       level_count * (glwe_dimension + 1);
        }
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_size * level_count * input_lwe_ciphertext_count *
                (polynomial_size / 2) * sizeof(double2),
            stream);

        global_accumulator = (Torus *)cuda_malloc_async(
            glwe_size * input_lwe_ciphertext_count * polynomial_size *
                sizeof(Torus),
            stream);
      } break;
      case PBS_VARIANT::FAST: {
        uint64_t full_sm =
            get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
                polynomial_size);
        uint64_t partial_sm =
            get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
                polynomial_size);

        uint64_t partial_dm = full_sm - partial_sm;
        uint64_t full_dm = full_sm;

        uint64_t device_mem = 0;
        if (max_shared_memory < partial_sm) {
          device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        } else if (max_shared_memory < full_sm) {
          device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                       (glwe_dimension + 1);
        }
        // Otherwise, both kernels run all in shared memory
        d_mem = (int8_t *)cuda_malloc_async(device_mem, stream);

        // Same evaluation order as before (product first, then / 2), only
        // carried out in 64 bits.
        global_accumulator_fft = (double2 *)cuda_malloc_async(
            glwe_size * level_count * input_lwe_ciphertext_count *
                polynomial_size / 2 * sizeof(double2),
            stream);
      } break;
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  // Frees on `stream` every buffer that was actually allocated and resets the
  // pointers, so a buffer built with allocate_gpu_memory == false (or
  // released twice) never hands an indeterminate pointer to cuda_drop_async.
  void release(cuda_stream_t *stream) {
    if (d_mem != nullptr) {
      cuda_drop_async(d_mem, stream);
      d_mem = nullptr;
    }
    if (global_accumulator_fft != nullptr) {
      cuda_drop_async(global_accumulator_fft, stream);
      global_accumulator_fft = nullptr;
    }
    // Only the DEFAULT variant allocates global_accumulator; for the other
    // variants it is still nullptr.
    if (global_accumulator != nullptr) {
      cuda_drop_async(global_accumulator, stream);
      global_accumulator = nullptr;
    }
  }
};
// Returns the number of bytes required by the fast low latency PBS scratch
// buffer, rounded up to a multiple of sizeof(double2).
//
// glwe_dimension             - sizes are scaled by glwe_dimension + 1
// polynomial_size            - number of coefficients per polynomial
// level_count                - number of decomposition levels
// input_lwe_ciphertext_count - number of inputs handled per call
// max_shared_memory          - shared memory budget (bytes) compared against
//                              the partial / full working set sizes
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {

  uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
      polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
          polynomial_size);
  uint64_t partial_dm = full_sm - partial_sm;
  uint64_t full_dm = full_sm;

  // Extra device memory needed when the working set does not fit in
  // max_shared_memory; stays 0 when everything fits.
  uint64_t device_mem = 0;
  if (max_shared_memory < partial_sm) {
    device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  } else if (max_shared_memory < full_sm) {
    device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
                 (glwe_dimension + 1);
  }

  // Promote to 64 bits before multiplying: the original product of four
  // uint32_t values could wrap around before being widened by sizeof().
  // The evaluation order (product first, then / 2) is unchanged.
  const uint64_t glwe_size = glwe_dimension + 1;
  const uint64_t accumulator_fft_bytes =
      glwe_size * level_count * input_lwe_ciphertext_count * polynomial_size /
      2 * sizeof(double2);

  uint64_t buffer_size = device_mem + accumulator_fft_bytes;

  // Round up to the next multiple of sizeof(double2). The previous
  // `buffer_size + buffer_size % sizeof(double2)` only produced an aligned
  // size when the remainder was 0 or exactly half of sizeof(double2); in
  // those two cases this returns the same value as before.
  const uint64_t misalignment = buffer_size % sizeof(double2);
  if (misalignment != 0)
    buffer_size += sizeof(double2) - misalignment;
  return buffer_size;
}
// Declaration only; the definition is not part of this header section.
// Presumably reports whether the "fast" low latency PBS can be used for the
// given parameters and shared memory budget -- TODO confirm against the
// definition.
template <typename Torus>
bool has_support_to_cuda_bootstrap_fast_low_latency(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
// Declaration only. Typed (Torus) entry point of the "fast" low latency PBS;
// takes the structured pbs_buffer<Torus, LOW_LAT> scratch object.
template <typename Torus>
void cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
// Declaration only. Typed (Torus) entry point of the low latency PBS; same
// parameter list as the "fast" declaration above.
template <typename Torus>
void cuda_bootstrap_low_latency_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
uint32_t lwe_idx, uint32_t max_shared_memory);
// Declaration only. Receives a pointer-to-pointer to the scratch object;
// presumably creates the pbs_buffer used by the "fast" low latency PBS and
// stores it in *pbs_buffer -- TODO confirm against the definition.
template <typename Torus, typename STorus>
void scratch_cuda_fast_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
// Declaration only. Counterpart of the declaration above for the low latency
// PBS; the output parameter is named `buffer` here.
template <typename Torus, typename STorus>
void scratch_cuda_bootstrap_low_latency(
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,

View File

@@ -1,23 +1,22 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include "bootstrap.h"
#include <cstdint>
extern "C" {
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -25,8 +24,118 @@ void scratch_cuda_multi_bit_pbs_64(
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void scratch_cuda_generic_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
void cuda_generic_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **pbs_buffer);
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **pbs_buffer);
}
// Declaration only; the definition is not part of this header section.
// Receives a pointer-to-pointer to the multi-bit scratch object; presumably
// creates the pbs_buffer used by the "fast" multi-bit PBS -- TODO confirm
// against the definition. lwe_chunk_size defaults to 0.
template <typename Torus, typename STorus>
void scratch_cuda_fast_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
// Declaration only. Typed (Torus) entry point of the "fast" multi-bit PBS;
// takes the structured pbs_buffer<Torus, MULTI_BIT> scratch object.
// lwe_chunk_size defaults to 0.
template <typename Torus>
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0);
// Declaration only. Same parameter list as scratch_cuda_fast_multi_bit_pbs,
// for the multi-bit PBS without the "fast" prefix.
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
// Declaration only. Same parameter list as
// cuda_fast_multi_bit_pbs_lwe_ciphertext_vector, for the multi-bit PBS
// without the "fast" prefix.
template <typename Torus>
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0);
// Device-side scratch buffers for the multi-bit PBS.
//
// The constructor allocates the three buffers on `stream` (same sizes for the
// DEFAULT and FAST variants); release() frees whatever was allocated.
template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
  // Every pointer starts as nullptr so release() is well defined even when
  // the constructor allocated nothing (allocate_gpu_memory == false).
  double2 *keybundle_fft = nullptr;
  Torus *global_accumulator = nullptr;
  double2 *global_accumulator_fft = nullptr;

  PBS_VARIANT pbs_variant;

  // stream                     - stream used for the asynchronous allocations
  // glwe_dimension             - allocations are scaled by glwe_dimension + 1
  // polynomial_size            - number of coefficients per polynomial
  // level_count                - number of decomposition levels
  // input_lwe_ciphertext_count - number of inputs handled per call
  // lwe_chunk_size             - scales the size of keybundle_fft
  // pbs_variant                - DEFAULT or FAST; anything else panics
  // allocate_gpu_memory        - when false, nothing is allocated and all
  //                              pointers stay nullptr
  pbs_buffer(cuda_stream_t *stream, uint32_t glwe_dimension,
             uint32_t polynomial_size, uint32_t level_count,
             uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
             PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
    this->pbs_variant = pbs_variant;

    // The value is not used by this specialization. The call is kept because
    // the body of cuda_get_max_shared_memory is not visible here and it may
    // have side effects the original code relied on.
    auto max_shared_memory = cuda_get_max_shared_memory(stream->gpu_index);
    (void)max_shared_memory;

    if (allocate_gpu_memory) {
      switch (pbs_variant) {
      case DEFAULT:
      case FAST: {
        // Promote to 64 bits before multiplying: the original expressions
        // multiplied several uint32_t values together and only widened at the
        // final `* sizeof(...)`, so the element count could wrap around for
        // large batches / chunk sizes.
        const uint64_t num_inputs = input_lwe_ciphertext_count;
        const uint64_t glwe_size = glwe_dimension + 1;
        const uint64_t half_polynomial_size = polynomial_size / 2;

        keybundle_fft = (double2 *)cuda_malloc_async(
            num_inputs * lwe_chunk_size * level_count * glwe_size * glwe_size *
                half_polynomial_size * sizeof(double2),
            stream);
        global_accumulator = (Torus *)cuda_malloc_async(
            num_inputs * glwe_size * polynomial_size * sizeof(Torus), stream);
        global_accumulator_fft = (double2 *)cuda_malloc_async(
            num_inputs * glwe_size * level_count * half_polynomial_size *
                sizeof(double2),
            stream);
      } break;
      default:
        PANIC("Cuda error (PBS): unsupported implementation variant.")
      }
    }
  }

  // Frees on `stream` every buffer that was actually allocated and resets the
  // pointers, so a buffer built with allocate_gpu_memory == false (or
  // released twice) never hands an indeterminate pointer to cuda_drop_async.
  void release(cuda_stream_t *stream) {
    if (keybundle_fft != nullptr) {
      cuda_drop_async(keybundle_fft, stream);
      keybundle_fft = nullptr;
    }
    if (global_accumulator != nullptr) {
      cuda_drop_async(global_accumulator, stream);
      global_accumulator = nullptr;
    }
    if (global_accumulator_fft != nullptr) {
      cuda_drop_async(global_accumulator_fft, stream);
      global_accumulator_fft = nullptr;
    }
  }
};
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,

View File

@@ -11,6 +11,22 @@
extern "C" {
// Wraps a CUDA runtime call: forwards its result together with the call
// site's file and line to cuda_error().
#define check_cuda_error(ans)                                                  \
  { cuda_error((ans), __FILE__, __LINE__); }

// Does nothing when `code` is cudaSuccess. Otherwise prints the CUDA error
// string followed by `file` and `line` to stderr and aborts the process.
inline void cuda_error(cudaError_t code, const char *file, int line) {
  if (code == cudaSuccess)
    return;

  const char *description = cudaGetErrorString(code);
  std::fprintf(stderr, "Cuda error: %s %s %d\n", description, file, line);
  std::abort();
}
// Prints "<file>::<line>::<function>: panic." followed by the caller's
// printf-style message (`format` plus optional arguments) to stderr, then
// aborts the process.
// `format` must be a string literal: it is concatenated with the surrounding
// literals at compile time.
// NOTE: the macro expands to a plain brace block, and call sites in this
// code base invoke it without a trailing semicolon, so its shape must not be
// changed to `do { ... } while (0)` without updating those call sites.
#define PANIC(format, ...) \
{ \
std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \
__LINE__, __func__, ##__VA_ARGS__); \
std::abort(); \
}
struct cuda_stream_t {
cudaStream_t stream;
uint32_t gpu_index;
@@ -18,68 +34,58 @@ struct cuda_stream_t {
cuda_stream_t(uint32_t gpu_index) {
this->gpu_index = gpu_index;
cudaStreamCreate(&stream);
check_cuda_error(cudaStreamCreate(&stream));
}
void release() {
cudaSetDevice(gpu_index);
cudaStreamDestroy(stream);
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaStreamDestroy(stream));
}
void synchronize() { cudaStreamSynchronize(stream); }
void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(cuda_stream_t *stream);
void cuda_destroy_stream(cuda_stream_t *stream);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
int cuda_check_support_cooperative_groups();
bool cuda_check_support_cooperative_groups();
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream);
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream);
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
void cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cuda_stream_t *stream);
void cuda_drop_async(void *ptr, cuda_stream_t *stream);
int cuda_get_max_shared_memory(uint32_t gpu_index);
int cuda_synchronize_stream(cuda_stream_t *stream);
void cuda_synchronize_stream(cuda_stream_t *stream);
#define check_cuda_error(ans) \
{ cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort)
exit(code);
}
}
void cuda_stream_add_callback(cuda_stream_t *stream,
cudaStreamCallback_t callback, void *user_data);
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
void *host_pointer);
}
template <typename Torus>

View File

@@ -3,6 +3,7 @@
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "pbs/bootstrap.cuh"
#include <cassert>
#include <cmath>
#include <functional>
@@ -32,34 +33,6 @@ enum COMPARISON_TYPE {
};
enum IS_RELATIONSHIP { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };
/*
* generate bivariate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
/*
* generate univariate accumulator for device pointer
* v_stream - cuda stream
* acc - device pointer for univariate accumulator
* ...
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f);
extern "C" {
void scratch_cuda_full_propagation_64(
cuda_stream_t *stream, int8_t **mem_ptr, uint32_t lwe_dimension,
@@ -226,6 +199,34 @@ void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
int8_t **mem_ptr_void);
}
/*
* Generate a bivariate accumulator (lut) into a device pointer.
* Declaration only; the definition is not part of this header section.
* stream - cuda stream
* acc_bivariate - device pointer for the bivariate accumulator
* glwe_dimension, polynomial_size, message_modulus, carry_modulus -
*   presumably the parameters that fix the accumulator layout and encoding
*   (TODO confirm against the definition)
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cuda_stream_t *stream, Torus *acc_bivariate, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f);
/*
* Generate a univariate accumulator (lut) into a device pointer.
* Declaration only; the definition is not part of this header section.
* stream - cuda stream
* acc - device pointer for the univariate accumulator
* glwe_dimension, polynomial_size, message_modulus, carry_modulus -
*   presumably the parameters that fix the accumulator layout and encoding
*   (TODO confirm against the definition)
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t message_modulus,
uint32_t carry_modulus,
std::function<Torus(Torus)> f);
struct int_radix_params {
PBS_TYPE pbs_type;
uint32_t glwe_dimension;
@@ -273,7 +274,7 @@ template <typename Torus> struct int_radix_lut {
uint32_t num_blocks;
bool mem_reuse = false;
int8_t *pbs_buffer;
int8_t *buffer;
Torus *lut_indexes;
Torus *lwe_indexes;
@@ -297,36 +298,16 @@ template <typename Torus> struct int_radix_lut {
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
///////////////
// PBS
if (params.pbs_type == MULTI_BIT) {
// Only 64 bits is supported
static_assert(
sizeof(Torus) == 8,
"Error (GPU multi bit PBS): only 64 bits Torus is supported");
scratch_cuda_multi_bit_pbs_64(
stream, &pbs_buffer, params.small_lwe_dimension,
params.glwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
} else {
// Classic
// We only use low latency for classic mode
if (sizeof(Torus) == sizeof(uint32_t))
scratch_cuda_bootstrap_low_latency_32(
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
params.pbs_level, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
else
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, params.glwe_dimension, params.polynomial_size,
params.pbs_level, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index), allocate_gpu_memory);
}
execute_scratch_pbs<Torus>(
stream, &buffer, params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_level, params.grouping_factor,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
params.pbs_type, allocate_gpu_memory);
if (allocate_gpu_memory) {
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this contructor
// this constructor
lut = (Torus *)cuda_malloc_async(num_luts * lut_buffer_size, stream);
lut_indexes = (Torus *)cuda_malloc_async(lut_indexes_size, stream);
@@ -344,8 +325,8 @@ template <typename Torus> struct int_radix_lut {
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_to_gpu(lwe_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus));
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus), stream);
free(h_lwe_indexes);
// Keyswitch
@@ -357,21 +338,17 @@ template <typename Torus> struct int_radix_lut {
// constructor to reuse memory
int_radix_lut(cuda_stream_t *stream, int_radix_params params,
uint32_t num_luts, uint32_t num_radix_blocks,
int_radix_lut<Torus> *base_lut_object) {
int_radix_lut *base_lut_object) {
this->params = params;
this->num_blocks = num_radix_blocks;
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus small_size =
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);
// base lut object should have bigger or equal memory than current one
assert(num_radix_blocks <= base_lut_object->num_blocks);
// pbs
pbs_buffer = base_lut_object->pbs_buffer;
buffer = base_lut_object->buffer;
// Keyswitch
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks;
@@ -398,8 +375,9 @@ template <typename Torus> struct int_radix_lut {
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes[i] = i;
cuda_memcpy_to_gpu(lwe_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus));
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes,
num_radix_blocks * sizeof(Torus), stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
}
@@ -408,13 +386,47 @@ template <typename Torus> struct int_radix_lut {
return &lut[ind * (params.glwe_dimension + 1) * params.polynomial_size];
}
Torus *get_tvi(size_t ind) { return &lut_indexes[ind]; }
Torus *get_lut_indexes(size_t ind) { return &lut_indexes[ind]; }
void release(cuda_stream_t *stream) {
cuda_drop_async(lut_indexes, stream);
cuda_drop_async(lwe_indexes, stream);
cuda_drop_async(lut, stream);
if (!mem_reuse) {
cuda_drop_async(pbs_buffer, stream);
switch (params.pbs_type) {
case MULTI_BIT:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_multi_bit_pbs_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case LOW_LAT:
switch (sizeof(Torus)) {
case sizeof(uint32_t):
cleanup_cuda_bootstrap_low_latency_32(stream, &buffer);
break;
case sizeof(uint64_t):
cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit "
"integer "
"moduli are supported.")
}
break;
case AMORTIZED:
cleanup_cuda_bootstrap_amortized(stream, &buffer);
break;
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
cuda_drop_async(tmp_lwe_before_ks, stream);
cuda_drop_async(tmp_lwe_after_ks, stream);
}
@@ -437,10 +449,10 @@ template <typename Torus> struct int_sc_prop_memory {
Torus *generates_or_propagates;
Torus *step_output;
// test_vector_array[2] = {lut_does_block_generate_carry,
// luts_array[2] = {lut_does_block_generate_carry,
// lut_does_block_generate_or_propagate}
int_radix_lut<Torus> *test_vector_array;
int_radix_lut<Torus> *lut_carry_propagation_sum;
int_radix_lut<Torus> *luts_array;
int_radix_lut<Torus> *luts_carry_propagation_sum;
int_radix_lut<Torus> *message_acc;
int_radix_params params;
@@ -461,7 +473,7 @@ template <typename Torus> struct int_sc_prop_memory {
step_output = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, stream);
// declare functions for test vector generation
// declare functions for lut generation
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
if (x >= message_modulus)
return OUTPUT_CARRY::GENERATED;
@@ -477,7 +489,7 @@ template <typename Torus> struct int_sc_prop_memory {
return OUTPUT_CARRY::NONE;
};
auto f_lut_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
auto f_luts_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
if (msb == OUTPUT_CARRY::PROPAGATED)
return lsb;
return msb;
@@ -487,18 +499,18 @@ template <typename Torus> struct int_sc_prop_memory {
return x % message_modulus;
};
// create test vector objects
test_vector_array = new int_radix_lut<Torus>(
stream, params, 2, num_radix_blocks, allocate_gpu_memory);
lut_carry_propagation_sum = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
message_acc = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
// create lut objects
luts_array = new int_radix_lut<Torus>(stream, params, 2, num_radix_blocks,
allocate_gpu_memory);
luts_carry_propagation_sum = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, luts_array);
message_acc = new int_radix_lut<Torus>(stream, params, 1, num_radix_blocks,
luts_array);
auto lut_does_block_generate_carry = test_vector_array->get_lut(0);
auto lut_does_block_generate_or_propagate = test_vector_array->get_lut(1);
auto lut_does_block_generate_carry = luts_array->get_lut(0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(1);
// generate test vectors
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
stream, lut_does_block_generate_carry, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_carry);
@@ -507,12 +519,13 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_or_propagate);
cuda_set_value_async<Torus>(&(stream->stream),
test_vector_array->get_tvi(1), 1,
luts_array->get_lut_indexes(1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
stream, lut_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_carry_propagation_sum);
stream, luts_carry_propagation_sum->lut, glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_carry_propagation_sum);
generate_device_accumulator<Torus>(stream, message_acc->lut, glwe_dimension,
polynomial_size, message_modulus,
@@ -523,12 +536,12 @@ template <typename Torus> struct int_sc_prop_memory {
cuda_drop_async(generates_or_propagates, stream);
cuda_drop_async(step_output, stream);
test_vector_array->release(stream);
lut_carry_propagation_sum->release(stream);
luts_array->release(stream);
luts_carry_propagation_sum->release(stream);
message_acc->release(stream);
delete test_vector_array;
delete lut_carry_propagation_sum;
delete luts_array;
delete luts_carry_propagation_sum;
delete message_acc;
}
};
@@ -537,10 +550,9 @@ template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
int_radix_lut<Torus> *test_vector_array; // lsb msb
int_radix_lut<Torus> *test_vector_message;
int_radix_lut<Torus> *test_vector_carry;
int_radix_lut<Torus> *luts_array; // lsb msb
int_radix_lut<Torus> *luts_message;
int_radix_lut<Torus> *luts_carry;
int_sc_prop_memory<Torus> *scp_mem;
int_radix_params params;
@@ -577,24 +589,20 @@ template <typename Torus> struct int_mul_memory {
stream);
small_lwe_vector = (Torus *)cuda_malloc_async(
total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream);
lwe_pbs_out_array =
(Torus *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
total_block_count * sizeof(Torus),
stream);
// create int_radix_lut objects for lsb, msb, message, carry
// test_vector_array -> lut = {lsb_acc, msb_acc}
test_vector_array = new int_radix_lut<Torus>(
stream, params, 2, total_block_count, allocate_gpu_memory);
test_vector_message = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, test_vector_array);
test_vector_carry = new int_radix_lut<Torus>(
stream, params, 1, total_block_count, test_vector_array);
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(stream, params, 2, total_block_count,
allocate_gpu_memory);
luts_message = new int_radix_lut<Torus>(stream, params, 1,
total_block_count, luts_array);
luts_carry = new int_radix_lut<Torus>(stream, params, 1, total_block_count,
luts_array);
auto lsb_acc = test_vector_array->get_lut(0);
auto msb_acc = test_vector_array->get_lut(1);
auto message_acc = test_vector_message->get_lut(0);
auto carry_acc = test_vector_carry->get_lut(0);
auto lsb_acc = luts_array->get_lut(0);
auto msb_acc = luts_array->get_lut(1);
auto message_acc = luts_message->get_lut(0);
auto carry_acc = luts_carry->get_lut(0);
// define functions for each accumulator
auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -624,12 +632,12 @@ template <typename Torus> struct int_mul_memory {
stream, msb_acc, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, lut_f_msb);
// tvi for test_vector_array should be reinitialized
// lut_indexes for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default tvi is fine
// for message and carry default lut_indexes is fine
cuda_set_value_async<Torus>(
&(stream->stream), test_vector_array->get_tvi(lsb_vector_block_count),
&(stream->stream), luts_array->get_lut_indexes(lsb_vector_block_count),
1, msb_vector_block_count);
}
@@ -637,17 +645,16 @@ template <typename Torus> struct int_mul_memory {
cuda_drop_async(vector_result_sb, stream);
cuda_drop_async(block_mul_res, stream);
cuda_drop_async(small_lwe_vector, stream);
cuda_drop_async(lwe_pbs_out_array, stream);
test_vector_array->release(stream);
test_vector_message->release(stream);
test_vector_carry->release(stream);
luts_array->release(stream);
luts_message->release(stream);
luts_carry->release(stream);
scp_mem->release(stream);
delete test_vector_array;
delete test_vector_message;
delete test_vector_carry;
delete luts_array;
delete luts_message;
delete luts_carry;
delete scp_mem;
}
@@ -674,19 +681,22 @@ template <typename Torus> struct int_shift_buffer {
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
tmp_rotated = (Torus *)cuda_malloc_async(
max_amount_of_pbs * big_lwe_size_bytes, stream);
(max_amount_of_pbs + 2) * big_lwe_size_bytes, stream);
cuda_memset_async(tmp_rotated, 0,
(max_amount_of_pbs + 2) * big_lwe_size_bytes, stream);
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
// LUT
// pregenerate lut vector and indexes
// lut for left shift
// here we generate 'num_bits_in_block' times test_vector
// here we generate 'num_bits_in_block' times lut
// one for each 'shift_within_block' = 'shift' % 'num_bits_in_block'
// even though test_vector_left contains 'num_bits_in_block' lut
// tvi will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// tvi filled with zeros
// even though lut_left contains 'num_bits_in_block' lut
// lut_indexes will have indexes for single lut only and those indexes
// will be 0 it means for pbs corresponding lut should be selected and
// pass along lut_indexes filled with zeros
// calculate bivariate lut for each 'shift_within_block'
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
@@ -737,34 +747,6 @@ template <typename Torus> struct int_shift_buffer {
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
// here we generate 'message_modulus' times test_vector
// one for each 'shift'
// tvi will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
// tvi filled with zeros
// calculate lut for each 'shift'
for (int shift = 0; shift < params.message_modulus; shift++) {
auto cur_lut =
new int_radix_lut<Torus>(stream, params, 1, 1, allocate_gpu_memory);
std::function<Torus(Torus)> shift_lut_f;
if (shift_type == LEFT_SHIFT)
shift_lut_f = [shift, params](Torus x) -> Torus {
return (x << shift) % params.message_modulus;
};
else
shift_lut_f = [shift, params](Torus x) -> Torus {
return (x >> shift) % params.message_modulus;
};
generate_device_accumulator<Torus>(
stream, cur_lut->lut, params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, shift_lut_f);
lut_buffers_univariate.push_back(cur_lut);
}
}
}
@@ -834,8 +816,6 @@ template <typename Torus> struct int_cmux_buffer {
if (allocate_gpu_memory) {
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus small_size =
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream);
tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream);
@@ -904,8 +884,10 @@ template <typename Torus> struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;
int_radix_lut<Torus> *is_max_value_lut;
int_radix_lut<Torus> *is_equal_to_num_blocks_lut;
// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), such as max_value (the maximum message
// value).
std::unordered_map<int, int_radix_lut<Torus> *> is_equal_to_lut_map;
Torus *tmp_block_accumulated;
@@ -923,34 +905,14 @@ template <typename Torus> struct int_are_all_block_true_buffer {
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * max_chunks * sizeof(Torus), stream);
// LUT
// We need three LUTs:
// (x & max_value as u64) == max_value
// x != 0
// (x & max_value as u64) == blocks.len()
auto is_max_value_lut_f = [total_modulus](Torus x) -> Torus {
Torus max_value = total_modulus - 1;
return (x & max_value) == max_value;
};
is_max_value_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
is_equal_to_num_blocks_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_max_value_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_max_value_lut_f);
}
}
void release(cuda_stream_t *stream) {
is_max_value_lut->release(stream);
delete is_max_value_lut;
is_equal_to_num_blocks_lut->release(stream);
delete is_equal_to_num_blocks_lut;
for (auto &lut : is_equal_to_lut_map) {
lut.second->release(stream);
}
is_equal_to_lut_map.clear();
cuda_drop_async(tmp_block_accumulated, stream);
}
@@ -965,6 +927,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
int_are_all_block_true_buffer<Torus> *are_all_block_true_buffer;
int_radix_lut<Torus> *scalar_comparison_luts;
int_comparison_eq_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
@@ -1007,6 +971,22 @@ template <typename Torus> struct int_comparison_eq_buffer {
stream, is_non_zero_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_non_zero_lut_f);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
stream, params, total_modulus, num_radix_blocks, allocate_gpu_memory);
for (int i = 0; i < total_modulus; i++) {
auto lut_f = [i, operator_f](Torus x) -> Torus {
return operator_f(i, x);
};
Torus *lut = scalar_comparison_luts->lut +
i * (params.glwe_dimension + 1) * params.polynomial_size;
generate_device_accumulator<Torus>(
stream, lut, params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f);
}
}
}
@@ -1018,6 +998,9 @@ template <typename Torus> struct int_comparison_eq_buffer {
are_all_block_true_buffer->release(stream);
delete are_all_block_true_buffer;
scalar_comparison_luts->release(stream);
delete scalar_comparison_luts;
}
};
@@ -1048,13 +1031,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
return msb;
};
auto last_leaf_noop_lut_f = [this](Torus x) -> Torus {
int msb = (x >> 2) & 3;
int lsb = x & 3;
return this->block_selector_f(msb, lsb);
};
if (allocate_gpu_memory) {
tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
@@ -1101,14 +1077,8 @@ template <typename Torus> struct int_comparison_diff_buffer {
std::function<Torus(Torus)> operator_f;
int_radix_lut<Torus> *is_zero_lut;
int_tree_sign_reduction_buffer<Torus> *tree_buffer;
// Used for scalar comparisons
cuda_stream_t *lsb_stream;
cuda_stream_t *msb_stream;
int_comparison_diff_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
@@ -1132,8 +1102,6 @@ template <typename Torus> struct int_comparison_diff_buffer {
};
if (allocate_gpu_memory) {
lsb_stream = cuda_create_stream(stream->gpu_index);
msb_stream = cuda_create_stream(stream->gpu_index);
Torus big_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
@@ -1143,36 +1111,17 @@ template <typename Torus> struct int_comparison_diff_buffer {
tmp_packed_right =
(Torus *)cuda_malloc_async(big_size * (num_radix_blocks / 2), stream);
// LUTs
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
return (x % total_modulus) == 0;
};
is_zero_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_zero_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_zero_f);
tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
stream, operator_f, params, num_radix_blocks, allocate_gpu_memory);
}
}
void release(cuda_stream_t *stream) {
is_zero_lut->release(stream);
delete is_zero_lut;
tree_buffer->release(stream);
delete tree_buffer;
cuda_drop_async(tmp_packed_left, stream);
cuda_drop_async(tmp_packed_right, stream);
cuda_destroy_stream(lsb_stream);
cuda_destroy_stream(msb_stream);
}
};
@@ -1185,15 +1134,24 @@ template <typename Torus> struct int_comparison_buffer {
int_radix_lut<Torus> *cleaning_lut;
std::function<Torus(Torus)> cleaning_lut_f;
int_radix_lut<Torus> *is_zero_lut;
int_comparison_eq_buffer<Torus> *eq_buffer;
int_comparison_diff_buffer<Torus> *diff_buffer;
Torus *tmp_block_comparisons;
Torus *tmp_lwe_array_out;
// Scalar EQ / NE
Torus *tmp_packed_input;
// Max Min
Torus *tmp_lwe_array_out;
int_cmux_buffer<Torus> *cmux_buffer;
// Used for scalar comparisons
cuda_stream_t *lsb_stream;
cuda_stream_t *msb_stream;
int_comparison_buffer(cuda_stream_t *stream, COMPARISON_TYPE op,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
@@ -1203,10 +1161,17 @@ template <typename Torus> struct int_comparison_buffer {
cleaning_lut_f = [](Torus x) -> Torus { return x; };
if (allocate_gpu_memory) {
lsb_stream = cuda_create_stream(stream->gpu_index);
msb_stream = cuda_create_stream(stream->gpu_index);
tmp_lwe_array_out = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
stream);
tmp_packed_input = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * 2 * num_radix_blocks * sizeof(Torus),
stream);
// Block comparisons
tmp_block_comparisons = (Torus *)cuda_malloc_async(
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus),
@@ -1221,6 +1186,19 @@ template <typename Torus> struct int_comparison_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleaning_lut_f);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
return (x % total_modulus) == 0;
};
is_zero_lut = new int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
generate_device_accumulator<Torus>(
stream, is_zero_lut->lut, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
is_zero_f);
switch (op) {
case COMPARISON_TYPE::MAX:
case COMPARISON_TYPE::MIN:
@@ -1264,8 +1242,14 @@ template <typename Torus> struct int_comparison_buffer {
break;
}
cleaning_lut->release(stream);
is_zero_lut->release(stream);
delete is_zero_lut;
cuda_drop_async(tmp_lwe_array_out, stream);
cuda_drop_async(tmp_block_comparisons, stream);
cuda_drop_async(tmp_packed_input, stream);
cuda_destroy_stream(lsb_stream);
cuda_destroy_stream(msb_stream);
}
};

View File

@@ -13,10 +13,6 @@ set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(
tfhe_cuda_backend
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tfhe_cuda_backend PUBLIC cudart OpenMP::OpenMP_CXX)
target_include_directories(tfhe_cuda_backend PRIVATE .)

View File

@@ -106,23 +106,23 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
cudaSetDevice(stream->gpu_index);
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_out + 1;
int lwe_size = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
if (lwe_size % ideal_threads == 0) {
lwe_lower = lwe_size / ideal_threads;
lwe_upper = lwe_size / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
int y = ceil((double)lwe_size / (double)ideal_threads) * ideal_threads -
lwe_size;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
lwe_lower = lwe_size / ideal_threads;
lwe_upper = (int)ceil((double)lwe_size / (double)ideal_threads);
}
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int lwe_size_after = lwe_size * num_samples;
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
int shared_mem = sizeof(Torus) * lwe_size;
cuda_memset_async(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, stream);
check_cuda_error(cudaGetLastError());
@@ -130,11 +130,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
// cudaFuncSetAttribute(keyswitch<Torus>,
// cudaFuncAttributeMaxDynamicSharedMemorySize,
// shared_mem);
keyswitch<<<grid, threads, shared_mem, stream->stream>>>(
keyswitch<Torus><<<grid, threads, shared_mem, stream->stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count, lwe_lower,
lwe_upper, cutoff);

View File

@@ -4,25 +4,21 @@
/// Unsafe function to create a CUDA stream, must check first that GPU exists
cuda_stream_t *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
check_cuda_error(cudaSetDevice(gpu_index));
cuda_stream_t *stream = new cuda_stream_t(gpu_index);
return stream;
}
/// Unsafe function to destroy CUDA stream, must check first the GPU exists
int cuda_destroy_stream(cuda_stream_t *stream) {
stream->release();
return 0;
}
void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); }
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
check_cuda_error(cudaSetDevice(gpu_index));
void *ptr;
cudaMalloc((void **)&ptr, size);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaMalloc((void **)&ptr, size));
return ptr;
}
@@ -30,7 +26,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
/// Allocates a size-byte array at the device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
void *ptr;
#ifndef CUDART_VERSION
@@ -52,184 +48,88 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) {
return ptr;
}
/// Checks that allocation is valid
/// 0: valid
/// -1: invalid, not enough memory in device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
/// Check that allocation is valid
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
if (size > free_mem) {
// error code: not enough memory
return -1;
PANIC("Cuda error: not enough memory on device. "
"Available: %zu vs Requested: %lu",
free_mem, size)
}
return 0;
}
/// Returns
/// -> 0 if Cooperative Groups is not supported.
/// -> 1 otherwise
int cuda_check_support_cooperative_groups() {
/// false if Cooperative Groups is not supported.
/// true otherwise
bool cuda_check_support_cooperative_groups() {
int cooperative_groups_supported = 0;
cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0);
check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0));
return cooperative_groups_supported > 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream));
return 0;
}
/// Tries to copy memory to the GPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) {
if (size == 0) {
// error code: zero copy size
return -3;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice));
return 0;
}
/// Tries to copy memory to the CPU synchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) {
if (size == 0) {
// error code: zero copy size
return -3;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost));
return 0;
}
/// Tries to copy memory within a GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
/// Copy memory within a GPU asynchronously
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr_dest;
cudaPointerGetAttributes(&attr_dest, dest);
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
if (attr_dest.device != stream->gpu_index &&
attr_dest.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
}
cudaPointerAttributes attr_src;
cudaPointerGetAttributes(&attr_src, src);
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
if (attr_src.device != stream->gpu_index &&
attr_src.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
}
if (attr_src.device != attr_dest.device) {
// error code: different devices
return -1;
PANIC("Cuda error: different devices specified in copy from GPU to GPU.")
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice,
stream->stream));
return 0;
}
/// Synchronizes device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceSynchronize();
return 0;
void cuda_synchronize_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaDeviceSynchronize());
}
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream));
return 0;
}
template <typename Torus>
@@ -242,12 +142,18 @@ __global__ void cuda_set_value_kernel(Torus *array, Torus value, Torus n) {
template <typename Torus>
void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value,
Torus n) {
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, d_array));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda set value.")
}
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, *stream>>>(d_array, value,
n);
check_cuda_error(cudaGetLastError());
}
/// Explicitly instantiate cuda_set_value_async for 32 and 64 bits
@@ -256,57 +162,39 @@ template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array,
template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array,
uint32_t value, uint32_t n);
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (stream->gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
/// Copy memory to the CPU asynchronously
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cuda_stream_t *stream) {
if (size == 0)
return;
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream));
return 0;
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
cudaGetDeviceCount(&num_gpus);
check_cuda_error(cudaGetDeviceCount(&num_gpus));
return num_gpus;
}
/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
check_cuda_error(cudaFree(ptr));
return 0;
}
/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
/// Drop a cuda array asynchronously, if supported on the device
void cuda_drop_async(void *ptr, cuda_stream_t *stream) {
cudaSetDevice(stream->gpu_index);
check_cuda_error(cudaSetDevice(stream->gpu_index));
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
@@ -323,18 +211,13 @@ int cuda_drop_async(void *ptr, cuda_stream_t *stream) {
#else
check_cuda_error(cudaFree(ptr));
#endif
return 0;
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
check_cuda_error(cudaSetDevice(gpu_index));
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu_index);
check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index));
int max_shared_memory = 0;
if (prop.major >= 6) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
@@ -344,7 +227,16 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
return max_shared_memory;
}
int cuda_synchronize_stream(cuda_stream_t *stream) {
stream->synchronize();
return 0;
void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); }
/// Registers `callback` on the CUDA stream wrapped by `stream`, forwarding
/// `user_data` to it unchanged. Any error reported by the CUDA runtime is
/// routed through check_cuda_error.
void cuda_stream_add_callback(cuda_stream_t *stream,
                              cudaStreamCallback_t callback, void *user_data) {
  // The last argument of cudaStreamAddCallback is a reserved flags field;
  // the CUDA runtime documents that it must be 0.
  constexpr unsigned int reserved_flags = 0;
  check_cuda_error(cudaStreamAddCallback(stream->stream, callback, user_data,
                                         reserved_flags));
}
/// Stream callback that releases a host allocation with free().
///
/// The parameter list matches cudaStreamCallback_t so the function can be
/// passed to cuda_stream_add_callback; only `host_pointer` is used.
void host_free_on_stream_callback(cudaStream_t stream, cudaError_t status,
                                  void *host_pointer) {
  // Neither the stream handle nor the reported status is consulted here.
  (void)stream;
  (void)status;
  free(host_pointer);
}

View File

@@ -44,6 +44,7 @@ __host__ void scratch_cuda_integer_radix_bitop_kb(
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_bitop_buffer<Torus>(stream, op, params, num_radix_blocks,
allocate_gpu_memory);
}

View File

@@ -10,6 +10,7 @@ __host__ void zero_out_if(cuda_stream_t *stream, Torus *lwe_array_out,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
int big_lwe_size = params.big_lwe_dimension + 1;
@@ -94,6 +95,7 @@ __host__ void scratch_cuda_integer_radix_cmux_kb(
std::function<Torus(Torus)> predicate_lut_f, uint32_t num_radix_blocks,
int_radix_params params, bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_cmux_buffer<Torus>(stream, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory);
}

View File

@@ -70,7 +70,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
static_cast<uint64_t *>(ksk), lwe_ciphertext_count);
break;
default:
printf("Not implemented\n");
PANIC("Cuda error: integer operation not supported")
}
}

View File

@@ -37,6 +37,7 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
Torus *input, uint32_t lwe_dimension,
uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
@@ -46,6 +47,13 @@ __host__ void accumulate_all_blocks(cuda_stream_t *stream, Torus *output,
check_cuda_error(cudaGetLastError());
}
/* This takes an array of lwe ciphertexts, where each is an encryption of
* either 0 or 1.
*
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if all input
* blocks are 1 otherwise the block encrypts 0
*
*/
template <typename Torus>
__host__ void
are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
@@ -53,6 +61,7 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -90,29 +99,34 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
input_blocks += (big_lwe_dimension + 1) * chunk_length;
}
accumulator = are_all_block_true_buffer->tmp_block_accumulated;
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;
// Selects a LUT
int_radix_lut<Torus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
} else if (chunk_length == max_value) {
// is_max_value LUT
lut = are_all_block_true_buffer->is_max_value_lut;
} else {
// is_equal_to_num_blocks LUT
lut = are_all_block_true_buffer->is_equal_to_num_blocks_lut;
if (chunk_length != lut_num_blocks) {
if ((*is_equal_to_num_blocks_map).find(chunk_length) !=
(*is_equal_to_num_blocks_map).end()) {
// The LUT is already computed
lut = (*is_equal_to_num_blocks_map)[chunk_length];
} else {
// LUT needs to be computed
auto new_lut = new int_radix_lut<Torus>(stream, params, max_value,
num_radix_blocks, true);
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
};
generate_device_accumulator<Torus>(
stream, lut->lut, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, is_equal_to_num_blocks_lut_f);
stream, new_lut->lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, is_equal_to_num_blocks_lut_f);
// We don't have to generate this lut again
lut_num_blocks = chunk_length;
(*is_equal_to_num_blocks_map)[chunk_length] = new_lut;
lut = new_lut;
}
}
@@ -122,6 +136,60 @@ are_all_comparisons_block_true(cuda_stream_t *stream, Torus *lwe_array_out,
}
}
/* This takes an array of lwe ciphertexts, where each is an encryption of
 * either 0 or 1.
 *
 * It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
 * one input ciphertext encrypts 1 otherwise encrypts 0
 *
 * The input is first copied into lwe_array_out, which is then reduced in
 * place: on each pass the remaining blocks are split into chunks of at most
 * `max_value` blocks, each chunk is summed into one block of
 * tmp_block_accumulated, and the is_non_zero LUT maps every sum back to 0/1.
 * The passes repeat until a single block remains.
 *
 * lwe_array_out must be able to hold num_radix_blocks blocks, since it is used
 * as scratch space for the reduction.
 *
 * NOTE(review): the loop only makes progress when max_value >= 2 (a chunk of
 * length 1 does not reduce the number of remaining blocks) - confirm that the
 * parameter sets in use always satisfy message_modulus * carry_modulus >= 3.
 */
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
    uint32_t num_radix_blocks) {

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;
  auto carry_modulus = params.carry_modulus;

  auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;

  uint32_t total_modulus = message_modulus * carry_modulus;
  uint32_t max_value = total_modulus - 1;

  cuda_memcpy_async_gpu_to_gpu(
      lwe_array_out, lwe_array_in,
      num_radix_blocks * (big_lwe_dimension + 1) * sizeof(Torus), stream);

  uint32_t remaining_blocks = num_radix_blocks;
  while (remaining_blocks > 1) {
    // Split in chunks of at most max_value blocks. The division rounds up so
    // that a trailing, shorter chunk is reduced as well instead of being
    // silently dropped when the block count is not a multiple of max_value.
    uint32_t blocks_this_pass = remaining_blocks;
    int num_chunks = (blocks_this_pass + max_value - 1) / max_value;

    // Since all blocks encrypt either 0 or 1, we can sum max_value of them
    // as in the worst case we will be adding `max_value` ones
    auto input_blocks = lwe_array_out;
    auto accumulator = buffer->tmp_block_accumulated;
    for (int i = 0; i < num_chunks; i++) {
      // Every chunk is full except possibly the last one
      uint32_t chunk_length =
          std::min(max_value, blocks_this_pass - i * max_value);

      accumulate_all_blocks(stream, accumulator, input_blocks,
                            big_lwe_dimension, chunk_length);

      accumulator += (big_lwe_dimension + 1);
      // chunk_length blocks collapse into a single one
      remaining_blocks -= (chunk_length - 1);
      input_blocks += (big_lwe_dimension + 1) * chunk_length;
    }
    accumulator = buffer->tmp_block_accumulated;

    // Selects a LUT
    int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;

    // Applies the LUT: the num_chunks sums become the blocks of the next pass
    integer_radix_apply_univariate_lookup_table_kb<Torus>(
        stream, lwe_array_out, accumulator, bsk, ksk, num_chunks, lut);
  }
}
// This takes an input slice of blocks.
//
// Each block can encrypt any value as long as it is < message_modulus.
@@ -145,8 +213,9 @@ template <typename Torus>
__host__ void host_compare_with_zero_equality(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
int32_t num_radix_blocks) {
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -175,7 +244,6 @@ __host__ void host_compare_with_zero_equality(
num_sum_blocks = 1;
} else {
uint32_t remainder_blocks = num_radix_blocks;
auto sum_i = sum;
auto chunk = lwe_array_in;
while (remainder_blocks > 1) {
@@ -194,9 +262,8 @@ __host__ void host_compare_with_zero_equality(
}
}
auto is_equal_to_zero_lut = mem_ptr->diff_buffer->is_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, sum, sum, bsk, ksk, num_sum_blocks, is_equal_to_zero_lut);
stream, sum, sum, bsk, ksk, num_sum_blocks, zero_comparison);
are_all_comparisons_block_true(stream, lwe_array_out, sum, mem_ptr, bsk, ksk,
num_sum_blocks);
@@ -243,6 +310,7 @@ __host__ void scratch_cuda_integer_radix_equality_check_kb(
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_comparison_buffer<Torus>(
stream, op, params, num_radix_blocks, allocate_gpu_memory);
}
@@ -302,6 +370,7 @@ tree_sign_reduction(cuda_stream_t *stream, Torus *lwe_array_out,
std::function<Torus(Torus)> sign_handler_f, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;

View File

@@ -59,7 +59,9 @@ void cuda_full_propagation_64_inplace(
ks_level, pbs_base_log, pbs_level, grouping_factor, num_blocks);
break;
default:
break;
PANIC("Cuda error (full propagation inplace): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..16384].")
}
}
@@ -86,10 +88,21 @@ void cleanup_cuda_full_propagation(cuda_stream_t *stream,
cuda_drop_async(mem_ptr->lut_buffer, stream);
cuda_drop_async(mem_ptr->lut_indexes, stream);
cuda_drop_async(mem_ptr->pbs_buffer, stream);
cuda_drop_async(mem_ptr->tmp_small_lwe_vector, stream);
cuda_drop_async(mem_ptr->tmp_big_lwe_vector, stream);
switch (mem_ptr->pbs_type) {
case LOW_LAT: {
auto x = (pbs_buffer<uint64_t, LOW_LAT> *)(mem_ptr->pbs_buffer);
x->release(stream);
} break;
case MULTI_BIT: {
auto x = (pbs_buffer<uint64_t, MULTI_BIT> *)(mem_ptr->pbs_buffer);
x->release(stream);
} break;
default:
PANIC("Cuda error (PBS): unsupported implementation variant.")
}
}
void scratch_cuda_propagate_single_carry_low_latency_kb_64_inplace(

View File

@@ -1,89 +1,17 @@
#ifndef CUDA_INTEGER_CUH
#define CUDA_INTEGER_CUH
#include "bootstrap.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/scalar_addition.cuh"
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include <functional>
// Runs a programmable bootstrap (PBS) by forwarding to the entry point that
// matches both the torus width and the requested PBS variant.
//
// The torus width is derived from sizeof(Torus): a Torus as wide as uint32_t
// selects the `_32` entry points, any other width selects the `_64` ones.
// Every argument is forwarded untouched. Note that the entry points receive
// `lwe_dimension` before `glwe_dimension` (the reverse of this function's own
// parameter order) and that only the 64-bit multi-bit entry point receives
// `grouping_factor`.
//
// There is no 32-bit multi-bit entry point: such a request only prints a
// message on stdout and computes nothing. An unrecognised `pbs_type` is
// ignored without any diagnostic.
template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
                 Torus *lwe_output_indexes, Torus *lut_vector,
                 Torus *lut_vector_indexes, Torus *lwe_array_in,
                 Torus *lwe_input_indexes, void *bootstrapping_key,
                 int8_t *pbs_buffer, uint32_t glwe_dimension,
                 uint32_t lwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count,
                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
                 uint32_t num_lut_vectors, uint32_t lwe_idx,
                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
  // Width selector shared by every variant below.
  const bool is_32_bits = (sizeof(Torus) == sizeof(uint32_t));

  switch (pbs_type) {
  case MULTI_BIT:
    if (is_32_bits) {
      // No 32-bit implementation exists for the multi-bit variant.
      printf("multibit\n");
      printf("Error: 32-bit multibit PBS is not supported.\n");
    } else {
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, grouping_factor, base_log, level_count,
          input_lwe_ciphertext_count, num_lut_vectors, lwe_idx,
          max_shared_memory);
    }
    break;

  case LOW_LAT:
    if (is_32_bits) {
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
    } else {
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
    }
    break;

  case AMORTIZED:
    if (is_32_bits) {
      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
    } else {
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_lut_vectors, lwe_idx, max_shared_memory);
    }
    break;

  default:
    break;
  }
}
// function rotates right radix ciphertext with specific value
// grid is one dimensional
// blockIdx.x represents x_th block of radix ciphertext
@@ -155,6 +83,7 @@ __host__ void pack_bivariate_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
uint32_t message_modulus,
uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
// Left message is shifted
int num_blocks = 0, num_threads = 0;
int num_entries = num_radix_blocks * (lwe_dimension + 1);
@@ -169,6 +98,7 @@ template <typename Torus>
__host__ void integer_radix_apply_univariate_lookup_table_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in, void *bsk,
Torus *ksk, uint32_t num_radix_blocks, int_radix_lut<Torus> *lut) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
@@ -188,12 +118,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lut->lwe_indexes, ksk, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
execute_pbs(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes, bsk,
lut->pbs_buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
execute_pbs<Torus>(stream, lwe_array_out, lut->lwe_indexes, lut->lut,
lut->lut_indexes, lut->tmp_lwe_after_ks, lut->lwe_indexes,
bsk, lut->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, grouping_factor,
num_radix_blocks, 1, 0,
cuda_get_max_shared_memory(stream->gpu_index), pbs_type);
}
template <typename Torus>
@@ -201,6 +131,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_1,
Torus *lwe_array_2, void *bsk, Torus *ksk, uint32_t num_radix_blocks,
int_radix_lut<Torus> *lut) {
cudaSetDevice(stream->gpu_index);
// apply_lookup_table_bivariate
auto params = lut->params;
@@ -303,13 +234,13 @@ void generate_device_accumulator_bivariate(
generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
// copy host lut and tvi to device
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
/*
@@ -335,13 +266,13 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
// copy host lut and tvi to device
// copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);
cuda_synchronize_stream(stream);
free(h_lut);
// Release memory when possible
cuda_stream_add_callback(stream, host_free_on_stream_callback, h_lut);
}
template <typename Torus>
@@ -363,20 +294,19 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;
auto test_vector_array = mem->test_vector_array;
auto lut_carry_propagation_sum = mem->lut_carry_propagation_sum;
auto luts_array = mem->luts_array;
auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
auto message_acc = mem->message_acc;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
test_vector_array);
luts_array);
// compute prefix sum with hillis&steele
@@ -392,7 +322,7 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
lut_carry_propagation_sum);
luts_carry_propagation_sum);
cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
@@ -414,10 +344,9 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
/*
* input_blocks: input radix ciphertext propagation will happen inplace
* acc_message_carry: list of two lut s, [(message_acc), (carry_acc)]
* tvi_message_carry: tvi for message and carry, should always be {0, 1}
* small_lwe_vector: output of keyswitch should have
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
* big_lwe_vector: output of pbs should have
* lut_indexes_message_carry: lut_indexes for message and carry, should always
* be {0, 1}
* small_lwe_vector: output of keyswitch should have
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
* big_lwe_vector: output of pbs should have
* size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
*/
template <typename Torus, typename STorus, class params>
@@ -474,31 +403,12 @@ void scratch_cuda_full_propagation(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
// PBS
int8_t *pbs_buffer;
if (pbs_type == MULTI_BIT) {
uint32_t lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, pbs_level, glwe_dimension, num_radix_blocks);
// Only 64 bits is supported
scratch_cuda_multi_bit_pbs_64(stream, &pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level,
grouping_factor, num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory, lwe_chunk_size);
} else {
// Classic
// We only use low latency for classic mode
if (sizeof(Torus) == sizeof(uint32_t))
scratch_cuda_bootstrap_low_latency_32(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
else
scratch_cuda_bootstrap_low_latency_64(
stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
num_radix_blocks, cuda_get_max_shared_memory(stream->gpu_index),
allocate_gpu_memory);
}
execute_scratch_pbs<Torus>(stream, &pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, pbs_level, grouping_factor,
num_radix_blocks,
cuda_get_max_shared_memory(stream->gpu_index),
pbs_type, allocate_gpu_memory);
// LUT
Torus *lut_buffer;
@@ -551,8 +461,8 @@ void scratch_cuda_full_propagation(
h_lwe_indexes[i] = i;
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
stream);
cuda_synchronize_stream(stream);
free(h_lwe_indexes);
cuda_stream_add_callback(stream, host_free_on_stream_callback,
h_lwe_indexes);
}
// Temporary arrays
@@ -621,7 +531,11 @@ template <typename Torus>
__host__ void pack_blocks(cuda_stream_t *stream, Torus *lwe_array_out,
Torus *lwe_array_in, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t factor) {
assert(lwe_array_out != lwe_array_in);
if (lwe_array_out == lwe_array_in)
PANIC("Cuda error in pack blocks: input and output pointers must be "
"different.");
cudaSetDevice(stream->gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
@@ -651,6 +565,7 @@ create_trivial_radix(cuda_stream_t *stream, Torus *lwe_array_out,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks,
uint64_t message_modulus, uint64_t carry_modulus) {
cudaSetDevice(stream->gpu_index);
size_t radix_size = (lwe_dimension + 1) * num_radix_blocks;
cuda_memset_async(lwe_array_out, 0, radix_size * sizeof(Torus), stream);

View File

@@ -24,7 +24,8 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
allocate_gpu_memory);
break;
default:
break;
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
}
}
@@ -75,7 +76,8 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
num_blocks);
break;
default:
break;
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
"Only N = 2048 is supported")
}
}

View File

@@ -7,15 +7,11 @@
#endif
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer.h"
#include "integer/integer.cuh"
#include "linear_algebra.h"
#include "pbs/bootstrap_amortized.cuh"
#include "pbs/bootstrap_low_latency.cuh"
#include "pbs/bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <fstream>
@@ -82,6 +78,7 @@ void compress_device_array_with_map(cuda_stream_t *stream, Torus *src,
Torus *dst, int *S, int *F, int num_blocks,
uint32_t map_size, uint32_t unit_size,
int &total_copied, bool is_message) {
cudaSetDevice(stream->gpu_index);
for (int i = 0; i < map_size; i++) {
int s_index = i * num_blocks + S[i];
int number_of_unit = F[i] - S[i] + is_message;
@@ -100,6 +97,7 @@ void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src,
int &total_copied,
int &total_radix_copied,
int num_blocks, bool is_message) {
cudaSetDevice(stream->gpu_index);
size_t radix_size = unit_size * num_blocks;
for (int i = 0; i < map_size; i++) {
auto cur_dst_radix = &dst[total_radix_copied * radix_size];
@@ -227,6 +225,7 @@ __host__ void host_integer_mult_radix_kb(
uint64_t *radix_lwe_right, void *bsk, uint64_t *ksk,
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
cudaSetDevice(stream->gpu_index);
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto lwe_dimension = mem_ptr->params.small_lwe_dimension;
@@ -277,26 +276,21 @@ __host__ void host_integer_mult_radix_kb(
// lwe_dimension +1 coefficients
auto small_lwe_vector = mem_ptr->small_lwe_vector;
// buffer to keep pbs result for num_blocks^2 lwe_ciphertext
// in total it has num_blocks^2 big lwe ciphertexts with
// glwe_dimension * polynomial_size + 1 coefficients
auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array;
// it contains two test vector, first for lsb extraction,
// it contains two lut, first for lsb extraction,
// second for msb extraction, with total length =
// 2 * (glwe_dimension + 1) * polynomial_size
auto test_vector_array = mem_ptr->test_vector_array;
auto luts_array = mem_ptr->luts_array;
// accumulator to extract message
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_message = mem_ptr->test_vector_message;
auto luts_message = mem_ptr->luts_message;
// accumulator to extract carry
// with length (glwe_dimension + 1) * polynomial_size
auto test_vector_carry = mem_ptr->test_vector_carry;
auto luts_carry = mem_ptr->luts_carry;
// to be used as default indexing
auto lwe_indexes = test_vector_array->lwe_indexes;
auto lwe_indexes = luts_array->lwe_indexes;
auto vector_result_lsb = &vector_result_sb[0];
auto vector_result_msb =
@@ -316,7 +310,7 @@ __host__ void host_integer_mult_radix_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, block_mul_res, block_mul_res, vector_result_sb, bsk, ksk,
total_block_count, test_vector_array);
total_block_count, luts_array);
vector_result_lsb = &block_mul_res[0];
vector_result_msb = &block_mul_res[lsb_vector_block_count *
@@ -408,19 +402,19 @@ __host__ void host_integer_mult_radix_kb(
polynomial_size * glwe_dimension, lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_copied);
execute_pbs<Torus>(
stream, message_blocks_vector, lwe_indexes, test_vector_message->lut,
test_vector_message->lut_indexes, small_lwe_vector, lwe_indexes, bsk,
test_vector_message->pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
message_count, 1, 0, max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, message_blocks_vector, lwe_indexes,
luts_message->lut, luts_message->lut_indexes,
small_lwe_vector, lwe_indexes, bsk, luts_message->buffer,
glwe_dimension, lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, message_count, 1, 0,
max_shared_memory, mem_ptr->params.pbs_type);
execute_pbs<Torus>(stream, carry_blocks_vector, lwe_indexes,
test_vector_carry->lut, test_vector_carry->lut_indexes,
luts_carry->lut, luts_carry->lut_indexes,
&small_lwe_vector[message_count * (lwe_dimension + 1)],
lwe_indexes, bsk, test_vector_carry->pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size,
lwe_indexes, bsk, luts_carry->buffer, glwe_dimension,
lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, carry_count, 1, 0,
max_shared_memory, mem_ptr->params.pbs_type);
@@ -455,15 +449,15 @@ __host__ void host_integer_mult_radix_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, vector_result_sb, radix_lwe_out, bsk, ksk, num_blocks,
test_vector_message);
luts_message);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, &block_mul_res[big_lwe_size], radix_lwe_out, bsk, ksk, num_blocks,
test_vector_carry);
luts_carry);
cuda_memset_async(block_mul_res, 0, big_lwe_size * sizeof(Torus), stream);
host_addition(stream, radix_lwe_out, vector_result_sb, block_mul_res,
big_lwe_size, num_blocks);
big_lwe_dimension, num_blocks);
host_propagate_single_carry_low_latency<Torus>(
stream, radix_lwe_out, mem_ptr->scp_mem, bsk, ksk, num_blocks);
@@ -474,6 +468,7 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
cuda_stream_t *stream, int_mul_memory<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_mul_memory<Torus>(stream, params, num_radix_blocks,
allocate_gpu_memory);
}
@@ -544,16 +539,16 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
// when message and carry have to be extracted
// for first message_count blocks we need message_acc
// for last carry_count blocks we need carry_acc
Torus *cur_tvi;
Torus *cur_lut_indexes;
if (lsb_msb_mode) {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_lsb_multi_gpu[i]
: mem_ptr->tvi_msb_multi_gpu[i];
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_lsb_multi_gpu[i]
: mem_ptr->lut_indexes_msb_multi_gpu[i];
} else {
cur_tvi = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->tvi_message_multi_gpu[i]
: mem_ptr->tvi_carry_multi_gpu[i];
cur_lut_indexes = (big_lwe_start_index < lsb_message_blocks_count)
? mem_ptr->lut_indexes_message_multi_gpu[i]
: mem_ptr->lut_indexes_carry_multi_gpu[i];
}
// execute keyswitch on a current gpu with corresponding input and output
@@ -568,7 +563,7 @@ void apply_lookup_table(Torus *input_ciphertexts, Torus *output_ciphertexts,
// execute pbs on a current gpu with corresponding input and output
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
this_stream, i, mem_ptr->pbs_output_multi_gpu[i],
mem_ptr->test_vector_multi_gpu[i], cur_tvi,
mem_ptr->lut_multi_gpu[i], cur_lut_indexes,
mem_ptr->pbs_input_multi_gpu[i], mem_ptr->bsk_multi_gpu[i],
mem_ptr->pbs_buffer_multi_gpu[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, pbs_base_log, pbs_level,

View File

@@ -11,6 +11,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
int_bitop_buffer<Torus> *mem_ptr, void *bsk, Torus *ksk,
uint32_t num_radix_blocks, BITOP_TYPE op) {
cudaSetDevice(stream->gpu_index);
auto lut = mem_ptr->lut;
auto params = lut->params;
auto big_lwe_dimension = params.big_lwe_dimension;
@@ -19,7 +20,6 @@ __host__ void host_integer_radix_scalar_bitop_kb(
if (num_clear_blocks == 0) {
if (op == SCALAR_BITAND) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out, 0,
num_radix_blocks * lwe_size * sizeof(Torus), stream);
} else {
@@ -28,7 +28,6 @@ __host__ void host_integer_radix_scalar_bitop_kb(
stream);
}
} else {
auto lut_buffer = lut->lut;
// We have all possible LUTs pre-computed and we use the decomposed scalar
// as index to recover the right one
cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks,
@@ -38,7 +37,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks,
lut);
if (op == SCALAR_BITAND) {
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size;
cuda_memset_async(lwe_array_out_block, 0,
(num_radix_blocks - num_clear_blocks) * lwe_size *

View File

@@ -8,17 +8,14 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
switch (buffer->op) {
// case EQ:
// case NE:
// host_integer_radix_equality_check_kb<uint64_t>(
// stream, static_cast<uint64_t *>(lwe_array_out),
// static_cast<uint64_t *>(lwe_array_1),
// static_cast<uint64_t *>(lwe_array_2), buffer, bsk,
// static_cast<uint64_t *>(ksk), glwe_dimension, polynomial_size,
// big_lwe_dimension, small_lwe_dimension, ks_level, ks_base_log,
// pbs_level, pbs_base_log, grouping_factor, lwe_ciphertext_count,
// message_modulus, carry_modulus);
// break;
case EQ:
case NE:
host_integer_radix_scalar_equality_check_kb<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(scalar_blocks), buffer, bsk,
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
break;
case GT:
case GE:
case LT:
@@ -39,6 +36,6 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
static_cast<uint64_t *>(ksk), lwe_ciphertext_count, num_scalar_blocks);
break;
default:
printf("Not implemented\n");
PANIC("Cuda error: integer operation not supported")
}
}

View File

@@ -11,6 +11,7 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
std::function<Torus(Torus)> sign_handler_f, void *bsk, Torus *ksk,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -46,9 +47,9 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero
// means scalar is zero
host_compare_with_zero_equality(stream, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsk, ksk,
total_num_radix_blocks);
host_compare_with_zero_equality(
stream, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsk, ksk,
total_num_radix_blocks, mem_ptr->is_zero_lut);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -84,8 +85,8 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
auto lwe_array_msb_out = lwe_array_lsb_out + big_lwe_size;
cuda_synchronize_stream(stream);
auto lsb_stream = diff_buffer->lsb_stream;
auto msb_stream = diff_buffer->msb_stream;
auto lsb_stream = mem_ptr->lsb_stream;
auto msb_stream = mem_ptr->msb_stream;
#pragma omp parallel sections
{
@@ -128,8 +129,8 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
//////////////
// msb
host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
mem_ptr, bsk, ksk,
num_msb_radix_blocks);
mem_ptr, bsk, ksk, num_msb_radix_blocks,
mem_ptr->is_zero_lut);
}
}
cuda_synchronize_stream(lsb_stream);
@@ -209,17 +210,9 @@ scalar_compare_radix_blocks_kb(cuda_stream_t *stream, Torus *lwe_array_out,
int_comparison_buffer<Torus> *mem_ptr, void *bsk,
Torus *ksk, uint32_t num_radix_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
@@ -267,6 +260,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
Torus *ksk, uint32_t total_num_radix_blocks,
uint32_t total_num_scalar_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
@@ -295,4 +289,115 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
stream, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsk, ksk, total_num_radix_blocks);
}
// Equality (EQ) / inequality (NE) check between a radix ciphertext and a
// clear scalar given as `scalar_blocks`. The operation is read from
// `mem_ptr->op`; any other value panics.
//
// The input is split in two parts that are processed on two dedicated streams
// (`mem_ptr->lsb_stream` / `mem_ptr->msb_stream`) from two OpenMP sections:
//   - lsb: the first `num_lsb_radix_blocks` ciphertext blocks, i.e. those that
//     face a scalar block. Ciphertext blocks and scalar blocks both go through
//     `pack_blocks` with `message_modulus` as factor, the packed scalar values
//     are copied into the LUT index array of `scalar_comparison_luts`, and the
//     LUTs are applied to the packed ciphertext blocks.
//     NOTE(review): presumably each packed scalar value selects the LUT that
//     compares against that value -- confirm against the LUT construction.
//   - msb: the remaining `num_msb_radix_blocks` ciphertext blocks, which are
//     compared with zero (`is_zero_lut` for EQ, `is_non_zero_lut` for NE).
//
// Both partial results are written contiguously in
// `mem_ptr->tmp_lwe_array_out` and then reduced into `lwe_array_out` with
// `are_all_comparisons_block_true` (EQ) or
// `is_at_least_one_comparisons_block_true` (NE).
//
// On return only the first block of `lwe_array_out` carries the result; the
// other `num_radix_blocks - 1` blocks are set to zero.
template <typename Torus>
__host__ void host_integer_radix_scalar_equality_check_kb(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_array_in,
    Torus *scalar_blocks, int_comparison_buffer<Torus> *mem_ptr, void *bsk,
    Torus *ksk, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {

  // Select the device of the calling stream before issuing any work, as the
  // other host entry points of this file do.
  cudaSetDevice(stream->gpu_index);

  auto params = mem_ptr->params;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto message_modulus = params.message_modulus;

  auto eq_buffer = mem_ptr->eq_buffer;

  size_t big_lwe_size = big_lwe_dimension + 1;
  size_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

  auto scalar_comparison_luts = eq_buffer->scalar_comparison_luts;

  // ceil(num_scalar_blocks / 2)
  uint32_t num_halved_scalar_blocks =
      (num_scalar_blocks / 2) + (num_scalar_blocks % 2);

  uint32_t num_lsb_radix_blocks =
      std::min(num_radix_blocks, 2 * num_halved_scalar_blocks);
  uint32_t num_msb_radix_blocks = num_radix_blocks - num_lsb_radix_blocks;
  // ceil(num_lsb_radix_blocks / 2)
  uint32_t num_halved_lsb_radix_blocks =
      (num_lsb_radix_blocks / 2) + (num_lsb_radix_blocks % 2);

  auto lsb = lwe_array_in;
  auto msb = lwe_array_in + big_lwe_size * num_lsb_radix_blocks;

  // The msb result is stored right after the lsb results so that the final
  // reduction can read both from one contiguous array.
  auto lwe_array_lsb_out = mem_ptr->tmp_lwe_array_out;
  auto lwe_array_msb_out =
      lwe_array_lsb_out + big_lwe_size * num_halved_lsb_radix_blocks;

  // Wait for pending work on the caller's stream before the two dedicated
  // streams start reading the inputs.
  cuda_synchronize_stream(stream);

  auto lsb_stream = mem_ptr->lsb_stream;
  auto msb_stream = mem_ptr->msb_stream;

#pragma omp parallel sections
  {
    // Both sections may be executed in parallel
#pragma omp section
    {
      //////////////
      // lsb
      if (num_halved_scalar_blocks > 0) {
        auto packed_blocks = mem_ptr->tmp_packed_input;
        auto packed_scalar =
            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;

        pack_blocks(lsb_stream, packed_blocks, lsb, big_lwe_dimension,
                    num_lsb_radix_blocks, message_modulus);
        // Scalar blocks are clear values: lwe_dimension = 0, i.e. one Torus
        // element per block.
        pack_blocks(lsb_stream, packed_scalar, scalar_blocks, 0,
                    num_scalar_blocks, message_modulus);

        cuda_memcpy_async_gpu_to_gpu(
            scalar_comparison_luts->lut_indexes, packed_scalar,
            num_halved_scalar_blocks * sizeof(Torus), lsb_stream);

        integer_radix_apply_univariate_lookup_table_kb(
            lsb_stream, lwe_array_lsb_out, packed_blocks, bsk, ksk,
            num_halved_lsb_radix_blocks, scalar_comparison_luts);
      }
    }
#pragma omp section
    {
      //////////////
      // msb
      if (num_msb_radix_blocks > 0) {
        int_radix_lut<Torus> *msb_lut;
        switch (mem_ptr->op) {
        case COMPARISON_TYPE::EQ:
          msb_lut = mem_ptr->is_zero_lut;
          break;
        case COMPARISON_TYPE::NE:
          msb_lut = eq_buffer->is_non_zero_lut;
          break;
        default:
          PANIC("Cuda error: integer operation not supported")
        }

        host_compare_with_zero_equality(msb_stream, lwe_array_msb_out, msb,
                                        mem_ptr, bsk, ksk, num_msb_radix_blocks,
                                        msb_lut);
      }
    }
  }

  // Join both streams before reducing on the caller's stream.
  cuda_synchronize_stream(lsb_stream);
  cuda_synchronize_stream(msb_stream);

  // One block per packed scalar block, plus one for the msb part if present.
  uint32_t num_comparison_blocks =
      num_halved_scalar_blocks + (num_msb_radix_blocks > 0);

  switch (mem_ptr->op) {
  case COMPARISON_TYPE::EQ:
    are_all_comparisons_block_true(stream, lwe_array_out, lwe_array_lsb_out,
                                   mem_ptr, bsk, ksk, num_comparison_blocks);
    break;
  case COMPARISON_TYPE::NE:
    is_at_least_one_comparisons_block_true(stream, lwe_array_out,
                                           lwe_array_lsb_out, mem_ptr, bsk, ksk,
                                           num_comparison_blocks);
    break;
  default:
    PANIC("Cuda error: integer operation not supported")
  }

  // The result is in the first block only. Every other block of the output
  // is cleared.
  if (num_radix_blocks > 1)
    cuda_memset_async(lwe_array_out + big_lwe_size, 0,
                      big_lwe_size_bytes * (num_radix_blocks - 1), stream);
}
#endif

View File

@@ -30,6 +30,7 @@ __host__ void scratch_cuda_integer_radix_scalar_rotate_kb(
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
num_radix_blocks, allocate_gpu_memory);
}
@@ -39,6 +40,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
cuda_stream_t *stream, Torus *lwe_array, uint32_t n,
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;

View File

@@ -17,6 +17,7 @@ __host__ void scratch_cuda_integer_radix_scalar_shift_kb(
uint32_t num_radix_blocks, int_radix_params params, SHIFT_TYPE shift_type,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
*mem_ptr = new int_shift_buffer<Torus>(stream, shift_type, params,
num_radix_blocks, allocate_gpu_memory);
}
@@ -26,6 +27,7 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
cuda_stream_t *stream, Torus *lwe_array, uint32_t shift,
int_shift_buffer<Torus> *mem, void *bsk, Torus *ksk, uint32_t num_blocks) {
cudaSetDevice(stream->gpu_index);
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -44,10 +46,10 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
size_t rotations = std::min(shift / num_bits_in_block, (size_t)num_blocks);
size_t shift_within_block = shift % num_bits_in_block;
Torus *rotated_buffer = mem->tmp_rotated;
Torus *full_rotated_buffer = mem->tmp_rotated;
Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
auto lut_univariate = mem->lut_buffers_univariate[shift_within_block];
// rotate right all the blocks in radix ciphertext
// copy result in new buffer
@@ -68,23 +70,15 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
return;
}
// check if we have enough blocks for partial processing
if (rotations < num_blocks - 1) {
auto partial_current_blocks = &lwe_array[(rotations + 1) * big_lwe_size];
auto partial_previous_blocks = &lwe_array[rotations * big_lwe_size];
auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
auto partial_previous_blocks =
&full_rotated_buffer[rotations * big_lwe_size];
size_t partial_block_count = num_blocks - rotations - 1;
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_previous_blocks, bsk, ksk, partial_block_count,
lut_bivariate);
}
auto rest = &lwe_array[rotations * big_lwe_size];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, rest, rest, bsk, ksk, 1, lut_univariate);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_previous_blocks, bsk, ksk, partial_block_count, lut_bivariate);
} else {
// right shift
@@ -102,23 +96,14 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace(
return;
}
// check if we have enough blocks for partial processing
if (rotations < num_blocks - 1) {
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &lwe_array[big_lwe_size];
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
size_t partial_block_count = num_blocks - rotations - 1;
size_t partial_block_count = num_blocks - rotations;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
// The right-most block is done separately as it does not
// need to recuperate the shifted bits from its right neighbour.
auto last_block = &lwe_array[(num_blocks - rotations - 1) * big_lwe_size];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, last_block, last_block, bsk, ksk, 1, lut_univariate);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, partial_current_blocks, partial_current_blocks,
partial_next_blocks, bsk, ksk, partial_block_count, lut_bivariate);
}
}

View File

@@ -1 +1 @@
#include "bootstrapping_key.cuh"
#include "bootstrap.cuh"

View File

@@ -0,0 +1,136 @@
#include "../../include/bootstrap.h"
#include "../../include/device.h"
#include "../include/device.h"
#include "bootstrap_low_latency.cuh"
#include "bootstrap_multibit.cuh"
/*
 * Runs a programmable bootstrap (PBS) by dispatching to the CUDA entry point
 * that matches both the width of `Torus` (32 or 64 bits) and `pbs_type`.
 *
 * All ciphertext, LUT, index, key and buffer pointers are forwarded untouched
 * to the selected entry point, together with the cryptographic parameters.
 * `grouping_factor` is only forwarded to the 64-bit multi-bit entry point.
 *
 * Note that this function takes `glwe_dimension` before `lwe_dimension`,
 * whereas the entry points it calls take `lwe_dimension` first.
 *
 * Panics when:
 * - `pbs_type` is MULTI_BIT with a 32-bit Torus (not supported),
 * - `pbs_type` is not one of MULTI_BIT, LOW_LAT or AMORTIZED,
 * - `Torus` is neither 32 nor 64 bits wide.
 */
template <typename Torus>
void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
                 Torus *lwe_output_indexes, Torus *lut_vector,
                 Torus *lut_vector_indexes, Torus *lwe_array_in,
                 Torus *lwe_input_indexes, void *bootstrapping_key,
                 int8_t *pbs_buffer, uint32_t glwe_dimension,
                 uint32_t lwe_dimension, uint32_t polynomial_size,
                 uint32_t base_log, uint32_t level_count,
                 uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
                 uint32_t num_luts, uint32_t lwe_idx,
                 uint32_t max_shared_memory, PBS_TYPE pbs_type) {
  switch (sizeof(Torus)) {
  case sizeof(uint32_t):
    // 32 bits
    switch (pbs_type) {
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case LOW_LAT:
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_luts, lwe_idx, max_shared_memory);
      break;
    case AMORTIZED:
      cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_luts, lwe_idx, max_shared_memory);
      break;
    default:
      // An unknown PBS type used to be ignored silently here, leaving the
      // output untouched; fail loudly like the 64-bit branch does.
      PANIC("Error: unsupported cuda PBS type.")
    }
    break;
  case sizeof(uint64_t):
    // 64 bits
    switch (pbs_type) {
    case MULTI_BIT:
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, grouping_factor, base_log, level_count,
          input_lwe_ciphertext_count, num_luts, lwe_idx, max_shared_memory);
      break;
    case LOW_LAT:
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_luts, lwe_idx, max_shared_memory);
      break;
    case AMORTIZED:
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, lwe_array_out, lwe_output_indexes, lut_vector,
          lut_vector_indexes, lwe_array_in, lwe_input_indexes,
          bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
          num_luts, lwe_idx, max_shared_memory);
      break;
    default:
      PANIC("Error: unsupported cuda PBS type.")
    }
    break;
  default:
    PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
          "moduli are supported.")
  }
}
/*
 * Prepares the scratch buffer for a PBS by calling the scratch entry point
 * that matches the width of `Torus` (32 or 64 bits) and `pbs_type`.
 *
 * `lwe_dimension` and `grouping_factor` are only forwarded to the 64-bit
 * multi-bit scratch function; `level_count` is not forwarded to the amortized
 * ones. Panics on a 32-bit multi-bit request, on an unknown `pbs_type`, and
 * on a Torus that is neither 32 nor 64 bits wide.
 */
template <typename Torus>
void execute_scratch_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
                         uint32_t glwe_dimension, uint32_t lwe_dimension,
                         uint32_t polynomial_size, uint32_t level_count,
                         uint32_t grouping_factor,
                         uint32_t input_lwe_ciphertext_count,
                         uint32_t max_shared_memory, PBS_TYPE pbs_type,
                         bool allocate_gpu_memory) {
  // 32-bit Torus
  if (sizeof(Torus) == sizeof(uint32_t)) {
    switch (pbs_type) {
    case MULTI_BIT:
      PANIC("Error: 32-bit multibit PBS is not supported.\n")
    case LOW_LAT:
      scratch_cuda_bootstrap_low_latency_32(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
      return;
    case AMORTIZED:
      scratch_cuda_bootstrap_amortized_32(
          stream, pbs_buffer, glwe_dimension, polynomial_size,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
      return;
    default:
      PANIC("Error: unsupported cuda PBS type.")
    }
    return;
  }

  // 64-bit Torus
  if (sizeof(Torus) == sizeof(uint64_t)) {
    switch (pbs_type) {
    case MULTI_BIT:
      scratch_cuda_multi_bit_pbs_64(
          stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          level_count, grouping_factor, input_lwe_ciphertext_count,
          max_shared_memory, allocate_gpu_memory);
      return;
    case LOW_LAT:
      scratch_cuda_bootstrap_low_latency_64(
          stream, pbs_buffer, glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
      return;
    case AMORTIZED:
      scratch_cuda_bootstrap_amortized_64(
          stream, pbs_buffer, glwe_dimension, polynomial_size,
          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
      return;
    default:
      PANIC("Error: unsupported cuda PBS type.")
    }
    return;
  }

  // Any other Torus width
  PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
        "moduli are supported.")
}

View File

@@ -11,31 +11,9 @@ uint64_t get_buffer_size_bootstrap_amortized_64(
max_shared_memory);
}
/*
 * Asserts that `polynomial_size` is one of the sizes handled by the amortized
 * PBS: 256, 512, 1024, 2048, 4096, 8192 or 16384.
 *
 * The check is a plain `assert`, so it has no effect when the code is built
 * with NDEBUG defined. The string literal is attached through the comma
 * operator only so that it shows up in the assertion failure text.
 */
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
 * Asserts that `base_log` does not exceed `nbits`, then runs the polynomial
 * size check of checks_fast_bootstrap_amortized on `polynomial_size`.
 *
 * - nbits: upper bound allowed for base_log (callers pass 32 or 64)
 * - base_log: value compared against nbits
 * - polynomial_size: forwarded to checks_fast_bootstrap_amortized
 *
 * The base log check is a plain `assert`, so it has no effect when the code
 * is built with NDEBUG defined.
 */
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
assert(("Error (GPU amortized PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_amortized(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
* the amortized PBS on 32 bits inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
@@ -43,7 +21,6 @@ void scratch_cuda_bootstrap_amortized_32(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -82,13 +59,15 @@ void scratch_cuda_bootstrap_amortized_32(
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
* the amortized PBS on 64 bits inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
@@ -96,7 +75,6 @@ void scratch_cuda_bootstrap_amortized_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
@@ -135,7 +113,9 @@ void scratch_cuda_bootstrap_amortized_64(
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
@@ -148,9 +128,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(32, base_log, polynomial_size);
// base_log must not exceed the 32 bits of the ciphertext representation; the
// message states the constraint that was violated.
if (base_log > 32)
  PANIC("Cuda error (amortized PBS): base log should be <= number of bits in "
        "the ciphertext representation (32)");
switch (polynomial_size) {
case 256:
@@ -159,8 +141,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
@@ -168,8 +150,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
@@ -177,8 +159,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
@@ -186,8 +168,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
@@ -195,8 +177,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
@@ -204,8 +186,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
@@ -213,11 +195,13 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
default:
break;
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
@@ -228,11 +212,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* which lut of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
@@ -244,17 +228,17 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* GLWE. The polynomial size for GLWE and the lut
* are the same because they have to be in the same ring
* to be multiplied.
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* - num_luts: parameter to set the actual number of luts to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
@@ -292,9 +276,11 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory) {
checks_bootstrap_amortized(64, base_log, polynomial_size);
// base_log must not exceed the 64 bits of the ciphertext representation; the
// message states the constraint that was violated.
if (base_log > 64)
  PANIC("Cuda error (amortized PBS): base log should be <= number of bits in "
        "the ciphertext representation (64)");
switch (polynomial_size) {
case 256:
@@ -303,8 +289,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
@@ -312,8 +298,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
@@ -321,8 +307,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
@@ -330,8 +316,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
@@ -339,8 +325,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
@@ -348,8 +334,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
@@ -357,17 +343,19 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
polynomial_size, base_log, level_count, num_samples, num_luts, lwe_idx,
max_shared_memory);
break;
default:
break;
PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
* buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer) {

View File

@@ -24,10 +24,10 @@ template <typename Torus, class params, sharedMemDegree SMD>
* Uses shared memory to increase performance
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* - lut_vector: should hold as many luts of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* num_luts vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which lut
* to use for each sample in lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
@@ -37,7 +37,7 @@ template <typename Torus, class params, sharedMemDegree SMD>
* == NOSM or PARTIALSM)
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* - polynomial_size: size of the test polynomial (lut) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
@@ -288,8 +288,8 @@ __host__ void host_bootstrap_amortized(
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(

View File

@@ -245,51 +245,10 @@ __global__ void device_bootstrap_fast_low_latency(
}
}
// Returns the number of bytes taken by the rotated accumulator, the
// accumulator and the FFT accumulator for a polynomial of `polynomial_size`
// coefficients.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
  uint64_t accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  uint64_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_bytes + accumulator_fft_bytes;
}
// Returns the number of bytes taken by the FFT accumulator alone (mask and
// body) for a polynomial of `polynomial_size` coefficients.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
    uint32_t polynomial_size) {
  uint64_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
// Computes the size in bytes of the buffer used by the fast low latency PBS.
//
// The result is the sum of:
// - a device-memory part that depends on how `max_shared_memory` compares to
//   the full and partial shared memory requirements, and
// - an FFT part of (glwe_dimension + 1) * level_count *
//   input_lwe_ciphertext_count * polynomial_size / 2 double2 elements.
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
// What still has to live in device memory when only the partial amount fits
// in shared memory, and when nothing fits at all.
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
// Stays 0 when max_shared_memory >= full_sm.
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
// NOTE(review): the product of the four uint32_t operands below is evaluated
// in 32-bit arithmetic before the multiplication by sizeof(double2) widens
// it - confirm it cannot wrap for the largest supported batch.
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
// Adds the remainder of buffer_size modulo sizeof(double2).
// NOTE(review): if the intent is to round up to a multiple of
// sizeof(double2), this expression does not do that in general - confirm.
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
@@ -316,13 +275,10 @@ __host__ void scratch_bootstrap_fast_low_latency(
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, LOW_LAT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::FAST, allocate_gpu_memory);
}
/*
@@ -333,11 +289,11 @@ template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -353,15 +309,8 @@ __host__ void host_bootstrap_fast_low_latency(
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = pbs_buffer;
double2 *buffer_fft =
(double2 *)d_mem +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory) /
sizeof(double2) -
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * polynomial_size / 2);
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
@@ -436,12 +385,12 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
full_sm);
}
// Get the number of streaming multiprocessors
@@ -450,4 +399,46 @@ __host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// Tells whether the grid of the low latency kernel satisfies the cooperative
// group constraints, by running the grid-size verification instantiated for
// the given polynomial_size. Panics on an unsupported polynomial_size.
template <typename Torus>
__host__ bool supports_cooperative_groups_on_lowlat_pbs(
    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
    uint32_t max_shared_memory) {
  if (polynomial_size == 256)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
                                     max_shared_memory);
  if (polynomial_size == 512)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
                                     max_shared_memory);
  if (polynomial_size == 1024)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 2048)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 4096)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 8192)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 16384)
    return verify_cuda_bootstrap_fast_low_latency_grid_size<
        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                       num_samples, max_shared_memory);

  // None of the supported sizes matched.
  PANIC("Cuda error (low latency PBS): unsupported polynomial size. "
        "Supported N's are powers of two"
        " in the interval [256..16384].")
}
#endif // LOWLAT_FAST_PBS_H

View File

@@ -3,7 +3,6 @@
#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
@@ -155,11 +154,11 @@ __host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0) {
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
@@ -184,30 +183,25 @@ __host__ void scratch_fast_multi_bit_pbs(
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::FAST,
allocate_gpu_memory);
}
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
if (!lwe_chunk_size)
@@ -215,15 +209,9 @@ __host__ void host_fast_multi_bit_pbs(
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *buffer_fft = (double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
Torus *global_accumulator =
(Torus *)buffer_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
double2 *keybundle_fft = pbs_buffer->keybundle_fft;
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *buffer_fft = pbs_buffer->global_accumulator_fft;
//
uint64_t full_sm_keybundle =
@@ -319,4 +307,46 @@ verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// Tells whether the grid required by the fast multi-bit PBS kernel satisfies
// the cooperative group constraints for the given parameters.
//
// The polynomial size is a compile-time parameter of the underlying check, so
// the runtime value is matched against each supported size and the question is
// forwarded to verify_cuda_bootstrap_fast_multi_bit_grid_size instantiated
// with the corresponding AmortizedDegree<N>. Any other polynomial size
// triggers PANIC.
template <typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_pbs(
    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
    uint32_t max_shared_memory) {
  if (polynomial_size == 256)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
                                     max_shared_memory);
  if (polynomial_size == 512)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
                                     max_shared_memory);
  if (polynomial_size == 1024)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 2048)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 4096)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 8192)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
                                      max_shared_memory);
  if (polynomial_size == 16384)
    return verify_cuda_bootstrap_fast_multi_bit_grid_size<
        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
                                       num_samples, max_shared_memory);

  // None of the supported sizes matched.
  PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
        "N's are powers of two"
        " in the interval [256..16384].")
}
#endif // FASTMULTIBIT_PBS_H

View File

@@ -221,27 +221,6 @@ __global__ void device_bootstrap_low_latency_step_two(
}
}
// Size in bytes of the "full sm" working area for step one of the low latency
// bootstrap: one polynomial of Torus coefficients (the rotated accumulator)
// followed by polynomial_size / 2 double2 values (the accumulator FFT).
// NOTE(review): "sm" presumably stands for shared memory - confirm.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
    uint32_t polynomial_size) {
  const size_t accumulator_rotated_bytes = sizeof(Torus) * polynomial_size;
  const size_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_rotated_bytes + accumulator_fft_bytes;
}
// Size in bytes of the "full sm" working area for step two of the low latency
// bootstrap: one polynomial of Torus coefficients (the accumulator) followed
// by polynomial_size / 2 double2 values (the accumulator FFT).
// NOTE(review): "sm" presumably stands for shared memory - confirm.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
    uint32_t polynomial_size) {
  const size_t accumulator_bytes = sizeof(Torus) * polynomial_size;
  const size_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_bytes + accumulator_fft_bytes;
}
// Size in bytes of the "partial sm" working area of the low latency
// bootstrap: only the accumulator FFT, i.e. polynomial_size / 2 double2
// values. The Torus template parameter does not influence the result.
// NOTE(review): "sm" presumably stands for shared memory - confirm.
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
  const size_t accumulator_fft_bytes = sizeof(double2) * polynomial_size / 2;
  return accumulator_fft_bytes;
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -285,8 +264,8 @@ __host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
cuda_stream_t *stream, pbs_buffer<Torus, LOW_LAT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(stream->gpu_index);
@@ -338,13 +317,9 @@ __host__ void scratch_bootstrap_low_latency(
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, LOW_LAT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
}
template <typename Torus, class params>
@@ -432,11 +407,11 @@ template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, LOW_LAT> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_luts, uint32_t max_shared_memory) {
cudaSetDevice(stream->gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -456,16 +431,9 @@ __host__ void host_bootstrap_low_latency(
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * (polynomial_size / 2) /
sizeof(Torus));
int8_t *d_mem = (int8_t *)global_accumulator +
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
input_lwe_ciphertext_count * polynomial_size /
sizeof(int8_t));
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_low_latency_step_one<Torus, params>(

View File

@@ -3,365 +3,357 @@
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
void checks_multi_bit_pbs(int polynomial_size) {
assert(
("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
// Non-template entry point answering whether the fast multi-bit PBS can be
// used with these parameters. It forwards to
// supports_cooperative_groups_on_multibit_pbs instantiated for 64-bit
// ciphertexts (uint64_t) and returns that answer unchanged.
bool has_support_to_cuda_bootstrap_fast_multi_bit(uint32_t glwe_dimension,
                                                  uint32_t polynomial_size,
                                                  uint32_t level_count,
                                                  uint32_t num_samples,
                                                  uint32_t max_shared_memory) {
  const bool fast_variant_supported =
      supports_cooperative_groups_on_multibit_pbs<uint64_t>(
          glwe_dimension, polynomial_size, level_count, num_samples,
          max_shared_memory);
  return fast_variant_supported;
}
// Runs the fast multi-bit PBS on a vector of LWE ciphertexts.
//
// The polynomial size is a compile-time parameter of the host routine, so the
// runtime value selects which host_fast_multi_bit_pbs instantiation
// (AmortizedDegree<N>) receives the call. All arguments are forwarded
// unchanged; note that the host routine takes glwe_dimension before
// lwe_dimension, the reverse of this function's parameter order.
//
// Panics when base_log exceeds 64 or when polynomial_size is not one of
// 256, 512, 1024, 2048, 4096, 8192, 16384.
template <typename Torus>
void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size) {

  // The message states the requirement (base_log <= 64); the previous wording
  // said ">" and therefore described the failing condition as the valid one.
  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= number of bits "
          "in the ciphertext representation (64)");

  switch (polynomial_size) {
  case 256:
    // Instantiated on Torus like every other case (was hard-coded uint64_t).
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<256>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 512:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 1024:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 2048:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 4096:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 8192:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 16384:
    host_fast_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
// Runs the default (non-fast) multi-bit PBS on a vector of LWE ciphertexts.
//
// The polynomial size is a compile-time parameter of the host routine, so the
// runtime value selects which host_multi_bit_pbs instantiation
// (AmortizedDegree<N>) receives the call. All arguments are forwarded
// unchanged; note that the host routine is called with glwe_dimension before
// lwe_dimension, the reverse of this function's parameter order.
//
// Panics when base_log exceeds 64 or when polynomial_size is not one of
// 256, 512, 1024, 2048, 4096, 8192, 16384.
template <typename Torus>
void cuda_multi_bit_pbs_lwe_ciphertext_vector(
    cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
    Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
    Torus *lwe_input_indexes, Torus *bootstrapping_key,
    pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
    uint32_t lwe_chunk_size) {

  // The message states the requirement (base_log <= 64); the previous wording
  // said ">" and therefore described the failing condition as the valid one.
  if (base_log > 64)
    PANIC("Cuda error (multi-bit PBS): base log should be <= number of bits "
          "in the ciphertext representation (64)");

  switch (polynomial_size) {
  case 256:
    // Instantiated on Torus like every other case (was hard-coded uint64_t).
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<256>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 512:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<512>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 1024:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<1024>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 2048:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<2048>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 4096:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<4096>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 8192:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<8192>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  case 16384:
    host_multi_bit_pbs<Torus, int64_t, AmortizedDegree<16384>>(
        stream, lwe_array_out, lwe_output_indexes, lut_vector,
        lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
        pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size,
        grouping_factor, base_log, level_count, num_samples, num_luts, lwe_idx,
        max_shared_memory, lwe_chunk_size);
    break;
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }
}
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
checks_multi_bit_pbs(polynomial_size);
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory))
cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
else
cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key),
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
}
template <typename Torus, typename STorus>
void scratch_cuda_fast_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<256>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<512>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<1024>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<2048>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<4096>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<8192>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
static_cast<uint64_t *>(lut_vector),
static_cast<uint64_t *>(lut_vector_indexes),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
scratch_fast_multi_bit_pbs<Torus, STorus, AmortizedDegree<16384>>(
stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..16384].")
}
}
// Prepares the buffer object used by the default (non-fast) multi-bit PBS.
//
// The runtime polynomial_size picks the scratch_multi_bit_pbs instantiation
// (AmortizedDegree<N>) that receives the call; every argument is forwarded
// unchanged, with input_lwe_ciphertext_count placed before grouping_factor as
// the callee is invoked that way. Panics when polynomial_size is not one of
// 256, 512, 1024, 2048, 4096, 8192, 16384.
template <typename Torus, typename STorus>
void scratch_cuda_multi_bit_pbs(
    cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t grouping_factor,
    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {

// Emits one complete `case N: ...; break;` arm for polynomial size N. Only the
// AmortizedDegree template argument differs between the arms.
#define SCRATCH_MULTI_BIT_PBS_CASE(N)                                          \
  case N:                                                                      \
    scratch_multi_bit_pbs<Torus, STorus, AmortizedDegree<N>>(                  \
        stream, buffer, lwe_dimension, glwe_dimension, polynomial_size,        \
        level_count, input_lwe_ciphertext_count, grouping_factor,              \
        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);               \
    break;

  switch (polynomial_size) {
    SCRATCH_MULTI_BIT_PBS_CASE(256)
    SCRATCH_MULTI_BIT_PBS_CASE(512)
    SCRATCH_MULTI_BIT_PBS_CASE(1024)
    SCRATCH_MULTI_BIT_PBS_CASE(2048)
    SCRATCH_MULTI_BIT_PBS_CASE(4096)
    SCRATCH_MULTI_BIT_PBS_CASE(8192)
    SCRATCH_MULTI_BIT_PBS_CASE(16384)
  default:
    PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
          "N's are powers of two"
          " in the interval [256..16384].")
  }

#undef SCRATCH_MULTI_BIT_PBS_CASE
}
void scratch_cuda_multi_bit_pbs_64(
cuda_stream_t *stream, int8_t **pbs_buffer, uint32_t lwe_dimension,
cuda_stream_t *stream, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, grouping_factor,
max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
}
break;
default:
break;
}
if (supports_cooperative_groups_on_multibit_pbs<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
else
scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
void cleanup_cuda_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer) {
// Free memory
cuda_drop_async(*pbs_buffer, stream);
/*
 * Tears down the 32-bit multi-bit PBS scratch object referenced by `*buffer`.
 *
 * `*buffer` is reinterpreted as a `pbs_buffer<uint32_t, MULTI_BIT>` and its
 * `release(stream)` method is invoked. The pointer stored in `*buffer` is not
 * modified by this function.
 * NOTE(review): whether `release` also destroys the host-side object is not
 * visible here - confirm in `pbs_buffer`.
 */
void cleanup_cuda_multi_bit_pbs_32(cuda_stream_t *stream, int8_t **buffer) {
  pbs_buffer<uint32_t, MULTI_BIT> *scratch =
      reinterpret_cast<pbs_buffer<uint32_t, MULTI_BIT> *>(*buffer);
  scratch->release(stream);
}
/*
 * Tears down the 64-bit multi-bit PBS scratch object referenced by `*buffer`.
 *
 * `*buffer` is reinterpreted as a `pbs_buffer<uint64_t, MULTI_BIT>` and its
 * `release(stream)` method is invoked. The pointer stored in `*buffer` is not
 * modified by this function.
 * NOTE(review): whether `release` also destroys the host-side object is not
 * visible here - confirm in `pbs_buffer`.
 */
void cleanup_cuda_multi_bit_pbs_64(cuda_stream_t *stream, int8_t **buffer) {
  pbs_buffer<uint64_t, MULTI_BIT> *scratch =
      reinterpret_cast<pbs_buffer<uint64_t, MULTI_BIT> *>(*buffer);
  scratch->release(stream);
}
// Pick the best possible chunk size for each GPU
@@ -427,7 +419,12 @@ __host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
return 9;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return 45;
if (num_samples < 1024)
return 128;
else if (num_samples < 4096)
return 64;
else
return 32;
}
// Generic case
@@ -455,11 +452,11 @@ __host__ uint32_t get_average_lwe_chunk_size(uint32_t lwe_dimension,
return (ct_count > 10000) ? 30 : 45;
} else if (std::strstr(deviceProp.name, h100Name) != nullptr) {
// Tesla H100
return (ct_count > 10000) ? 30 : 45;
return 64;
}
// Generic case
return (ct_count > 10000) ? 2 : 10;
return (ct_count > 10000) ? 2 : 1;
}
// Returns the maximum buffer size required to execute batches up to
@@ -473,14 +470,51 @@ __host__ uint64_t get_max_buffer_size_multibit_bootstrap(
for (uint32_t input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
input_lwe_ciphertext_count *= 2) {
max_buffer_size = std::max(
max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count)));
max_buffer_size =
std::max(max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension,
input_lwe_ciphertext_count)));
}
return max_buffer_size;
}
// Explicit template instantiations for the 64-bit multi-bit PBS entry points.
// Each statement below forces the compiler to emit the named specialization
// in this translation unit so that other translation units can link to it.

// Scratch (buffer setup) for the default multi-bit PBS, <uint64_t, int64_t>.
template void scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
// Execution of the default multi-bit PBS on a vector of LWE ciphertexts,
// <uint64_t>.
template void cuda_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);
// Scratch (buffer setup) for the "fast" multi-bit PBS variant,
// <uint64_t, int64_t>. Same parameter list as the default scratch above.
template void scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
cuda_stream_t *stream, pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
// Execution of the "fast" multi-bit PBS variant, <uint64_t>. Same parameter
// list as the default execution entry point above.
template void cuda_fast_multi_bit_pbs_lwe_ciphertext_vector<uint64_t>(
cuda_stream_t *stream, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
uint64_t *lut_vector_indexes, uint64_t *lwe_array_in,
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size);

View File

@@ -329,13 +329,12 @@ __host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
}
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
__host__ void scratch_multi_bit_pbs(
cuda_stream_t *stream, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
@@ -374,29 +373,25 @@ scratch_multi_bit_pbs(cuda_stream_t *stream, int8_t **pbs_buffer,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_average_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream);
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_average_lwe_chunk_size(
lwe_dimension, level_count, glwe_dimension, input_lwe_ciphertext_count);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
allocate_gpu_memory);
}
template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
cuda_stream_t *stream, Torus *lwe_array_out, Torus *lwe_output_indexes,
Torus *lut_vector, Torus *lut_vector_indexes, Torus *lwe_array_in,
Torus *lwe_input_indexes, uint64_t *bootstrapping_key, int8_t *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
uint32_t lwe_chunk_size = 0) {
cudaSetDevice(stream->gpu_index);
// If a chunk size is not passed to this function, select one.
@@ -404,15 +399,9 @@ __host__ void host_multi_bit_pbs(
lwe_chunk_size = get_average_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *global_accumulator_fft =
(double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
double2 *keybundle_fft = buffer->keybundle_fft;
Torus *global_accumulator = buffer->global_accumulator;
double2 *global_accumulator_fft = buffer->global_accumulator_fft;
//
uint64_t full_sm_keybundle =

View File

@@ -0,0 +1,78 @@
#include "bootstrapping_key.cuh"
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
/*
 * Copies a 64-bit multi-bit bootstrapping key from `src` to `dest` with
 * cuda_memcpy_async_to_gpu on `stream`. No transformation is applied here.
 *
 * The number of polynomials copied is
 *   input_lwe_dim * (glwe_dim + 1)^2 * level_count * 2^grouping_factor
 *     / grouping_factor
 * and each polynomial holds `polynomial_size` uint64_t coefficients.
 *
 * All size arithmetic is carried out in size_t: the previous 32-bit products
 * could wrap around for large parameter sets before being widened, which
 * would silently copy a truncated key.
 *
 * `grouping_factor` is used as a divisor and therefore must be non-zero.
 */
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor) {
  const size_t glwe_size = static_cast<size_t>(glwe_dim) + 1;
  // Same left-to-right evaluation order as before (multiply, then divide),
  // only widened to size_t.
  const size_t total_polynomials =
      static_cast<size_t>(input_lwe_dim) * glwe_size * glwe_size *
      level_count * (static_cast<size_t>(1) << grouping_factor) /
      grouping_factor;
  const size_t buffer_size =
      total_polynomials * polynomial_size * sizeof(uint64_t);

  cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
                           stream);
}
// Explicit template instantiations: these statements make the compiler emit
// the listed specializations of the key-indexing device helpers in this
// translation unit.

// get_ith_mask_kth_block for uint64_t, uint32_t and double2 pointers.
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
// get_ith_body_kth_block for uint64_t, uint32_t and double2 pointers.
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
// get_multi_bit_ith_lwe_gth_group_kth_block for uint64_t and double2 pointers
// (no uint32_t instantiation is requested here).
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);

View File

@@ -100,11 +100,10 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream);
double2 *buffer;
double2 *buffer = (double2 *)cuda_malloc_async(0, stream);
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -123,7 +122,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -142,7 +140,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -161,7 +158,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -180,7 +176,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -199,7 +194,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -218,7 +212,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(stream->gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -236,7 +229,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
}
break;
default:
break;
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
cuda_drop_async(d_bsk, stream);
@@ -244,43 +238,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src,
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src,
cuda_stream_t *stream,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, stream, input_lwe_dim, glwe_dim,
level_count, polynomial_size, total_polynomials);
}
/*
 * Copies a 64-bit multi-bit bootstrapping key from `src` to `dest` with
 * cuda_memcpy_async_to_gpu on `stream`. No transformation is applied here.
 *
 * The number of polynomials copied is
 *   input_lwe_dim * (glwe_dim + 1)^2 * level_count * 2^grouping_factor
 *     / grouping_factor
 * and each polynomial holds `polynomial_size` uint64_t coefficients.
 *
 * All size arithmetic is carried out in size_t: the previous 32-bit products
 * could wrap around for large parameter sets before being widened, which
 * would silently copy a truncated key.
 *
 * `grouping_factor` is used as a divisor and therefore must be non-zero.
 */
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
    void *dest, void *src, cuda_stream_t *stream, uint32_t input_lwe_dim,
    uint32_t glwe_dim, uint32_t level_count, uint32_t polynomial_size,
    uint32_t grouping_factor) {
  const size_t glwe_size = static_cast<size_t>(glwe_dim) + 1;
  // Same left-to-right evaluation order as before (multiply, then divide),
  // only widened to size_t.
  const size_t total_polynomials =
      static_cast<size_t>(input_lwe_dim) * glwe_size * glwe_size *
      level_count * (static_cast<size_t>(1) << grouping_factor) /
      grouping_factor;
  const size_t buffer_size =
      total_polynomials * polynomial_size * sizeof(uint64_t);

  cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
                           stream);
}
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
cuda_stream_t *stream,
uint32_t polynomial_size,
@@ -458,43 +415,4 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
cuda_drop_async(buffer, stream);
}
// Explicit template instantiations: these statements make the compiler emit
// the listed specializations of the key-indexing device helpers in this
// translation unit.

// get_ith_mask_kth_block for uint64_t, uint32_t and double2 pointers.
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
// get_ith_body_kth_block for uint64_t, uint32_t and double2 pointers.
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
// get_multi_bit_ith_lwe_gth_group_kth_block for uint64_t and double2 pointers
// (no uint32_t instantiation is requested here).
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif // CNCRT_BSK_H

View File

@@ -0,0 +1,12 @@
# Opt-in switches for the auxiliary tools. Both default to OFF, so neither
# subdirectory is added unless the corresponding option is enabled
# (e.g. -DTFHE_CUDA_BACKEND_BUILD_TESTS=ON on the cmake command line).
option(TFHE_CUDA_BACKEND_BUILD_TESTS "Build the test tool" OFF)
option(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS "Build the benchmark tool" OFF)
# Add the tests/ subdirectory only when tests were requested.
if(TFHE_CUDA_BACKEND_BUILD_TESTS)
message(STATUS "Building the test tool")
add_subdirectory(tests)
endif()
# Add the benchmarks/ subdirectory only when benchmarks were requested.
if(TFHE_CUDA_BACKEND_BUILD_BENCHMARKS)
message(STATUS "Building the benchmark tool")
add_subdirectory(benchmarks)
endif()

View File

@@ -0,0 +1,88 @@
project(benchmark_tfhe_cuda_backend LANGUAGES CXX)

# Minimum CUDA version this project is meant to support.
# NOTE(review): this variable is not referenced anywhere else in this file, so
# no version check is actually enforced here - confirm whether a parent
# CMakeLists.txt performs it.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable the CUDA language and locate the toolkit.
if(CMAKE_CUDA_COMPILER)
  enable_language(CUDA)
  find_package(CUDAToolkit)
endif()

# The benchmarks cannot be built without a CUDA compiler: abort configuration.
if(NOT CMAKE_CUDA_COMPILER)
  message(FATAL_ERROR "Cuda compiler not found.")
endif()

# Benchmarks are only meaningful with optimizations on: default to Release.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()

# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)

include(FetchContent)
FetchContent_Declare(
  googlebenchmark
  GIT_REPOSITORY https://github.com/google/benchmark.git
  GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)

# Enable ExternalProject CMake module (used below to build tfhe-rs).
include(ExternalProject)

set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")

# Location of the tfhe-rs checkout and of its release build artifacts. The
# source directory is expressed relative to the CMake binary directory.
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")

# Build the tfhe-rs C API in place with `make build_c_api`, unless another
# CMakeLists.txt has already declared the target.
if(NOT TARGET tfhe-rs)
  ExternalProject_Add(
    tfhe-rs
    SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
    BUILD_IN_SOURCE 1
    BUILD_ALWAYS 1
    UPDATE_COMMAND ""
    CONFIGURE_COMMAND ""
    DOWNLOAD_COMMAND ""
    BUILD_COMMAND make build_c_api
    INSTALL_COMMAND ""
    LOG_BUILD ON)
endif()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/src)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories(${TFHE_RS_BINARY_DIR}/deps)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")

find_package(OpenMP REQUIRED)
# Add the OpenMP flag to the compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

# Expose the static library produced by the tfhe-rs build as an imported
# target, and make it depend on that build.
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")

set(BINARY benchmark_tfhe_cuda_backend)

# Collect every benchmark*.cpp and main.cpp below this directory.
file(
  GLOB_RECURSE BENCH_SOURCES
  LIST_DIRECTORIES false
  benchmark*.cpp main.cpp)

add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)

set_target_properties(benchmark_tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS
                                                             ON)
target_link_libraries(
  benchmark_tfhe_cuda_backend
  PUBLIC benchmark::benchmark tfhe_rs_lib tfhe_cuda_backend OpenMP::OpenMP_CXX
  PRIVATE CUDA::cudart)

View File

@@ -0,0 +1,73 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One FFT benchmark configuration: the polynomial size and the number of
// samples passed to the benchmarked call.
struct FourierTransformTestParams {
  size_t polynomial_size;
  int samples;
};
// Google Benchmark fixture for the Fourier-domain polynomial multiplication.
//
// SetUp creates a CUDA stream on `gpu_index`, reads (polynomial_size,
// num_samples) from the benchmark arguments and lets fft_setup fill in the
// host/device polynomial pointers. TearDown hands all of them back to
// fft_teardown.
class FourierTransformTestPrimitives_u64 : public benchmark::Fixture {
protected:
  size_t polynomial_size;
  int num_samples;
  cuda_stream_t *stream;
  int gpu_index = 0;

  double *poly1;
  double *poly2; // will be used as extracted result for cuda mult
  double2 *h_cpoly1;
  double2 *h_cpoly2; // will be used as a result poly
  double2 *d_cpoly1;
  double2 *d_cpoly2; // will be used as a result poly

public:
  void SetUp(const ::benchmark::State &state) override {
    // Use the fixture's device index rather than a hard-coded 0 so that the
    // member actually controls which GPU the stream is created on.
    stream = cuda_create_stream(gpu_index);

    // get test params
    polynomial_size = state.range(0);
    num_samples = state.range(1);

    fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
              &d_cpoly2, polynomial_size, num_samples);
  }

  void TearDown(const ::benchmark::State &state) override {
    fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
  }
};
// Benchmark body: each timed iteration launches one Fourier polynomial
// multiplication on the fixture's device polynomials and then waits for the
// stream, so the measured time includes the synchronization.
BENCHMARK_DEFINE_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
(benchmark::State &st) {
for (auto _ : st) {
// d_cpoly2 is passed both as the second input and as the output.
cuda_fourier_polynomial_mul(d_cpoly1, d_cpoly2, d_cpoly2, stream,
polynomial_size, num_samples);
cuda_synchronize_stream(stream);
}
}
// Registers the argument sets for the FFT benchmark on `b`.
//
// One Args() entry is added per configuration, in the order
// {polynomial_size, samples}, matching the ArgNames given at registration.
static void FFTBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // polynomial_size, samples
  const std::vector<FourierTransformTestParams> params = {
      {256, 100},  {512, 100},  {1024, 100},  {2048, 100},
      {4096, 100}, {8192, 100}, {16384, 100},
  };

  // Add to the list of parameters to benchmark. Args() takes int64_t values,
  // so convert explicitly instead of relying on a narrowing conversion from
  // size_t inside the braced list.
  for (const auto &x : params)
    b->Args({static_cast<int64_t>(x.polynomial_size),
             static_cast<int64_t>(x.samples)});
}
// Register the benchmark: argument sets come from FFTBenchmarkGenerateParams
// and are labelled "polynomial_size" / "samples" in the report.
BENCHMARK_REGISTER_F(FourierTransformTestPrimitives_u64, cuda_fft_mult)
->Apply(FFTBenchmarkGenerateParams)
->ArgNames({"polynomial_size", "samples"});

View File

@@ -0,0 +1,372 @@
#include <benchmark/benchmark.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
// One multi-bit PBS benchmark configuration. The field order matches the
// order in which MultiBitBootstrap_u64::SetUp reads state.range(0..7).
struct MultiBitPBSBenchmarkParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int input_lwe_ciphertext_count;
  int grouping_factor;
  int chunk_size;
};
// One classical PBS benchmark configuration. The field order matches the
// order in which ClassicalBootstrap_u64::SetUp reads state.range(0..5).
struct BootstrapBenchmarkParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int pbs_base_log;
  int pbs_level;
  int input_lwe_ciphertext_count;
};
// Google Benchmark fixture for the multi-bit PBS benchmarks.
//
// SetUp reads the eight benchmark arguments (lwe_dimension, glwe_dimension,
// polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count,
// grouping_factor, chunk_size), creates a stream on GPU 0 and delegates the
// creation of keys, LUTs and ciphertexts to bootstrap_multibit_setup.
// TearDown hands them back to bootstrap_multibit_teardown and resets the
// device.
class MultiBitBootstrap_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int input_lwe_ciphertext_count;
  int grouping_factor;
  DynamicDistribution lwe_modular_variance;
  DynamicDistribution glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int payload_modulus;
  uint64_t delta;
  cuda_stream_t *stream;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  uint64_t *d_bsk;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *lwe_ct_out_array;
  uint64_t *d_lwe_input_indexes;
  uint64_t *d_lwe_output_indexes;
  int8_t *buffer;

  int chunk_size;

public:
  void SetUp(const ::benchmark::State &state) override {
    int gpu_index = 0;
    stream = cuda_create_stream(gpu_index);

    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    input_lwe_ciphertext_count = state.range(5);
    grouping_factor = state.range(6);
    chunk_size = state.range(7);

    // Assign the fixture members directly. Declaring same-named locals here
    // would shadow them and leave the members uninitialized.
    lwe_modular_variance =
        new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
    glwe_modular_variance =
        new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));

    Seed seed;
    init_seed(&seed);

    bootstrap_multibit_setup(
        stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk, &plaintexts,
        &d_lut_pbs_identity, &d_lut_pbs_indexes, &d_lwe_ct_in_array,
        &d_lwe_input_indexes, &d_lwe_ct_out_array, &d_lwe_output_indexes,
        &buffer, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, lwe_modular_variance, glwe_modular_variance,
        pbs_base_log, pbs_level, message_modulus, carry_modulus,
        &payload_modulus, &delta, input_lwe_ciphertext_count, 1, 1);
  }

  void TearDown(const ::benchmark::State &state) override {
    bootstrap_multibit_teardown(
        stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk, plaintexts,
        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
    cudaDeviceReset();
  }
};
// Google Benchmark fixture for the classical PBS benchmarks.
//
// SetUp reads the six benchmark arguments (lwe_dimension, glwe_dimension,
// polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count),
// creates a stream on GPU 0 and delegates the creation of keys, LUTs and
// ciphertexts to bootstrap_classical_setup. TearDown hands them back to
// bootstrap_classical_teardown and resets the device.
class ClassicalBootstrap_u64 : public benchmark::Fixture {
protected:
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  int input_lwe_ciphertext_count;
  DynamicDistribution lwe_modular_variance;
  DynamicDistribution glwe_modular_variance;
  int pbs_base_log;
  int pbs_level;
  int message_modulus = 4;
  int carry_modulus = 4;
  int payload_modulus;
  uint64_t delta;
  double *d_fourier_bsk;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_input_indexes;
  uint64_t *d_lwe_output_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *lwe_ct_array;
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  int8_t *buffer;

  cuda_stream_t *stream;

public:
  void SetUp(const ::benchmark::State &state) override {
    int gpu_index = 0;
    stream = cuda_create_stream(gpu_index);

    lwe_dimension = state.range(0);
    glwe_dimension = state.range(1);
    polynomial_size = state.range(2);
    pbs_base_log = state.range(3);
    pbs_level = state.range(4);
    input_lwe_ciphertext_count = state.range(5);

    // Assign the fixture members directly. Declaring same-named locals here
    // would shadow them and leave the members uninitialized.
    lwe_modular_variance =
        new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
    glwe_modular_variance =
        new_gaussian_from_std_dev(sqrt(0.00000000000000029403601535432533));

    Seed seed;
    init_seed(&seed);

    bootstrap_classical_setup(
        stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_fourier_bsk,
        &plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
        &d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
        &d_lwe_output_indexes, lwe_dimension, glwe_dimension, polynomial_size,
        lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
        message_modulus, carry_modulus, &payload_modulus, &delta,
        input_lwe_ciphertext_count, 1, 1);
  }

  void TearDown(const ::benchmark::State &state) override {
    bootstrap_classical_teardown(
        stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk, plaintexts,
        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
    cudaDeviceReset();
  }
};
// Benchmarks the "fast" multi-bit PBS entry point. The benchmark is skipped
// with an error when has_support_to_cuda_bootstrap_fast_multi_bit() rejects
// the current configuration. The scratch buffer is allocated before the timed
// loop and cleaned up after it.
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, FastMultiBit)
(benchmark::State &st) {
  using MultiBitBuffer = pbs_buffer<uint64_t, MULTI_BIT>;

  const bool fast_variant_supported =
      has_support_to_cuda_bootstrap_fast_multi_bit(
          glwe_dimension, polynomial_size, pbs_level,
          input_lwe_ciphertext_count,
          cuda_get_max_shared_memory(stream->gpu_index));
  if (!fast_variant_supported) {
    st.SkipWithError("Configuration not supported for fast operation");
    return;
  }

  // Allocate the scratch buffer used by every iteration.
  scratch_cuda_fast_multi_bit_pbs<uint64_t, int64_t>(
      stream, (MultiBitBuffer **)&buffer, lwe_dimension, glwe_dimension,
      polynomial_size, pbs_level, grouping_factor, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(stream->gpu_index), true, chunk_size);
  auto *typed_buffer = (MultiBitBuffer *)buffer;

  for (auto _ : st) {
    // One PBS launch followed by a stream synchronisation per iteration.
    cuda_fast_multi_bit_pbs_lwe_ciphertext_vector(
        stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
        d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
        typed_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        1, 0, cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
    cuda_synchronize_stream(stream);
  }

  cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
}
// Benchmarks the default multi-bit PBS entry point. The scratch buffer is
// allocated before the timed loop and cleaned up after it.
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
(benchmark::State &st) {
  using MultiBitBuffer = pbs_buffer<uint64_t, MULTI_BIT>;

  // Allocate the scratch buffer used by every iteration.
  scratch_cuda_multi_bit_pbs<uint64_t, int64_t>(
      stream, (MultiBitBuffer **)&buffer, lwe_dimension, glwe_dimension,
      polynomial_size, pbs_level, grouping_factor, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(stream->gpu_index), true, chunk_size);
  auto *typed_buffer = (MultiBitBuffer *)buffer;

  for (auto _ : st) {
    // One PBS launch followed by a stream synchronisation per iteration.
    cuda_multi_bit_pbs_lwe_ciphertext_vector(
        stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
        d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
        typed_buffer, lwe_dimension, glwe_dimension, polynomial_size,
        grouping_factor, pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        1, 0, cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
    cuda_synchronize_stream(stream);
  }

  cleanup_cuda_multi_bit_pbs_64(stream, &buffer);
}
// Benchmarks the "fast" low-latency classical PBS entry point. The benchmark
// is skipped with an error when
// has_support_to_cuda_bootstrap_fast_low_latency() rejects the current
// configuration. The scratch buffer is allocated before the timed loop and
// cleaned up after it.
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, FastLowLatencyPBS)
(benchmark::State &st) {
  using LowLatencyBuffer = pbs_buffer<uint64_t, LOW_LAT>;

  const bool fast_variant_supported =
      has_support_to_cuda_bootstrap_fast_low_latency<uint64_t>(
          glwe_dimension, polynomial_size, pbs_level,
          input_lwe_ciphertext_count,
          cuda_get_max_shared_memory(stream->gpu_index));
  if (!fast_variant_supported) {
    st.SkipWithError("Configuration not supported for fast operation");
    return;
  }

  // Allocate the scratch buffer used by every iteration.
  scratch_cuda_fast_bootstrap_low_latency<uint64_t, int64_t>(
      stream, (LowLatencyBuffer **)&buffer, glwe_dimension, polynomial_size,
      pbs_level, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(stream->gpu_index), true);
  auto *typed_buffer = (LowLatencyBuffer *)buffer;

  for (auto _ : st) {
    // One PBS launch followed by a stream synchronisation per iteration.
    // The fixture stores the bootstrapping key as double*, the entry point
    // takes it as double2*.
    cuda_bootstrap_fast_low_latency_lwe_ciphertext_vector<uint64_t>(
        stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
        d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes,
        (double2 *)d_fourier_bsk, typed_buffer, lwe_dimension, glwe_dimension,
        polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        1, 0, cuda_get_max_shared_memory(stream->gpu_index));
    cuda_synchronize_stream(stream);
  }

  cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
}
// Benchmarks the default low-latency classical PBS entry point. The scratch
// buffer is allocated before the timed loop and cleaned up after it.
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
(benchmark::State &st) {
  using LowLatencyBuffer = pbs_buffer<uint64_t, LOW_LAT>;

  // Allocate the scratch buffer used by every iteration.
  scratch_cuda_bootstrap_low_latency<uint64_t, int64_t>(
      stream, (LowLatencyBuffer **)&buffer, glwe_dimension, polynomial_size,
      pbs_level, input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(stream->gpu_index), true);
  auto *typed_buffer = (LowLatencyBuffer *)buffer;

  for (auto _ : st) {
    // One PBS launch followed by a stream synchronisation per iteration.
    // The fixture stores the bootstrapping key as double*, the entry point
    // takes it as double2*.
    cuda_bootstrap_low_latency_lwe_ciphertext_vector<uint64_t>(
        stream, d_lwe_ct_out_array, d_lwe_output_indexes, d_lut_pbs_identity,
        d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes,
        (double2 *)d_fourier_bsk, typed_buffer, lwe_dimension, glwe_dimension,
        polynomial_size, pbs_base_log, pbs_level, input_lwe_ciphertext_count,
        1, 0, cuda_get_max_shared_memory(stream->gpu_index));
    cuda_synchronize_stream(stream);
  }

  cleanup_cuda_bootstrap_low_latency_64(stream, &buffer);
}
// Benchmarks the amortized classical PBS entry point. The scratch buffer is
// allocated before the timed loop and cleaned up after it.
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
(benchmark::State &st) {
  // Allocate the scratch buffer used by every iteration.
  scratch_cuda_bootstrap_amortized_64(
      stream, &buffer, glwe_dimension, polynomial_size,
      input_lwe_ciphertext_count,
      cuda_get_max_shared_memory(stream->gpu_index), true);

  for (auto _ : st) {
    // One PBS launch followed by a stream synchronisation per iteration.
    // The 64-bit entry point is called with untyped pointers.
    cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
        stream, static_cast<void *>(d_lwe_ct_out_array),
        static_cast<void *>(d_lwe_output_indexes),
        static_cast<void *>(d_lut_pbs_identity),
        static_cast<void *>(d_lut_pbs_indexes),
        static_cast<void *>(d_lwe_ct_in_array),
        static_cast<void *>(d_lwe_input_indexes),
        static_cast<void *>(d_fourier_bsk), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
        input_lwe_ciphertext_count, 1, 0,
        cuda_get_max_shared_memory(stream->gpu_index));
    cuda_synchronize_stream(stream);
  }

  cleanup_cuda_bootstrap_amortized(stream, &buffer);
}
// Registers the argument sets for the multi-bit PBS benchmarks.
//
// For every parameter set and every input_lwe_ciphertext_count in
// {1, 2, 4, ..., 4096}, one run is registered per power-of-two chunk size up to
// lwe_dimension / grouping_factor, plus one run for that maximum chunk size
// itself. The maximum is registered only once even when it is a power of two.
static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Define the parameters to benchmark
  // lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
  // input_lwe_ciphertext_count, then two more fields that are presumably
  // grouping_factor and chunk_size (the struct definition is not visible
  // here; confirm against MultiBitPBSBenchmarkParams).
  std::vector<MultiBitPBSBenchmarkParams> params = {
      // 4_bits_multi_bit_group_2
      (MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2, 0},
      // 4_bits_multi_bit_group_3
      (MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3, 0},
  };

  // Add to the list of parameters to benchmark
  for (const auto &x : params) {
    const int max_chunk_size = x.lwe_dimension / x.grouping_factor;

    for (int input_lwe_ciphertext_count = 1; input_lwe_ciphertext_count <= 4096;
         input_lwe_ciphertext_count *= 2) {
      bool max_chunk_registered = false;

      for (int lwe_chunk_size = 1; lwe_chunk_size <= max_chunk_size;
           lwe_chunk_size *= 2) {
        b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
                 x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
                 x.grouping_factor, lwe_chunk_size});
        if (lwe_chunk_size == max_chunk_size)
          max_chunk_registered = true;
      }

      // Always cover the largest chunk size, but do not register it a second
      // time when the power-of-two sweep above already reached it.
      if (!max_chunk_registered)
        b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
                 x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
                 x.grouping_factor, max_chunk_size});
    }
  }
}
// Registers the argument sets for the classical PBS benchmarks: every
// parameter set is registered once for each input_lwe_ciphertext_count in
// {1, 2, 4, ..., 4096}.
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
  // Each entry lists: lwe_dimension, glwe_dimension, polynomial_size,
  // pbs_base_log, pbs_level, input_lwe_ciphertext_count
  const std::vector<BootstrapBenchmarkParams> parameter_sets = {
      // PARAM_MESSAGE_2_CARRY_2_KS_PBS
      (BootstrapBenchmarkParams){742, 1, 2048, 23, 1, 1},
  };

  const int max_num_samples = 4096;
  int num_samples = 1;
  while (num_samples <= max_num_samples) {
    for (const auto &set : parameter_sets) {
      b->Args({set.lwe_dimension, set.glwe_dimension, set.polynomial_size,
               set.pbs_base_log, set.pbs_level, num_samples});
    }
    num_samples *= 2;
  }
}
// Multi-bit PBS benchmarks.
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, FastMultiBit)
    ->Apply(MultiBitPBSBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
                "grouping_factor", "chunk_size"});
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
    ->Apply(MultiBitPBSBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
                "grouping_factor", "chunk_size"});

// Classical PBS benchmarks.
// FastLowLatencyPBS was defined above but never registered, so it never ran.
// It skips itself with an error on configurations it does not support.
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, FastLowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, DefaultLowLatencyPBS)
    ->Apply(BootstrapBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, AmortizedPBS)
    ->Apply(BootstrapBenchmarkGenerateParams)
    ->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
                "pbs_base_log", "pbs_level", "input_lwe_ciphertext_count"});

View File

@@ -0,0 +1,3 @@
#include <benchmark/benchmark.h>
// Expands to the program's main() function, which runs the benchmarks
// registered in the other translation units linked into this executable.
BENCHMARK_MAIN();

View File

@@ -0,0 +1,71 @@
// Declarations of the setup/teardown helpers that prepare keys, plaintexts,
// ciphertexts and device buffers before a run and release them afterwards.
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <device.h>
#include <keyswitch.h>
#include <utils.h>
// Prepares a classical PBS run: computes *payload_modulus and *delta,
// generates the input/output secret keys and the bootstrapping key, generates
// plaintexts, encrypts repetitions * samples * number_of_inputs input
// ciphertexts and uploads them together with an identity LUT and the
// input/output index arrays. Every double-pointer argument is an output.
void bootstrap_classical_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
int polynomial_size, DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
// Releases what bootstrap_classical_setup created: free() for the secret keys
// and plaintexts, cuda_drop_async for the d_* buffers, then synchronizes and
// releases the stream.
void bootstrap_classical_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
uint64_t *d_lwe_output_indexes);
// Multi-bit counterpart of bootstrap_classical_setup. It additionally selects
// the stream's GPU and allocates the PBS scratch buffer into *pbs_buffer via
// scratch_cuda_multi_bit_pbs_64. chunk_size is forwarded to that call.
void bootstrap_multibit_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor,
DynamicDistribution lwe_noise_distribution,
DynamicDistribution glwe_noise_distribution, int pbs_base_log,
int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int chunk_size = 0);
// Releases what bootstrap_multibit_setup created, except the PBS scratch
// buffer: it is not a parameter here, so the caller has to clean it up.
void bootstrap_multibit_teardown(
cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes);
// Prepares a keyswitch run: computes *payload_modulus and *delta, generates
// both secret keys and the keyswitch key, generates plaintexts, encrypts the
// input ciphertexts and uploads them with the input/output index arrays.
void keyswitch_setup(
cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
int input_lwe_dimension, int output_lwe_dimension,
DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples);
// Releases what keyswitch_setup created, then synchronizes and releases the
// stream. Both index arrays are released with cuda_drop_async.
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *lwe_input_indexes,
uint64_t *d_lwe_ct_out_array,
uint64_t *lwe_output_indexes);
// Allocates and fills the FFT test data: two arrays of samples *
// polynomial_size random doubles in [-1, 1), their packed complex form on the
// host (h_cpoly*) and a copy of that packed form on the device (d_cpoly*).
void fft_setup(cuda_stream_t *stream, double **poly1, double **poly2,
double2 **h_cpoly1, double2 **h_cpoly2, double2 **d_cpoly1,
double2 **d_cpoly2, size_t polynomial_size, int samples);
// Releases what fft_setup created, then synchronizes and releases the stream.
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
double2 *d_cpoly2);
#endif // SETUP_AND_TEARDOWN_H

View File

@@ -0,0 +1,54 @@
// Declarations of the helpers used by the setup functions to generate seeds,
// plaintexts, look-up tables and keys. The definitions are not part of this
// header; comments below describe only what the declarations and the visible
// callers in setup_and_teardown.cpp establish.
#ifndef UTILS_H
#define UTILS_H
#include <device.h>
#include <functional>
#include <tfhe.h>
// 128-bit seed split into two 64-bit halves; callers pass seed->lo and
// seed->hi to core_crypto_lwe_encrypt.
typedef struct Seed {
uint64_t lo;
uint64_t hi;
} Seed;
// Initializes *seed. Callers invoke it once on a fresh Seed.
void init_seed(Seed *seed);
// Updates *seed in place. Callers invoke it before each key generation step
// and after every encryption, presumably to derive a new seed each time.
void shuffle_seed(Seed *seed);
// Returns an array of plaintexts that callers index as
// [r * samples * number_of_inputs + s * number_of_inputs + i] and release
// with free().
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples);
// Returns a host look-up table for `func`. Callers copy
// (glwe_dimension + 1) * polynomial_size uint64_t values from it to the
// device and release it with free().
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func);
// Writes a newly allocated key array to *lwe_sk_array. Callers address the
// key of repetition r at offset r * lwe_dimension and release the array with
// free().
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Seed *seed, const unsigned repetitions);
// GLWE counterpart of generate_lwe_secret_keys (not called from the visible
// setup code).
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Seed *seed,
const unsigned repetitions);
// Writes the bootstrapping key buffer to *d_fourier_bsk_array; the classical
// teardown releases it with cuda_drop_async.
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
int pbs_level, int pbs_base_log, Seed *seed,
DynamicDistribution noise_distribution,
const unsigned repetitions);
// Writes the multi-bit bootstrapping key buffer to *d_bsk_array; the multi-bit
// teardown releases it with cuda_drop_async.
// NOTE(review): bootstrap_multibit_setup passes (grouping_factor, pbs_level,
// pbs_base_log) in the three positions named (pbs_level, pbs_base_log,
// grouping_factor) here. Confirm which order the definition uses and align
// the parameter names or the call site accordingly.
void generate_lwe_multi_bit_pbs_keys(
cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
int polynomial_size, int pbs_level, int pbs_base_log, int grouping_factor,
Seed *seed, DynamicDistribution noise_distribution,
const unsigned repetitions);
// Writes the keyswitch key buffer to *d_ksk_array; the keyswitch teardown
// releases it with cuda_drop_async.
void generate_lwe_keyswitch_keys(
cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int input_lwe_dimension,
int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
DynamicDistribution noise_distribution, const unsigned repetitions);
#endif

View File

@@ -0,0 +1,438 @@
#include <cmath>
#include <random>
#include <setup_and_teardown.h>
// Prepares everything a classical PBS run needs.
//
// Outputs (all through pointer arguments):
//   - *payload_modulus = message_modulus * carry_modulus and *delta, the
//     factor messages are scaled by;
//   - *lwe_sk_in_array, *lwe_sk_out_array, *plaintexts: arrays the teardown
//     releases with free();
//   - *d_fourier_bsk_array and the other d_* buffers: device memory the
//     teardown releases with cuda_drop_async.
//
// repetitions * samples * number_of_inputs ciphertexts are encrypted under
// the input key of their repetition and uploaded; the output array holds
// number_of_inputs ciphertexts. The stream is synchronized before returning.
void bootstrap_classical_setup(
    cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
    uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
    uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
    uint64_t **d_lwe_input_indexes, uint64_t **d_lwe_ct_out_array,
    uint64_t **d_lwe_output_indexes, int lwe_dimension, int glwe_dimension,
    int polynomial_size, DynamicDistribution lwe_noise_distribution,
    DynamicDistribution glwe_noise_distribution, int pbs_base_log,
    int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
    uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);

  // All element counts and byte sizes are computed in size_t. The previous
  // code multiplied plain ints and only converted the result, which can
  // overflow for large parameter combinations.
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t total_inputs =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t input_bytes = lwe_size * total_inputs * sizeof(uint64_t);
  const size_t output_bytes = ((size_t)glwe_dimension * polynomial_size + 1) *
                              number_of_inputs * sizeof(uint64_t);
  const size_t lut_bytes =
      ((size_t)glwe_dimension + 1) * polynomial_size * sizeof(uint64_t);
  const size_t index_bytes = (size_t)number_of_inputs * sizeof(uint64_t);

  // Generate the keys
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
                           seed, repetitions);
  shuffle_seed(seed);
  generate_lwe_bootstrap_keys(stream, d_fourier_bsk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, lwe_dimension, glwe_dimension,
                              polynomial_size, pbs_level, pbs_base_log, seed,
                              glwe_noise_distribution, repetitions);
  shuffle_seed(seed);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Create the LUT. The lambda uses the same types as the std::function
  // parameter so the identity does not go through a narrowing int conversion.
  uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
      polynomial_size, glwe_dimension, message_modulus, carry_modulus,
      [](uint64_t x) -> uint64_t { return x; });

  // Create the input ciphertexts on the host
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(input_bytes);
  for (int r = 0; r < repetitions; r++) {
    // Every repetition uses its own input secret key.
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t input_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[input_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + input_index * lwe_size;
        core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
                                lwe_noise_distribution, seed->lo, seed->hi);
        shuffle_seed(seed);
      }
    }
  }

  // Initialize and copy things in/to the device
  *d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(lut_bytes, stream);
  cuda_memcpy_async_to_gpu(*d_lut_pbs_identity, lut_pbs_identity, lut_bytes,
                           stream);
  *d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  cuda_memset_async(*d_lut_pbs_indexes, 0, index_bytes, stream);

  // Input and output LWEs
  *d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(output_bytes, stream);
  *d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(input_bytes, stream);
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array, input_bytes,
                           stream);

  // Input and output indexes are both 0, 1, ..., number_of_inputs - 1.
  uint64_t *h_lwe_indexes = (uint64_t *)malloc(index_bytes);
  *d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  *d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  for (int i = 0; i < number_of_inputs; i++)
    h_lwe_indexes[i] = i;
  cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes, index_bytes,
                           stream);
  cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes, index_bytes,
                           stream);

  // The host staging buffers may only be freed once the copies have finished.
  stream->synchronize();
  free(lwe_ct_in_array);
  free(lut_pbs_identity);
  free(h_lwe_indexes);
}
// Releases what bootstrap_classical_setup created: waits for the stream,
// frees the secret keys and plaintexts with free(), drops the device buffers,
// then synchronizes and releases the stream.
void bootstrap_classical_teardown(
    cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
    uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
    uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
    uint64_t *d_lwe_input_indexes, uint64_t *d_lwe_ct_out_array,
    uint64_t *d_lwe_output_indexes) {
  cuda_synchronize_stream(stream);

  uint64_t *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_buffer : host_buffers)
    free(host_buffer);

  // Dropped in the same order as before.
  void *device_buffers[] = {d_fourier_bsk_array, d_lut_pbs_identity,
                            d_lut_pbs_indexes,   d_lwe_ct_in_array,
                            d_lwe_ct_out_array,  d_lwe_input_indexes,
                            d_lwe_output_indexes};
  for (void *device_buffer : device_buffers)
    cuda_drop_async(device_buffer, stream);

  stream->synchronize();
  stream->release();
}
// Prepares everything a multi-bit PBS run needs.
//
// Outputs (all through pointer arguments):
//   - *payload_modulus = message_modulus * carry_modulus and *delta, the
//     factor messages are scaled by;
//   - *lwe_sk_in_array, *lwe_sk_out_array, *plaintexts: arrays the teardown
//     releases with free();
//   - *d_bsk_array and the other d_* buffers: device memory the teardown
//     releases with cuda_drop_async;
//   - *pbs_buffer: scratch buffer allocated by scratch_cuda_multi_bit_pbs_64.
//     The teardown does not take it, so the caller must clean it up.
//
// repetitions * samples * number_of_inputs ciphertexts are encrypted under
// the input key of their repetition and uploaded; the output array holds
// number_of_inputs ciphertexts. The stream is synchronized before returning.
void bootstrap_multibit_setup(
    cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
    uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
    uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
    uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
    int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
    int polynomial_size, int grouping_factor,
    DynamicDistribution lwe_noise_distribution,
    DynamicDistribution glwe_noise_distribution, int pbs_base_log,
    int pbs_level, int message_modulus, int carry_modulus, int *payload_modulus,
    uint64_t *delta, int number_of_inputs, int repetitions, int samples,
    int lwe_chunk_size) {
  cudaSetDevice(stream->gpu_index);

  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);

  // All element counts and byte sizes are computed in size_t. The previous
  // code multiplied plain ints and only converted the result, which can
  // overflow for large parameter combinations.
  const size_t lwe_size = (size_t)lwe_dimension + 1;
  const size_t total_inputs =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t input_bytes = lwe_size * total_inputs * sizeof(uint64_t);
  const size_t output_bytes = ((size_t)glwe_dimension * polynomial_size + 1) *
                              number_of_inputs * sizeof(uint64_t);
  const size_t lut_bytes =
      ((size_t)glwe_dimension + 1) * polynomial_size * sizeof(uint64_t);
  const size_t index_bytes = (size_t)number_of_inputs * sizeof(uint64_t);

  // Generate the keys
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_in_array, lwe_dimension, seed, repetitions);
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_out_array, glwe_dimension * polynomial_size,
                           seed, repetitions);
  shuffle_seed(seed);
  // NOTE(review): the arguments are passed as (grouping_factor, pbs_level,
  // pbs_base_log) while the declaration in utils.h names these positions
  // (pbs_level, pbs_base_log, grouping_factor). Confirm against the
  // definition which order is intended.
  generate_lwe_multi_bit_pbs_keys(
      stream, d_bsk_array, *lwe_sk_in_array, *lwe_sk_out_array, lwe_dimension,
      glwe_dimension, polynomial_size, grouping_factor, pbs_level, pbs_base_log,
      seed, glwe_noise_distribution, repetitions);
  shuffle_seed(seed);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Create the LUT. The lambda uses the same types as the std::function
  // parameter so the identity does not go through a narrowing int conversion.
  uint64_t *lut_pbs_identity = generate_identity_lut_pbs(
      polynomial_size, glwe_dimension, message_modulus, carry_modulus,
      [](uint64_t x) -> uint64_t { return x; });

  // Create the input ciphertexts on the host
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(input_bytes);
  for (int r = 0; r < repetitions; r++) {
    // Every repetition uses its own input secret key.
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t input_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[input_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + input_index * lwe_size;
        core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in, lwe_dimension,
                                lwe_noise_distribution, seed->lo, seed->hi);
        shuffle_seed(seed);
      }
    }
  }

  // Initialize and copy things in/to the device
  *d_lut_pbs_identity = (uint64_t *)cuda_malloc_async(lut_bytes, stream);
  cuda_memcpy_async_to_gpu(*d_lut_pbs_identity, lut_pbs_identity, lut_bytes,
                           stream);
  *d_lut_pbs_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  cuda_memset_async(*d_lut_pbs_indexes, 0, index_bytes, stream);

  // Input and output LWEs
  *d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(output_bytes, stream);
  *d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(input_bytes, stream);
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array, input_bytes,
                           stream);

  // Input and output indexes are both 0, 1, ..., number_of_inputs - 1.
  uint64_t *h_lwe_indexes = (uint64_t *)malloc(index_bytes);
  *d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  *d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  for (int i = 0; i < number_of_inputs; i++)
    h_lwe_indexes[i] = i;
  cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes, index_bytes,
                           stream);
  cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes, index_bytes,
                           stream);

  scratch_cuda_multi_bit_pbs_64(
      stream, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
      pbs_level, grouping_factor, number_of_inputs,
      cuda_get_max_shared_memory(stream->gpu_index), true, lwe_chunk_size);

  // The host staging buffers may only be freed once the copies have finished.
  stream->synchronize();
  free(h_lwe_indexes);
  free(lut_pbs_identity);
  free(lwe_ct_in_array);
}
// Releases what bootstrap_multibit_setup created, except the PBS scratch
// buffer (it is not a parameter of this function): waits for the stream,
// frees the secret keys and plaintexts with free(), drops the device buffers,
// then synchronizes and releases the stream.
void bootstrap_multibit_teardown(
    cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
    uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
    uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_input_indexes,
    uint64_t *d_lwe_ct_out_array, uint64_t *d_lwe_output_indexes) {
  cuda_synchronize_stream(stream);

  uint64_t *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_buffer : host_buffers)
    free(host_buffer);

  // Dropped in the same order as before.
  uint64_t *device_buffers[] = {d_bsk_array,         d_lut_pbs_identity,
                                d_lut_pbs_indexes,   d_lwe_ct_in_array,
                                d_lwe_ct_out_array,  d_lwe_input_indexes,
                                d_lwe_output_indexes};
  for (uint64_t *device_buffer : device_buffers)
    cuda_drop_async(device_buffer, stream);

  stream->synchronize();
  stream->release();
}
// Prepares everything a keyswitch run needs.
//
// Outputs (all through pointer arguments):
//   - *payload_modulus = message_modulus * carry_modulus and *delta, the
//     factor messages are scaled by;
//   - *lwe_sk_in_array, *lwe_sk_out_array, *plaintexts: arrays the teardown
//     releases with free();
//   - *d_ksk_array and the other d_* buffers: device memory the teardown
//     releases with cuda_drop_async.
//
// repetitions * samples * number_of_inputs ciphertexts are encrypted under
// the input key of their repetition and uploaded; the output array holds
// number_of_inputs ciphertexts. The stream is synchronized before returning.
void keyswitch_setup(
    cuda_stream_t *stream, Seed *seed, uint64_t **lwe_sk_in_array,
    uint64_t **lwe_sk_out_array, uint64_t **d_ksk_array, uint64_t **plaintexts,
    uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_input_indexes,
    uint64_t **d_lwe_ct_out_array, uint64_t **d_lwe_output_indexes,
    int input_lwe_dimension, int output_lwe_dimension,
    DynamicDistribution lwe_noise_distribution, int ksk_base_log, int ksk_level,
    int message_modulus, int carry_modulus, int *payload_modulus,
    uint64_t *delta, int number_of_inputs, int repetitions, int samples) {
  *payload_modulus = message_modulus * carry_modulus;
  // Value of the shift we multiply our messages by
  *delta = ((uint64_t)(1) << 63) / (uint64_t)(*payload_modulus);

  // All element counts and byte sizes are computed in size_t. The previous
  // code multiplied plain ints and only converted the result, which can
  // overflow for large parameter combinations.
  const size_t input_lwe_size = (size_t)input_lwe_dimension + 1;
  const size_t output_lwe_size = (size_t)output_lwe_dimension + 1;
  const size_t total_inputs =
      (size_t)repetitions * samples * number_of_inputs;
  const size_t input_bytes = input_lwe_size * total_inputs * sizeof(uint64_t);
  const size_t output_bytes =
      output_lwe_size * number_of_inputs * sizeof(uint64_t);
  const size_t index_bytes = (size_t)number_of_inputs * sizeof(uint64_t);

  // Generate the keys
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_in_array, input_lwe_dimension, seed,
                           repetitions);
  shuffle_seed(seed);
  generate_lwe_secret_keys(lwe_sk_out_array, output_lwe_dimension, seed,
                           repetitions);
  shuffle_seed(seed);
  generate_lwe_keyswitch_keys(stream, d_ksk_array, *lwe_sk_in_array,
                              *lwe_sk_out_array, input_lwe_dimension,
                              output_lwe_dimension, ksk_level, ksk_base_log,
                              seed, lwe_noise_distribution, repetitions);
  shuffle_seed(seed);
  *plaintexts = generate_plaintexts(*payload_modulus, *delta, number_of_inputs,
                                    repetitions, samples);

  // Input and output LWEs on the device
  *d_lwe_ct_out_array = (uint64_t *)cuda_malloc_async(output_bytes, stream);
  *d_lwe_ct_in_array = (uint64_t *)cuda_malloc_async(input_bytes, stream);

  // Create the input ciphertexts on the host
  uint64_t *lwe_ct_in_array = (uint64_t *)malloc(input_bytes);
  for (int r = 0; r < repetitions; r++) {
    // Every repetition uses its own input secret key.
    uint64_t *lwe_sk_in = *lwe_sk_in_array + (size_t)r * input_lwe_dimension;
    for (int s = 0; s < samples; s++) {
      for (int i = 0; i < number_of_inputs; i++) {
        const size_t input_index =
            ((size_t)r * samples + s) * number_of_inputs + i;
        uint64_t plaintext = (*plaintexts)[input_index];
        uint64_t *lwe_ct_in = lwe_ct_in_array + input_index * input_lwe_size;
        core_crypto_lwe_encrypt(lwe_ct_in, plaintext, lwe_sk_in,
                                input_lwe_dimension, lwe_noise_distribution,
                                seed->lo, seed->hi);
        shuffle_seed(seed);
      }
    }
  }
  cuda_memcpy_async_to_gpu(*d_lwe_ct_in_array, lwe_ct_in_array, input_bytes,
                           stream);
  stream->synchronize();

  // Input and output indexes are both 0, 1, ..., number_of_inputs - 1.
  uint64_t *h_lwe_indexes = (uint64_t *)malloc(index_bytes);
  *d_lwe_input_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  *d_lwe_output_indexes = (uint64_t *)cuda_malloc_async(index_bytes, stream);
  for (int i = 0; i < number_of_inputs; i++)
    h_lwe_indexes[i] = i;
  cuda_memcpy_async_to_gpu(*d_lwe_input_indexes, h_lwe_indexes, index_bytes,
                           stream);
  cuda_memcpy_async_to_gpu(*d_lwe_output_indexes, h_lwe_indexes, index_bytes,
                           stream);

  // The host staging buffers may only be freed once the copies have finished.
  cuda_synchronize_stream(stream);
  free(h_lwe_indexes);
  free(lwe_ct_in_array);
}
// Releases what keyswitch_setup created: waits for the stream, frees the
// secret keys and plaintexts with free(), drops the device buffers, then
// synchronizes and releases the stream.
void keyswitch_teardown(cuda_stream_t *stream, uint64_t *lwe_sk_in_array,
                        uint64_t *lwe_sk_out_array, uint64_t *d_ksk_array,
                        uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
                        uint64_t *d_lwe_input_indexes,
                        uint64_t *d_lwe_ct_out_array,
                        uint64_t *d_lwe_output_indexes) {
  cuda_synchronize_stream(stream);

  uint64_t *host_buffers[] = {lwe_sk_in_array, lwe_sk_out_array, plaintexts};
  for (uint64_t *host_buffer : host_buffers)
    free(host_buffer);

  // Dropped in the same order as before.
  uint64_t *device_buffers[] = {d_ksk_array, d_lwe_ct_in_array,
                                d_lwe_ct_out_array, d_lwe_input_indexes,
                                d_lwe_output_indexes};
  for (uint64_t *device_buffer : device_buffers)
    cuda_drop_async(device_buffer, stream);

  stream->synchronize();
  stream->release();
}
// Allocates and fills the data for the FFT runs.
//
// *_poly1 / *_poly2 receive `samples` polynomials of polynomial_size random
// doubles drawn from a uniform distribution over [-1, 1) with a
// default-constructed engine. *_h_cpoly1 / *_h_cpoly2 receive the same data
// packed so that complex element i of a polynomial holds coefficient i in .x
// and coefficient i + polynomial_size / 2 in .y. *_d_cpoly1 / *_d_cpoly2
// receive copies of the packed data in memory obtained from
// cuda_malloc_async. The stream is synchronized before returning.
void fft_setup(cuda_stream_t *stream, double **_poly1, double **_poly2,
               double2 **_h_cpoly1, double2 **_h_cpoly2, double2 **_d_cpoly1,
               double2 **_d_cpoly2, size_t polynomial_size, int samples) {
  const size_t half_size = polynomial_size / 2;
  const size_t real_bytes = polynomial_size * samples * sizeof(double);
  const size_t complex_bytes = half_size * samples * sizeof(double2);

  // Allocate the outputs in the same order as before.
  *_poly1 = (double *)malloc(real_bytes);
  *_poly2 = (double *)malloc(real_bytes);
  *_h_cpoly1 = (double2 *)malloc(complex_bytes);
  *_h_cpoly2 = (double2 *)malloc(complex_bytes);
  *_d_cpoly1 = (double2 *)cuda_malloc_async(complex_bytes, stream);
  *_d_cpoly2 = (double2 *)cuda_malloc_async(complex_bytes, stream);

  double *reals1 = *_poly1;
  double *reals2 = *_poly2;
  double2 *packed1 = *_h_cpoly1;
  double2 *packed2 = *_h_cpoly2;

  // Fill test data with random values, alternating between both arrays.
  std::uniform_real_distribution<double> unif(-1.0, 1.0);
  std::default_random_engine re;
  for (size_t i = 0; i < polynomial_size * samples; i++) {
    reals1[i] = unif(re);
    reals2[i] = unif(re);
  }

  // Pack every polynomial: first half -> real parts, second half -> imaginary
  // parts.
  for (size_t p = 0; p < (size_t)samples; p++) {
    double2 *dst1 = &packed1[p * polynomial_size / 2];
    double2 *dst2 = &packed2[p * polynomial_size / 2];
    const double *src1 = &reals1[p * polynomial_size];
    const double *src2 = &reals2[p * polynomial_size];
    for (std::size_t i = 0; i < half_size; ++i) {
      dst1[i].x = src1[i];
      dst1[i].y = src1[i + half_size];
      dst2[i].x = src2[i];
      dst2[i].y = src2[i + half_size];
    }
  }

  // copy memory cpu->gpu
  cuda_memcpy_async_to_gpu(*_d_cpoly1, packed1, complex_bytes, stream);
  cuda_memcpy_async_to_gpu(*_d_cpoly2, packed2, complex_bytes, stream);
  stream->synchronize();
}
// Releases what fft_setup created: waits for the stream, frees the four host
// arrays with free(), drops the two device arrays, then synchronizes and
// releases the stream.
void fft_teardown(cuda_stream_t *stream, double *poly1, double *poly2,
                  double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
                  double2 *d_cpoly2) {
  stream->synchronize();

  void *host_buffers[] = {poly1, poly2, h_cpoly1, h_cpoly2};
  for (void *host_buffer : host_buffers)
    free(host_buffer);

  double2 *device_buffers[] = {d_cpoly1, d_cpoly2};
  for (double2 *device_buffer : device_buffers)
    cuda_drop_async(device_buffer, stream);

  stream->synchronize();
  stream->release();
}

View File

@@ -0,0 +1,81 @@
# Build script for the test_tfhe_cuda_backend GoogleTest executable.
project(test_tfhe_cuda_backend LANGUAGES CXX)
# NOTE(review): this variable is not referenced anywhere else in this file, so
# no minimum CUDA version is enforced here. Use it in a version check or
# remove it.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable the CUDA language.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif()
# A CUDA compiler is mandatory: configuration stops here without one.
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "Cuda compiler not found.")
endif()
# Fetch GoogleTest at a pinned commit.
include(FetchContent)
FetchContent_Declare(googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
# NOTE(review): the tfhe-rs checkout is located relative to CMAKE_BINARY_DIR,
# so it only resolves when the build directory sits at the depth this path
# assumes. Confirm against the documented build layout.
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/../../../../")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
# Build the tfhe-rs C API in its own source tree with `make build_c_api`,
# unless another CMake file already defined the tfhe-rs target.
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
BUILD_ALWAYS 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
DOWNLOAD_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
# Header search paths: shared test/benchmark headers, the CUDA backend
# headers, the tfhe-rs build output and the CUDA toolkit.
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories(${TFHE_RS_BINARY_DIR}/deps)
include_directories("${CMAKE_CURRENT_SOURCE_DIR}")
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}")
# Import the static library produced by the tfhe-rs external project.
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed -ldl")
set(BINARY test_tfhe_cuda_backend)
# Every test_*.cpp below this directory is compiled into the test binary,
# together with the shared utils and setup/teardown sources.
file(
GLOB_RECURSE TEST_SOURCES
LIST_DIRECTORIES false
test_*.cpp)
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
add_test(NAME ${BINARY} COMMAND ${BINARY})
set_target_properties(
${BINARY}
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(${BINARY} PUBLIC GTest::gtest_main tfhe_rs_lib tfhe_cuda_backend cudart)
# Register each GoogleTest case with CTest.
include(GoogleTest)
gtest_discover_tests(${BINARY})

View File

@@ -0,0 +1,61 @@
# test_tfhe_cuda_backend
This test tool is built on top of the GoogleTest library. It tests the correctness of basic
cryptographic primitives accelerated using CUDA and helps identify arithmetic flaws.
The output format can be adjusted according to the user's interest.
For each test case, a particular function is executed and its result is verified against the expected behavior. This is repeated for multiple encryption keys and for multiple samples per key. These counts can be modified by changing the `REPETITIONS` and `SAMPLES` constants at the beginning of a test file or, for tests that carry them in their parameter sets, the `repetitions` and `samples` values of each parameter set.
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory outside of the source directory, but inside the
implementation folder, to keep the source directory clean.
```bash
$ cd tfhe-rs/backends/tfhe-cuda-backend/cuda
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary will be found in
`tfhe-rs/backends/tfhe-cuda-backend/cuda/build/tests/src`.
## How to Run Tests
To run tests, you can simply execute the `test_tfhe_cuda_backend` executable with no arguments:
```bash
$ tests/src/test_tfhe_cuda_backend
```
This will run all the available tests.
## How to Filter Tests
You can select a subset of tests by specifying a filter for the name of the tests of interest as
an argument. Only tests whose full name matches the filter will be executed.
For example, to run only tests whose name starts with the word "Bootstrap", you can execute:
```bash
$ tests/src/test_tfhe_cuda_backend --gtest_filter=Bootstrap*
```
The parameter `--gtest_list_tests` can be used to list all the available tests, and a better
description on how to select a subset of tests can be found in
[GoogleTest documentation](http://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests).
## Conclusion
With these options, you can easily verify the correctness of the tfhe-cuda-backend implementations. If
you have any questions or issues, please feel free to contact us.
To learn more about the GoogleTest library, please refer to the [official user guide](http://google.github.io/googletest/).

View File

@@ -0,0 +1,387 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
// One parameter set for the classical PBS tests.
// Instances are built with positional aggregate initialization, so the field
// order below must not change.
struct ClassicalBootstrapTestParams {
  int lwe_dimension;   // "n" in the parameter table
  int glwe_dimension;  // "k" in the parameter table
  int polynomial_size; // "N" in the parameter table
  DynamicDistribution lwe_noise_distribution;
  DynamicDistribution glwe_noise_distribution;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs; // ciphertexts processed by a single PBS call
  int repetitions;      // outer test loop count (one bootstrapping key each)
  int samples;          // inner test loop count (batches per repetition)
};
// Fixture shared by the classical PBS tests on 64-bit ciphertexts.
// SetUp() copies the active parameter set into members, lets
// bootstrap_classical_setup() prepare keys, plaintexts, LUT and ciphertext
// buffers, and allocates a host buffer for one batch of output ciphertexts.
// TearDown() releases everything again.
class ClassicalBootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<ClassicalBootstrapTestParams> {
protected:
  // Values copied from the active ClassicalBootstrapTestParams
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  DynamicDistribution lwe_noise_distribution;
  DynamicDistribution glwe_noise_distribution;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  // Written by bootstrap_classical_setup()
  int payload_modulus;
  int number_of_inputs;
  int repetitions;
  int samples;
  // Written by bootstrap_classical_setup(); used by the tests to decode
  uint64_t delta;
  cuda_stream_t *stream;
  int gpu_index = 0;
  // Buffers filled in by bootstrap_classical_setup(). The d_ prefix
  // presumably marks device memory -- confirm in setup_and_teardown.
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  double *d_fourier_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *d_lwe_input_indexes;
  uint64_t *d_lwe_output_indexes;
  // Host buffer receiving one batch of output ciphertexts
  uint64_t *lwe_ct_out_array;

public:
  // Prepares the stream, the test data and the host output buffer.
  void SetUp() override {
    stream = cuda_create_stream(gpu_index);

    const ClassicalBootstrapTestParams &params = GetParam();
    lwe_dimension = params.lwe_dimension;
    glwe_dimension = params.glwe_dimension;
    polynomial_size = params.polynomial_size;
    lwe_noise_distribution = params.lwe_noise_distribution;
    glwe_noise_distribution = params.glwe_noise_distribution;
    pbs_base_log = params.pbs_base_log;
    pbs_level = params.pbs_level;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;
    repetitions = params.repetitions;
    samples = params.samples;

    Seed seed;
    init_seed(&seed);

    bootstrap_classical_setup(
        stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
        &d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
        &d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_input_indexes,
        &d_lwe_ct_out_array, &d_lwe_output_indexes, lwe_dimension,
        glwe_dimension, polynomial_size, lwe_noise_distribution,
        glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
        carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
        samples);

    // One output ciphertext holds glwe_dimension * polynomial_size + 1 words.
    lwe_ct_out_array =
        (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                           number_of_inputs * sizeof(uint64_t));
  }

  // Frees the host output buffer, then everything the setup helper created.
  void TearDown() override {
    free(lwe_ct_out_array);
    bootstrap_classical_teardown(
        stream, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
        plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
  }
};
// Runs the amortized PBS on every batch of input ciphertexts using the LUT
// prepared by the fixture (d_lut_pbs_identity) and checks that each output
// decrypts and decodes to the message of the corresponding input plaintext.
TEST_P(ClassicalBootstrapTestPrimitives_u64, amortized_bootstrap) {
  int8_t *pbs_buffer;
  scratch_cuda_bootstrap_amortized_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, number_of_inputs,
      cuda_get_max_shared_memory(gpu_index), true);

  // Element counts are kept in ptrdiff_t so that the pointer offsets below
  // are never computed in (overflow-prone) int arithmetic.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);
  const int lwe_out_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t lwe_out_size = (ptrdiff_t)lwe_out_dimension + 1;
  const ptrdiff_t lwe_in_size = (ptrdiff_t)lwe_dimension + 1;
  // The bit right below the message, used to round the decryption
  const uint64_t rounding_bit = delta >> 1;

  for (int r = 0; r < repetitions; r++) {
    // Each repetition has its own bootstrapping key and output secret key
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)r * lwe_out_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first ciphertext/plaintext of batch (r, s)
      const ptrdiff_t batch_index =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_index * lwe_in_size;
      // Execute PBS
      cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
          (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               lwe_out_size * number_of_inputs *
                                   sizeof(uint64_t),
                               stream);
      // The copy is enqueued asynchronously: wait for the stream before the
      // host reads the buffer, as the FFT test does.
      cuda_synchronize_stream(stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * lwe_out_size;
        uint64_t plaintext = plaintexts[batch_index + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                lwe_out_dimension);
        // The raw decryption still carries noise, so it is expected to differ
        // from the exact encoded plaintext.
        EXPECT_NE(decrypted, plaintext);
        // Round to the nearest multiple of delta, then drop the scaling
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
  cleanup_cuda_bootstrap_amortized(stream, &pbs_buffer);
}
// Runs the low-latency PBS on every batch of input ciphertexts using the LUT
// prepared by the fixture (d_lut_pbs_identity) and checks that each output
// decrypts and decodes to the message of the corresponding input plaintext.
TEST_P(ClassicalBootstrapTestPrimitives_u64, low_latency_bootstrap) {
  int8_t *pbs_buffer;
  scratch_cuda_bootstrap_low_latency_64(
      stream, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level,
      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);

  // Element counts are kept in ptrdiff_t so that the pointer offsets below
  // are never computed in (overflow-prone) int arithmetic.
  const ptrdiff_t bsk_size = (ptrdiff_t)(glwe_dimension + 1) *
                             (glwe_dimension + 1) * pbs_level *
                             polynomial_size * (lwe_dimension + 1);
  const int lwe_out_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t lwe_out_size = (ptrdiff_t)lwe_out_dimension + 1;
  const ptrdiff_t lwe_in_size = (ptrdiff_t)lwe_dimension + 1;
  // The bit right below the message, used to round the decryption
  const uint64_t rounding_bit = delta >> 1;

  for (int r = 0; r < repetitions; r++) {
    // Each repetition has its own bootstrapping key and output secret key
    double *d_fourier_bsk = d_fourier_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)r * lwe_out_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first ciphertext/plaintext of batch (r, s)
      const ptrdiff_t batch_index =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_index * lwe_in_size;
      // Execute PBS
      cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes,
          (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension,
          polynomial_size, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result back
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               lwe_out_size * number_of_inputs *
                                   sizeof(uint64_t),
                               stream);
      // The copy is enqueued asynchronously: wait for the stream before the
      // host reads the buffer, as the FFT test does.
      cuda_synchronize_stream(stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * lwe_out_size;
        uint64_t plaintext = plaintexts[batch_index + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                lwe_out_dimension);
        // The raw decryption still carries noise, so it is expected to differ
        // from the exact encoded plaintext.
        EXPECT_NE(decrypted, plaintext);
        // Round to the nearest multiple of delta, then drop the scaling
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s;
      }
    }
  }
  cleanup_cuda_bootstrap_low_latency_64(stream, &pbs_buffer);
}
// Defines the parameter sets for which the PBS will be tested.
// Every TEST_P of the fixture is executed once per parameter set listed below.
::testing::internal::ParamGenerator<ClassicalBootstrapTestParams>
pbs_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
2, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
2, 1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
2, 1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 2, 1, 5},
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
100, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
100, 1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
100, 1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 100, 1, 5});
// Builds the test-name suffix for one classical PBS parameter set from its
// dimensions, decomposition parameters and batch size.
std::string
printParamName(::testing::TestParamInfo<ClassicalBootstrapTestParams> p) {
  const ClassicalBootstrapTestParams &params = p.param;
  std::string name = "n_";
  name += std::to_string(params.lwe_dimension);
  name += "_k_";
  name += std::to_string(params.glwe_dimension);
  name += "_N_";
  name += std::to_string(params.polynomial_size);
  name += "_pbs_base_log_";
  name += std::to_string(params.pbs_base_log);
  name += "_pbs_level_";
  name += std::to_string(params.pbs_level);
  name += "_number_of_inputs_";
  name += std::to_string(params.number_of_inputs);
  return name;
}
// Instantiates every TEST_P of ClassicalBootstrapTestPrimitives_u64 once per
// entry of pbs_params_u64; printParamName supplies the test-name suffix.
// INSTANTIATE_TEST_SUITE_P is the current spelling of the deprecated
// INSTANTIATE_TEST_CASE_P macro.
INSTANTIATE_TEST_SUITE_P(ClassicalBootstrapInstantiation,
                         ClassicalBootstrapTestPrimitives_u64, pbs_params_u64,
                         printParamName);

View File

@@ -0,0 +1,127 @@
#include "utils.h"
#include "gtest/gtest.h"
#include <bootstrap.h>
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// One parameter set for the FFT multiplication test.
// Instances are built with positional aggregate initialization, so the field
// order below must not change.
struct FourierTransformTestParams {
  size_t polynomial_size; // number of coefficients of each polynomial
  int samples;            // number of polynomial pairs multiplied per test
};
// Fixture for the Fourier-domain polynomial multiplication test.
// SetUp() lets fft_setup() prepare the input polynomials and computes the
// expected negacyclic products on the CPU with schoolbook multiplication.
class FourierTransformTestPrimitives_u64
    : public ::testing::TestWithParam<FourierTransformTestParams> {
protected:
  size_t polynomial_size;
  int samples;
  cuda_stream_t *stream;
  int gpu_index = 0;
  double *poly1;
  double *poly2; // will be used as extracted result for cuda mult
  // Expected products: 2 * polynomial_size doubles per sample, of which the
  // first polynomial_size hold the negacyclic result
  double *poly_exp_result;
  double2 *h_cpoly1;
  double2 *h_cpoly2; // will be used as a result poly
  double2 *d_cpoly1;
  double2 *d_cpoly2; // will be used as a result poly

public:
  // Prepares the inputs and the CPU reference result.
  void SetUp() override {
    stream = cuda_create_stream(gpu_index);

    // Read the parameter set (no narrowing cast: polynomial_size is size_t)
    polynomial_size = GetParam().polynomial_size;
    samples = GetParam().samples;

    fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
              &d_cpoly2, polynomial_size, samples);

    // Zero-initialized accumulator for the schoolbook products; calloc
    // replaces the former malloc + memset(ptr, 0., ...) pair.
    poly_exp_result =
        (double *)calloc(polynomial_size * 2 * samples, sizeof(double));
    ASSERT_TRUE(poly_exp_result != nullptr);

    // Schoolbook multiplication, one polynomial pair per sample
    for (size_t p = 0; p < (size_t)samples; p++) {
      auto left = &poly1[p * polynomial_size];
      auto right = &poly2[p * polynomial_size];
      auto res = &poly_exp_result[p * polynomial_size * 2];
      // Full product of degree < 2 * polynomial_size - 1
      for (std::size_t i = 0; i < polynomial_size; ++i) {
        for (std::size_t j = 0; j < polynomial_size; ++j) {
          res[i + j] += left[i] * right[j];
        }
      }
      // Make the result negacyclic: fold the upper half back with a sign flip
      for (size_t i = 0; i < polynomial_size; i++) {
        res[i] = res[i] - res[i + polynomial_size];
      }
    }
  }

  // Releases what fft_setup() created, then the CPU reference buffer.
  void TearDown() override {
    fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2);
    free(poly_exp_result);
  }
};
// Multiplies the prepared polynomial pairs with cuda_fourier_polynomial_mul
// and compares the result, coefficient by coefficient, with the schoolbook
// products computed in SetUp().
TEST_P(FourierTransformTestPrimitives_u64, cuda_fft_mult) {
  double2 *lhs = d_cpoly1;
  double2 *rhs_and_result = d_cpoly2;
  double2 *host_result = h_cpoly2;
  double *unpacked = poly2;
  double *expected = poly_exp_result;

  // d_cpoly2 is passed as both second and third argument; the result is read
  // back from it below.
  cuda_fourier_polynomial_mul(lhs, rhs_and_result, rhs_and_result, stream,
                              polynomial_size, samples);
  cuda_memcpy_async_to_cpu(host_result, rhs_and_result,
                           polynomial_size / 2 * samples * sizeof(double2),
                           stream);
  cuda_synchronize_stream(stream);

  // Unpack each double2 entry: .x goes to the lower half of the polynomial,
  // .y to the upper half
  for (int p = 0; p < samples; p++) {
    double *out_poly = unpacked + p * polynomial_size;
    for (size_t i = 0; i < (size_t)polynomial_size / 2; i++) {
      const double2 &packed = host_result[p * polynomial_size / 2 + i];
      out_poly[i] = packed.x;
      out_poly[i + polynomial_size / 2] = packed.y;
    }
  }

  // Only the first polynomial_size entries of each expected block are
  // compared
  for (size_t p = 0; p < (size_t)samples; p++) {
    const double *got = unpacked + p * polynomial_size;
    const double *want = expected + p * 2 * polynomial_size;
    for (size_t i = 0; i < (size_t)polynomial_size; i++) {
      EXPECT_NEAR(got[i], want[i], 1e-9);
    }
  }
}
// Parameter sets for the FFT multiplication test, each written as
// {polynomial_size, samples}. The larger polynomial sizes use fewer samples.
// NOTE(review): ::testing::internal is a GoogleTest implementation namespace;
// consider passing ::testing::Values(...) directly to the instantiation macro.
::testing::internal::ParamGenerator<FourierTransformTestParams> fft_params_u64 =
::testing::Values((FourierTransformTestParams){256, 100},
(FourierTransformTestParams){512, 100},
(FourierTransformTestParams){1024, 100},
(FourierTransformTestParams){2048, 100},
(FourierTransformTestParams){4096, 100},
(FourierTransformTestParams){8192, 50},
(FourierTransformTestParams){16384, 10});
// Builds the test-name suffix for one FFT parameter set from the polynomial
// size and the number of samples.
std::string
printParamName(::testing::TestParamInfo<FourierTransformTestParams> p) {
  const FourierTransformTestParams &params = p.param;
  std::string name = "N_";
  name += std::to_string(params.polynomial_size);
  name += "_samples_";
  name += std::to_string(params.samples);
  return name;
}
// Instantiates every TEST_P of FourierTransformTestPrimitives_u64 once per
// entry of fft_params_u64; printParamName supplies the test-name suffix.
// INSTANTIATE_TEST_SUITE_P is the current spelling of the deprecated
// INSTANTIATE_TEST_CASE_P macro.
INSTANTIATE_TEST_SUITE_P(fftInstantiation, FourierTransformTestPrimitives_u64,
                         fft_params_u64, printParamName);

View File

@@ -0,0 +1,162 @@
#include <cmath>
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
// Number of keyswitching keys exercised by the keyswitch test (outer loop)
constexpr unsigned int REPETITIONS = 2;
// Number of ciphertext batches tested per repetition (inner loop)
constexpr unsigned int SAMPLES = 50;
// One parameter set for the keyswitch test.
// Instances are built with positional aggregate initialization, so the field
// order below must not change.
struct KeyswitchTestParams {
  int input_lwe_dimension;  // dimension of the ciphertexts given to keyswitch
  int output_lwe_dimension; // dimension of the ciphertexts it produces
  DynamicDistribution noise_distribution;
  int ksk_base_log;
  int ksk_level;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs; // ciphertexts processed by a single keyswitch call
};
// Fixture for the keyswitch test on 64-bit ciphertexts.
// SetUp() copies the active parameter set into members and lets
// keyswitch_setup() prepare keys, plaintexts and ciphertext buffers for
// REPETITIONS x SAMPLES batches; TearDown() releases them.
class KeyswitchTestPrimitives_u64
    : public ::testing::TestWithParam<KeyswitchTestParams> {
protected:
  // Values copied from the active KeyswitchTestParams
  int input_lwe_dimension;
  int output_lwe_dimension;
  DynamicDistribution noise_distribution;
  int ksk_base_log;
  int ksk_level;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs;
  // Written by keyswitch_setup()
  int payload_modulus;
  uint64_t delta;
  cuda_stream_t *stream;
  int gpu_index = 0;
  // Buffers filled in by keyswitch_setup(). The d_ prefix presumably marks
  // device memory -- confirm in setup_and_teardown.
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  uint64_t *d_ksk_array;
  uint64_t *d_lwe_ct_out_array;
  uint64_t *d_lwe_ct_in_array;
  // Not referenced by SetUp()/TearDown(); the keyswitch test declares its own
  // local named lwe_out_ct, which shadows the member of the same name.
  uint64_t *lwe_in_ct;
  uint64_t *lwe_out_ct;
  uint64_t *lwe_input_indexes;
  uint64_t *lwe_output_indexes;

public:
  // Prepares the stream and all test data.
  void SetUp() override {
    stream = cuda_create_stream(gpu_index);

    const KeyswitchTestParams &params = GetParam();
    input_lwe_dimension = params.input_lwe_dimension;
    output_lwe_dimension = params.output_lwe_dimension;
    noise_distribution = params.noise_distribution;
    ksk_base_log = params.ksk_base_log;
    ksk_level = params.ksk_level;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;

    Seed seed;
    init_seed(&seed);

    keyswitch_setup(stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array,
                    &d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
                    &lwe_input_indexes, &d_lwe_ct_out_array,
                    &lwe_output_indexes, input_lwe_dimension,
                    output_lwe_dimension, noise_distribution, ksk_base_log,
                    ksk_level, message_modulus, carry_modulus, &payload_modulus,
                    &delta, number_of_inputs, REPETITIONS, SAMPLES);
  }

  // Releases everything keyswitch_setup() created.
  void TearDown() override {
    keyswitch_teardown(stream, lwe_sk_in_array, lwe_sk_out_array, d_ksk_array,
                       plaintexts, d_lwe_ct_in_array, lwe_input_indexes,
                       d_lwe_ct_out_array, lwe_output_indexes);
  }
};
// Keyswitches every batch of input ciphertexts and checks that each output
// decrypts, under the output secret key, to the message of the corresponding
// input plaintext.
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
  // Element counts are kept in size_t so that the pointer offsets below are
  // never computed in (overflow-prone) int/unsigned arithmetic.
  const size_t lwe_in_size = (size_t)input_lwe_dimension + 1;
  const size_t lwe_out_size = (size_t)output_lwe_dimension + 1;
  const size_t ksk_size =
      (size_t)ksk_level * lwe_out_size * input_lwe_dimension;
  const size_t out_bytes = lwe_out_size * number_of_inputs * sizeof(uint64_t);
  // The bit right below the message, used to round the decryption
  const uint64_t rounding_bit = delta >> 1;

  // Host buffer for one batch of output ciphertexts. Named differently from
  // the fixture member lwe_out_ct to avoid shadowing it.
  uint64_t *h_lwe_ct_out = (uint64_t *)malloc(out_bytes);
  ASSERT_TRUE(h_lwe_ct_out != nullptr);

  for (unsigned r = 0; r < REPETITIONS; r++) {
    // Each repetition has its own output secret key and keyswitching key
    uint64_t *lwe_out_sk =
        lwe_sk_out_array + (size_t)r * output_lwe_dimension;
    uint64_t *d_ksk = d_ksk_array + ksk_size * r;
    for (unsigned s = 0; s < SAMPLES; s++) {
      // Index of the first ciphertext/plaintext of batch (r, s)
      const size_t batch_index = ((size_t)r * SAMPLES + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_index * lwe_in_size;
      // Execute keyswitch
      cuda_keyswitch_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)lwe_output_indexes,
          (void *)d_lwe_ct_in, (void *)lwe_input_indexes, (void *)d_ksk,
          input_lwe_dimension, output_lwe_dimension, ksk_base_log, ksk_level,
          number_of_inputs);
      // Copy result back
      cuda_memcpy_async_to_cpu(h_lwe_ct_out, d_lwe_ct_out_array, out_bytes,
                               stream);
      // The copy is enqueued asynchronously: wait for the stream before the
      // host reads the buffer.
      cuda_synchronize_stream(stream);
      for (int i = 0; i < number_of_inputs; i++) {
        uint64_t plaintext = plaintexts[batch_index + i];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, h_lwe_ct_out + i * lwe_out_size,
                                lwe_out_sk, output_lwe_dimension);
        // The raw decryption still carries noise, so it is expected to differ
        // from the exact encoded plaintext.
        EXPECT_NE(decrypted, plaintext);
        // Round to the nearest multiple of delta, then drop the scaling
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << ", input: " << i;
      }
    }
  }
  free(h_lwe_ct_out);
}
// Defines the parameter sets for which the keyswitch will be tested.
// Every TEST_P of the fixture is executed once per parameter set listed below.
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
::testing::Values(
// Field order: input_lwe_dimension (n), output_lwe_dimension (k*N),
// noise_distribution, ksk_base_log, ksk_level,
// message_modulus, carry_modulus, number_of_inputs
(KeyswitchTestParams){
567, 1280, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
3, 3, 2, 1, 10},
(KeyswitchTestParams){
694, 1536, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 3, 2, 1, 10},
(KeyswitchTestParams){
769, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 3, 2, 1, 10},
(KeyswitchTestParams){
754, 2048, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
3, 5, 2, 1, 10},
(KeyswitchTestParams){742, 2048,
new_gaussian_from_std_dev(sqrt(4.9982771e-11)), 3,
5, 4, 1, 10},
(KeyswitchTestParams){
847, 4096, new_gaussian_from_std_dev(sqrt(2.9802322387695312e-18)),
4, 4, 2, 1, 10});
// Builds the test-name suffix for one keyswitch parameter set from the
// input/output dimensions and the decomposition parameters.
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
  const KeyswitchTestParams &params = p.param;
  std::string name = "na_";
  name += std::to_string(params.input_lwe_dimension);
  name += "_nb_";
  name += std::to_string(params.output_lwe_dimension);
  name += "_baselog_";
  name += std::to_string(params.ksk_base_log);
  name += "_ksk_level_";
  name += std::to_string(params.ksk_level);
  return name;
}
// Instantiates every TEST_P of KeyswitchTestPrimitives_u64 once per entry of
// ksk_params_u64; printParamName supplies the test-name suffix.
// INSTANTIATE_TEST_SUITE_P is the current spelling of the deprecated
// INSTANTIATE_TEST_CASE_P macro.
INSTANTIATE_TEST_SUITE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
                         ksk_params_u64, printParamName);

View File

@@ -0,0 +1,215 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
// One parameter set for the multi-bit PBS test.
// Instances are built with positional aggregate initialization, so the field
// order below must not change.
struct MultiBitBootstrapTestParams {
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  DynamicDistribution lwe_noise_distribution;
  DynamicDistribution glwe_noise_distribution;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  int number_of_inputs; // ciphertexts processed by a single PBS call
  int grouping_factor;
  int repetitions; // outer test loop count (one bootstrapping key each)
  int samples;     // inner test loop count (batches per repetition)
};
// Fixture for the multi-bit PBS test on 64-bit ciphertexts.
// SetUp() copies the active parameter set into members, lets
// bootstrap_multibit_setup() prepare keys, plaintexts, LUT, ciphertext
// buffers and the PBS scratch buffer, and allocates a host buffer for one
// batch of output ciphertexts. TearDown() releases everything again.
class MultiBitBootstrapTestPrimitives_u64
    : public ::testing::TestWithParam<MultiBitBootstrapTestParams> {
protected:
  // Values copied from the active MultiBitBootstrapTestParams
  int lwe_dimension;
  int glwe_dimension;
  int polynomial_size;
  DynamicDistribution lwe_noise_distribution;
  DynamicDistribution glwe_noise_distribution;
  int pbs_base_log;
  int pbs_level;
  int message_modulus;
  int carry_modulus;
  // Written by bootstrap_multibit_setup()
  int payload_modulus;
  int number_of_inputs;
  int grouping_factor;
  // Written by bootstrap_multibit_setup(); used by the test to decode
  uint64_t delta;
  cuda_stream_t *stream;
  int gpu_index = 0;
  // Buffers filled in by bootstrap_multibit_setup(). The d_ prefix presumably
  // marks device memory -- confirm in setup_and_teardown.
  uint64_t *lwe_sk_in_array;
  uint64_t *lwe_sk_out_array;
  uint64_t *plaintexts;
  uint64_t *d_bsk_array;
  uint64_t *d_lut_pbs_identity;
  uint64_t *d_lut_pbs_indexes;
  uint64_t *d_lwe_ct_in_array;
  uint64_t *d_lwe_ct_out_array;
  // Host buffer receiving one batch of output ciphertexts
  uint64_t *lwe_ct_out_array;
  uint64_t *d_lwe_input_indexes;
  uint64_t *d_lwe_output_indexes;
  // Scratch buffer handed to the PBS; set up by bootstrap_multibit_setup()
  int8_t *pbs_buffer;
  int repetitions;
  int samples;

public:
  // Prepares the stream, the test data and the host output buffer.
  void SetUp() override {
    stream = cuda_create_stream(gpu_index);

    const MultiBitBootstrapTestParams &params = GetParam();
    lwe_dimension = params.lwe_dimension;
    glwe_dimension = params.glwe_dimension;
    polynomial_size = params.polynomial_size;
    grouping_factor = params.grouping_factor;
    lwe_noise_distribution = params.lwe_noise_distribution;
    glwe_noise_distribution = params.glwe_noise_distribution;
    pbs_base_log = params.pbs_base_log;
    pbs_level = params.pbs_level;
    message_modulus = params.message_modulus;
    carry_modulus = params.carry_modulus;
    number_of_inputs = params.number_of_inputs;

    Seed seed;
    init_seed(&seed);

    repetitions = params.repetitions;
    samples = params.samples;

    bootstrap_multibit_setup(
        stream, &seed, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
        &plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
        &d_lwe_ct_in_array, &d_lwe_input_indexes, &d_lwe_ct_out_array,
        &d_lwe_output_indexes, &pbs_buffer, lwe_dimension, glwe_dimension,
        polynomial_size, grouping_factor, lwe_noise_distribution,
        glwe_noise_distribution, pbs_base_log, pbs_level, message_modulus,
        carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
        samples);

    // One output ciphertext holds glwe_dimension * polynomial_size + 1 words.
    lwe_ct_out_array =
        (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
                           number_of_inputs * sizeof(uint64_t));
  }

  // Frees the host output buffer, the PBS scratch buffer and then everything
  // else the setup helper created.
  void TearDown() override {
    free(lwe_ct_out_array);
    cleanup_cuda_multi_bit_pbs_64(stream, &pbs_buffer);
    bootstrap_multibit_teardown(
        stream, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array, plaintexts,
        d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
        d_lwe_input_indexes, d_lwe_ct_out_array, d_lwe_output_indexes);
  }
};
// Runs the multi-bit PBS on every batch of input ciphertexts using the LUT
// prepared by the fixture (d_lut_pbs_identity) and checks that each output
// decrypts and decodes to the message of the corresponding input plaintext.
TEST_P(MultiBitBootstrapTestPrimitives_u64, multi_bit_pbs) {
  // Element counts are kept in ptrdiff_t so that the pointer offsets below
  // are never computed in (overflow-prone) int arithmetic.
  const ptrdiff_t bsk_size = (ptrdiff_t)(lwe_dimension / grouping_factor) *
                             pbs_level * (glwe_dimension + 1) *
                             (glwe_dimension + 1) * polynomial_size *
                             (1 << grouping_factor);
  const int lwe_out_dimension = glwe_dimension * polynomial_size;
  const ptrdiff_t lwe_out_size = (ptrdiff_t)lwe_out_dimension + 1;
  const ptrdiff_t lwe_in_size = (ptrdiff_t)lwe_dimension + 1;
  // The bit right below the message, used to round the decryption
  const uint64_t rounding_bit = delta >> 1;

  for (int r = 0; r < repetitions; r++) {
    // Each repetition has its own bootstrapping key and output secret key
    uint64_t *d_bsk = d_bsk_array + bsk_size * r;
    uint64_t *lwe_sk_out = lwe_sk_out_array + (ptrdiff_t)r * lwe_out_dimension;
    for (int s = 0; s < samples; s++) {
      // Index of the first ciphertext/plaintext of batch (r, s)
      const ptrdiff_t batch_index =
          ((ptrdiff_t)r * samples + s) * number_of_inputs;
      uint64_t *d_lwe_ct_in = d_lwe_ct_in_array + batch_index * lwe_in_size;
      // Execute PBS
      cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
          stream, (void *)d_lwe_ct_out_array, (void *)d_lwe_output_indexes,
          (void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
          (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk,
          pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
          grouping_factor, pbs_base_log, pbs_level, number_of_inputs, 1, 0,
          cuda_get_max_shared_memory(gpu_index));
      // Copy result to the host memory
      cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                               lwe_out_size * number_of_inputs *
                                   sizeof(uint64_t),
                               stream);
      // The copy is enqueued asynchronously: wait for the stream before the
      // host reads the buffer.
      cuda_synchronize_stream(stream);
      for (int j = 0; j < number_of_inputs; j++) {
        uint64_t *result = lwe_ct_out_array + j * lwe_out_size;
        uint64_t plaintext = plaintexts[batch_index + j];
        uint64_t decrypted = 0;
        core_crypto_lwe_decrypt(&decrypted, result, lwe_sk_out,
                                lwe_out_dimension);
        // The raw decryption still carries noise, so it is expected to differ
        // from the exact encoded plaintext.
        EXPECT_NE(decrypted, plaintext)
            << "Repetition: " << r << ", sample: " << s << ", input: " << j;
        // Round to the nearest multiple of delta, then drop the scaling
        uint64_t rounding = (decrypted & rounding_bit) << 1;
        uint64_t decoded = (decrypted + rounding) / delta;
        EXPECT_EQ(decoded, plaintext / delta)
            << "Repetition: " << r << ", sample: " << s << ", input: " << j;
      }
    }
  }
}
// Defines the parameter sets for which the multi-bit PBS will be tested.
// Every TEST_P of the fixture is executed once per parameter set listed below.
// Field order: lwe_dimension, glwe_dimension, polynomial_size,
// lwe_noise_distribution, glwe_noise_distribution, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, grouping_factor,
// repetitions, samples
::testing::internal::ParamGenerator<MultiBitBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// fast test: small dimensions, with 1 and then 128 inputs
(MultiBitBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 1, 2, 1, 10},
(MultiBitBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_2
(MultiBitBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
2, 1, 2, 1, 10},
// NOTE(review): the noise values of this set differ from the previous
// grouping-factor-2 set (e-15 / e-24 instead of e-11 / e-23) -- confirm
// this is intended.
(MultiBitBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_3
(MultiBitBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
1, 3, 1, 10},
(MultiBitBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10});
// Builds the test-name suffix for one multi-bit PBS parameter set from its
// dimensions, decomposition parameters, grouping factor and batch size.
std::string
printParamName(::testing::TestParamInfo<MultiBitBootstrapTestParams> p) {
  const MultiBitBootstrapTestParams &params = p.param;
  std::string name = "n_";
  name += std::to_string(params.lwe_dimension);
  name += "_k_";
  name += std::to_string(params.glwe_dimension);
  name += "_N_";
  name += std::to_string(params.polynomial_size);
  name += "_pbs_base_log_";
  name += std::to_string(params.pbs_base_log);
  name += "_pbs_level_";
  name += std::to_string(params.pbs_level);
  name += "_grouping_factor_";
  name += std::to_string(params.grouping_factor);
  name += "_number_of_inputs_";
  name += std::to_string(params.number_of_inputs);
  return name;
}
// Instantiates every TEST_P of MultiBitBootstrapTestPrimitives_u64 once per
// entry of multipbs_params_u64; printParamName supplies the test-name suffix.
// INSTANTIATE_TEST_SUITE_P is the current spelling of the deprecated
// INSTANTIATE_TEST_CASE_P macro.
INSTANTIATE_TEST_SUITE_P(MultiBitBootstrapInstantiation,
                         MultiBitBootstrapTestPrimitives_u64,
                         multipbs_params_u64, printParamName);

View File

@@ -0,0 +1,249 @@
#include <algorithm>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <limits>
#include <random>
#include <utils.h>
// Reset the seed: both its high and low halves are set to zero.
void init_seed(Seed *seed) {
  seed->hi = 0;
  seed->lo = 0;
}
// Move the seed to a new value by incrementing each half by one.
// The step is deliberately deterministic because it is more convenient for
// testing; a previously considered alternative added a random 64-bit offset
// (std::random_device + std::mt19937 + uniform distribution over the full
// uint64_t range) to each half instead.
void shuffle_seed(Seed *seed) {
  ++seed->lo;
  ++seed->hi;
}
// Create one random plaintext for every (repetition, sample, input) triple.
//
// payload_modulus  message modulus times carry modulus (the total message
//                  modulus). Must be non-zero: values are reduced modulo it.
// delta            scaling factor applied to each reduced value.
// number_of_inputs inputs per sample; values <= 0 are treated as zero.
// repetitions      number of repetitions.
// samples          number of samples per repetition.
//
// Returns a malloc'ed array of repetitions * samples * number_of_inputs
// values, indexed as [repetition][sample][input], that the caller must
// free(). Returns nullptr when the allocation fails.
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
                              int number_of_inputs, const unsigned repetitions,
                              const unsigned samples) {
  // Count elements in size_t so the product cannot wrap in 32-bit unsigned
  // arithmetic, and so a negative input count cannot turn into a huge size.
  const size_t inputs = number_of_inputs > 0 ? (size_t)number_of_inputs : 0;
  const size_t total = (size_t)repetitions * (size_t)samples * inputs;

  uint64_t *plaintext_array = (uint64_t *)malloc(total * sizeof(uint64_t));
  if (plaintext_array == nullptr)
    return nullptr;

  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<unsigned long long> dis(
      std::numeric_limits<std::uint64_t>::min(),
      std::numeric_limits<std::uint64_t>::max());

  // A flat walk visits the elements in the same order as nested
  // repetition/sample/input loops would, so the layout is unchanged.
  for (size_t idx = 0; idx < total; idx++) {
    plaintext_array[idx] = (dis(gen) % payload_modulus) * delta;
  }
  return plaintext_array;
}
// Build the GLWE lookup table (accumulator) used by a PBS to evaluate `func`.
//
// polynomial_size  number of coefficients per polynomial.
// glwe_dimension   number of mask polynomials; they are all set to zero.
// message_modulus, carry_modulus
//                  their product is the number of boxes in the table and
//                  must be non-zero (it is used as a divisor).
// func             function evaluated on each box index i; the box is filled
//                  with func(i) * delta, where delta = 2^63 / modulus.
//
// The body polynomial is split into boxes of polynomial_size / modulus
// coefficients. Coefficients not covered by any box (when polynomial_size is
// not a multiple of the modulus) are zero. The first half box is negated and
// the polynomial is rotated left by half a box.
//
// Returns a malloc-family array of (glwe_dimension + 1) * polynomial_size
// values that the caller must free(), or nullptr if the allocation fails.
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
                                    int message_modulus, int carry_modulus,
                                    std::function<uint64_t(uint64_t)> func) {
  // Modulus of the msg contained in the msg bits and operations buffer
  uint64_t modulus_sup = (uint64_t)message_modulus * (uint64_t)carry_modulus;
  // Size of each block
  uint64_t box_size = polynomial_size / modulus_sup;
  // Value of the shift we multiply our messages by
  uint64_t delta = ((uint64_t)1 << 63) / modulus_sup;

  size_t mask_len = (size_t)polynomial_size * (size_t)glwe_dimension;
  size_t lut_len = mask_len + (size_t)polynomial_size;

  // calloc zeroes the mask polynomials and any body coefficient that no box
  // covers, so nothing uninitialized can reach the returned table.
  uint64_t *lut_pbs = (uint64_t *)calloc(lut_len, sizeof(uint64_t));
  if (lut_pbs == nullptr)
    return nullptr;

  // The body polynomial is the last one of the GLWE; build it in place.
  uint64_t *body = lut_pbs + mask_len;

  // Fill box i with func(i) scaled by delta
  for (uint64_t i = 0; i < modulus_sup; i++) {
    uint64_t value = func(i) * delta;
    std::fill(body + i * box_size, body + (i + 1) * box_size, value);
  }

  uint64_t half_box_size = box_size / 2;
  // Negate the first half_box_size coefficients (modular negation on uint64_t)
  for (uint64_t i = 0; i < half_box_size; i++) {
    body[i] = -body[i];
  }
  // Rotate the body left by half a box
  std::rotate(body, body + half_box_size, body + polynomial_size);

  return lut_pbs;
}
// Allocate and fill `repetitions` LWE secret keys of `lwe_dimension`
// coefficients each, stored back to back in a single malloc'ed array whose
// address is written to *lwe_sk_array (the caller owns it).
// Every key is generated from the same (seed->lo, seed->hi) pair; the seed is
// not modified here.
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
                              Seed *seed, const unsigned repetitions) {
  uint64_t *keys =
      (uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t));
  *lwe_sk_array = keys;

  uint64_t *cursor = keys;
  for (unsigned rep = 0; rep < repetitions; ++rep) {
    // One secret key per repetition, written at the current slot
    core_crypto_lwe_secret_key(cursor, lwe_dimension, seed->lo, seed->hi);
    cursor += lwe_dimension;
  }
}
// Allocate and fill `repetitions` GLWE secret keys, each stored as a flat run
// of glwe_dimension * polynomial_size coefficients, back to back in a single
// malloc'ed array whose address is written to *glwe_sk_array (the caller owns
// it). Each key is produced by the LWE secret key generator over that flat
// length, always with the same (seed->lo, seed->hi) pair.
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
                               int polynomial_size, Seed *seed,
                               const unsigned repetitions) {
  const int key_len = glwe_dimension * polynomial_size;
  int glwe_sk_array_size = key_len * repetitions;

  uint64_t *keys = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
  *glwe_sk_array = keys;

  uint64_t *cursor = keys;
  for (unsigned rep = 0; rep < repetitions; ++rep) {
    // One flattened GLWE secret key per repetition
    core_crypto_lwe_secret_key(cursor, key_len, seed->lo, seed->hi);
    cursor += key_len;
  }
}
// Generate `repetitions` LWE bootstrap keys and store their converted form in
// a device array allocated here and returned through *d_fourier_bsk_array.
//
// lwe_sk_in_array / lwe_sk_out_array hold one input (LWE) and one output
// (GLWE) secret key per repetition, back to back; repetition r uses the keys
// at offsets r * lwe_dimension and r * glwe_dimension * polynomial_size.
// Every key is generated with the same (seed->lo, seed->hi) pair.
// The host staging buffer is freed before returning; the device array is
// left for the caller to release.
void generate_lwe_bootstrap_keys(cuda_stream_t *stream,
double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size,
int pbs_level, int pbs_base_log, Seed *seed,
DynamicDistribution noise_distribution,
const unsigned repetitions) {
// Number of 64-bit words in one bootstrap key
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int bsk_array_size = bsk_size * repetitions;
// Host staging buffer for the keys of all repetitions
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
// Device buffer: one double per host word
*d_fourier_bsk_array =
(double *)cuda_malloc_async(bsk_array_size * sizeof(double), stream);
// Running offsets into the input keys, output keys and bootstrap keys
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the bootstrap key for each repetition
core_crypto_par_generate_lwe_bootstrapping_key(
bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log, pbs_level,
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
polynomial_size, noise_distribution, seed->lo, seed->hi);
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
// NOTE(review): the stream is synchronized before each conversion;
// presumably so pending work on the stream has finished -- confirm.
cuda_synchronize_stream(stream);
// Convert this repetition's host key into its slot of the device array
cuda_convert_lwe_bootstrap_key_64((void *)(d_fourier_bsk), (void *)(bsk),
stream, lwe_dimension, glwe_dimension,
pbs_level, polynomial_size);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
// Wait for the stream before releasing the host buffer the conversions read
cuda_synchronize_stream(stream);
free(bsk_array);
}
// Generate `repetitions` multi-bit LWE bootstrap keys and store their
// converted form in a device array allocated here and returned through
// *d_bsk_array.
//
// lwe_sk_in_array / lwe_sk_out_array hold one input (LWE) and one output
// (GLWE) secret key per repetition, back to back; repetition r uses the keys
// at offsets r * lwe_dimension and r * glwe_dimension * polynomial_size and
// fills slot r (of bsk_size words) of the host and device key arrays.
// The host staging buffer is freed before returning; the device array is
// left for the caller to release.
//
// NOTE(review): `seed` is accepted but not used -- the generator is called
// with the constant seed (0, 0), unlike the sibling key generators which
// pass seed->lo / seed->hi. Confirm whether that is intentional.
void generate_lwe_multi_bit_pbs_keys(
    cuda_stream_t *stream, uint64_t **d_bsk_array, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, int lwe_dimension, int glwe_dimension,
    int polynomial_size, int grouping_factor, int pbs_level, int pbs_base_log,
    Seed *seed, DynamicDistribution noise_distribution,
    const unsigned repetitions) {
  // Number of 64-bit words in one multi-bit bootstrap key
  int bsk_size = lwe_dimension * pbs_level * (glwe_dimension + 1) *
                 (glwe_dimension + 1) * polynomial_size *
                 (1 << grouping_factor) / grouping_factor;
  int bsk_array_size = bsk_size * repetitions;
  uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
  *d_bsk_array =
      (uint64_t *)cuda_malloc_async(bsk_array_size * sizeof(uint64_t), stream);

  // The offsets must live outside the loop: declaring them inside reset them
  // to zero on every iteration, so each repetition overwrote slot 0 and the
  // remaining slots of the host and device arrays were never initialized.
  int shift_in = 0;
  int shift_out = 0;
  int shift_bsk = 0;
  for (unsigned r = 0; r < repetitions; r++) {
    // Generate the multi-bit bootstrap key for this repetition
    core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
        lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
        lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
        polynomial_size, bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log,
        pbs_level, grouping_factor, noise_distribution, 0, 0);
    uint64_t *d_bsk = *d_bsk_array + (ptrdiff_t)(shift_bsk);
    uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
    // Convert this repetition's host key into its slot of the device array
    cuda_convert_lwe_multi_bit_bootstrap_key_64(
        d_bsk, bsk, stream, lwe_dimension, glwe_dimension, pbs_level,
        polynomial_size, grouping_factor);
    shift_in += lwe_dimension;
    shift_out += glwe_dimension * polynomial_size;
    shift_bsk += bsk_size;
  }
  // Wait for the stream before releasing the host buffer the conversions read
  cuda_synchronize_stream(stream);
  free(bsk_array);
}
// Generate `repetitions` LWE keyswitch keys and copy them into a device array
// allocated here and returned through *d_ksk_array.
//
// lwe_sk_in_array / lwe_sk_out_array hold one input and one output LWE secret
// key per repetition, back to back. Every key is generated with the same
// (seed->lo, seed->hi) pair. The host staging buffer is freed before
// returning; the device array is left for the caller to release.
void generate_lwe_keyswitch_keys(
    cuda_stream_t *stream, uint64_t **d_ksk_array, uint64_t *lwe_sk_in_array,
    uint64_t *lwe_sk_out_array, int input_lwe_dimension,
    int output_lwe_dimension, int ksk_level, int ksk_base_log, Seed *seed,
    DynamicDistribution noise_distribution, const unsigned repetitions) {
  // Number of 64-bit words in one keyswitch key
  int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
  int ksk_array_size = ksk_size * repetitions;

  uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
  *d_ksk_array =
      (uint64_t *)cuda_malloc_async(ksk_array_size * sizeof(uint64_t), stream);

  // Cursors walking the per-repetition slots of each array
  uint64_t *sk_in = lwe_sk_in_array;
  uint64_t *sk_out = lwe_sk_out_array;
  uint64_t *host_ksk = ksk_array;
  uint64_t *device_ksk = *d_ksk_array;

  for (unsigned rep = 0; rep < repetitions; ++rep) {
    // Build this repetition's key on the host...
    core_crypto_par_generate_lwe_keyswitch_key(
        host_ksk, ksk_base_log, ksk_level, sk_in, input_lwe_dimension, sk_out,
        output_lwe_dimension, noise_distribution, seed->lo, seed->hi);
    // ...then queue its upload to the matching device slot
    cuda_memcpy_async_to_gpu(device_ksk, host_ksk, ksk_size * sizeof(uint64_t),
                             stream);

    sk_in += input_lwe_dimension;
    sk_out += output_lwe_dimension;
    host_ksk += ksk_size;
    device_ksk += ksk_size;
  }

  // The uploads are asynchronous: wait before releasing the host buffer
  cuda_synchronize_stream(stream);
  free(ksk_array);
}

View File

@@ -6,8 +6,8 @@ extern "C" {
/// Create a new Cuda stream on GPU `gpu_index`
pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
/// Destroy the Cuda stream `v_stream` on GPU `gpu_index`
pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32;
/// Destroy the Cuda stream `v_stream`
pub fn cuda_destroy_stream(v_stream: *mut c_void);
/// Allocate `size` memory on GPU `gpu_index` asynchronously
pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void;
@@ -19,7 +19,7 @@ extern "C" {
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
);
/// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
/// the Cuda stream `v_stream`.
@@ -28,7 +28,7 @@ extern "C" {
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
);
/// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using
/// the Cuda stream `v_stream`.
@@ -37,31 +37,26 @@ extern "C" {
src: *const c_void,
size: u64,
v_stream: *const c_void,
) -> i32;
);
/// Asynchronously set `size` memory at `dest` on the GPU to `value` using
/// the Cuda stream `v_stream`.
pub fn cuda_memset_async(
dest: *mut c_void,
value: u64,
size: u64,
v_stream: *const c_void,
) -> i32;
pub fn cuda_memset_async(dest: *mut c_void, value: u64, size: u64, v_stream: *const c_void);
/// Get the total number of Nvidia GPUs detected on the platform
pub fn cuda_get_number_of_gpus() -> i32;
/// Synchronize all streams on GPU `gpu_index`
pub fn cuda_synchronize_device(gpu_index: u32) -> i32;
pub fn cuda_synchronize_device(gpu_index: u32);
/// Synchronize Cuda stream
pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32;
pub fn cuda_synchronize_stream(v_stream: *const c_void);
/// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream`
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32;
pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void);
/// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
pub fn cuda_drop(ptr: *mut c_void) -> i32;
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
/// Get the maximum amount of shared memory on GPU `gpu_index`
pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;
@@ -220,7 +215,7 @@ extern "C" {
/// This cleanup function frees the data for the low latency PBS on GPU
/// contained in pbs_buffer for 32 or 64-bit inputs.
pub fn cleanup_cuda_bootstrap_low_latency(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
pub fn cleanup_cuda_bootstrap_low_latency_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// This scratch function allocates the necessary amount of data on the GPU for
/// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
@@ -302,7 +297,7 @@ extern "C" {
/// This cleanup function frees the data for the multi-bit PBS on GPU
/// contained in pbs_buffer for 64-bit inputs.
pub fn cleanup_cuda_multi_bit_pbs(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
pub fn cleanup_cuda_multi_bit_pbs_64(v_stream: *const c_void, pbs_buffer: *mut *mut i8);
/// Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
///

View File

@@ -4,6 +4,7 @@ benchmark_parser
Parse criterion benchmark or keys size results.
"""
import argparse
import csv
import pathlib
@@ -11,45 +12,97 @@ import json
import sys
ONE_HOUR_IN_NANOSECONDS = 3600E9
ONE_HOUR_IN_NANOSECONDS = 3600e9
parser = argparse.ArgumentParser()
parser.add_argument('results',
help='Location of criterion benchmark results directory.'
'If the --key-size option is used, then the value would have to point to'
'a CSV file.')
parser.add_argument('output_file', help='File storing parsed results')
parser.add_argument('-d', '--database', dest='database',
help='Name of the database used to store results')
parser.add_argument('-w', '--hardware', dest='hardware',
help='Hardware reference used to perform benchmark')
parser.add_argument('-V', '--project-version', dest='project_version',
help='Commit hash reference')
parser.add_argument('-b', '--branch', dest='branch',
help='Git branch name on which benchmark was performed')
parser.add_argument('--commit-date', dest='commit_date',
help='Timestamp of commit hash used in project_version')
parser.add_argument('--bench-date', dest='bench_date',
help='Timestamp when benchmark was run')
parser.add_argument('--name-suffix', dest='name_suffix', default='',
help='Suffix to append to each of the result test names')
parser.add_argument('--append-results', dest='append_results', action='store_true',
help='Append parsed results to an existing file')
parser.add_argument('--walk-subdirs', dest='walk_subdirs', action='store_true',
help='Check for results in subdirectories')
parser.add_argument('--key-sizes', dest='key_sizes', action='store_true',
help='Parse only the results regarding keys size measurements')
parser.add_argument('--key-gen', dest='key_gen', action='store_true',
help='Parse only the results regarding keys generation time measurements')
parser.add_argument('--throughput', dest='throughput', action='store_true',
help='Compute and append number of operations per second and'
'operations per dollar')
parser.add_argument('--backend', dest='backend', default='cpu',
help='Backend on which benchmarks have run')
parser.add_argument(
"results",
help="Location of criterion benchmark results directory."
"If the --key-size option is used, then the value would have to point to"
"a CSV file.",
)
parser.add_argument("output_file", help="File storing parsed results")
parser.add_argument(
"-d",
"--database",
dest="database",
help="Name of the database used to store results",
)
parser.add_argument(
"-w",
"--hardware",
dest="hardware",
help="Hardware reference used to perform benchmark",
)
parser.add_argument(
"-V", "--project-version", dest="project_version", help="Commit hash reference"
)
parser.add_argument(
"-b",
"--branch",
dest="branch",
help="Git branch name on which benchmark was performed",
)
parser.add_argument(
"--commit-date",
dest="commit_date",
help="Timestamp of commit hash used in project_version",
)
parser.add_argument(
"--bench-date", dest="bench_date", help="Timestamp when benchmark was run"
)
parser.add_argument(
"--name-suffix",
dest="name_suffix",
default="",
help="Suffix to append to each of the result test names",
)
parser.add_argument(
"--append-results",
dest="append_results",
action="store_true",
help="Append parsed results to an existing file",
)
parser.add_argument(
"--walk-subdirs",
dest="walk_subdirs",
action="store_true",
help="Check for results in subdirectories",
)
parser.add_argument(
"--key-sizes",
dest="key_sizes",
action="store_true",
help="Parse only the results regarding keys size measurements",
)
parser.add_argument(
"--key-gen",
dest="key_gen",
action="store_true",
help="Parse only the results regarding keys generation time measurements",
)
parser.add_argument(
"--throughput",
dest="throughput",
action="store_true",
help="Compute and append number of operations per second and"
"operations per dollar",
)
parser.add_argument(
"--backend",
dest="backend",
default="cpu",
help="Backend on which benchmarks have run",
)
def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throughput=False,
hardware_hourly_cost=None):
def recursive_parse(
directory,
walk_subdirs=False,
name_suffix="",
compute_throughput=False,
hardware_hourly_cost=None,
):
"""
Parse all the benchmark results in a directory. It will attempt to parse all the files having a
.json extension at the top-level of this directory.
@@ -84,7 +137,9 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
full_name, test_name = parse_benchmark_file(subdir)
if test_name is None:
parsing_failures.append((full_name, "'function_id' field is null in report"))
parsing_failures.append(
(full_name, "'function_id' field is null in report")
)
continue
try:
@@ -94,7 +149,9 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
continue
for stat_name, value in parse_estimate_file(subdir).items():
test_name_parts = list(filter(None, [test_name, stat_name, name_suffix]))
test_name_parts = list(
filter(None, [test_name, stat_name, name_suffix])
)
result_values.append(
_create_point(
@@ -104,19 +161,26 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
"latency",
operator,
params,
display_name=display_name
display_name=display_name,
)
)
lowercase_test_name = test_name.lower()
# This is a special case where PBS are blasted as vector LWE ciphertext with
# variable length to saturate the machine. To get the actual throughput we need to
# multiply by the length of the vector.
if "PBS_throughput" in test_name and "chunk" in test_name:
if (
"pbs_throughput" in lowercase_test_name
and lowercase_test_name.endswith("chunk")
):
try:
multiplier = int(test_name.split("chunk")[0].split("_")[-1])
multiplier = int(
lowercase_test_name.strip("chunk").split("::")[-1]
)
except ValueError:
parsing_failures.append((full_name,
"failed to extract throughput multiplier"))
parsing_failures.append(
(full_name, "failed to extract throughput multiplier")
)
continue
else:
multiplier = 1
@@ -132,7 +196,7 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
"throughput",
operator,
params,
display_name="_".join([display_name, test_suffix])
display_name="_".join([display_name, test_suffix]),
)
)
test_name_parts.pop()
@@ -142,20 +206,23 @@ def recursive_parse(directory, walk_subdirs=False, name_suffix="", compute_throu
test_name_parts.append(test_suffix)
result_values.append(
_create_point(
multiplier * compute_ops_per_dollar(value, hardware_hourly_cost),
multiplier
* compute_ops_per_dollar(value, hardware_hourly_cost),
"_".join(test_name_parts),
bench_class,
"throughput",
operator,
params,
display_name="_".join([display_name, test_suffix])
display_name="_".join([display_name, test_suffix]),
)
)
return result_values, parsing_failures
def _create_point(value, test_name, bench_class, bench_type, operator, params, display_name=None):
def _create_point(
value, test_name, bench_class, bench_type, operator, params, display_name=None
):
return {
"value": value,
"test": test_name,
@@ -163,7 +230,8 @@ def _create_point(value, test_name, bench_class, bench_type, operator, params, d
"class": bench_class,
"type": bench_type,
"operator": operator,
"params": params}
"params": params,
}
def parse_benchmark_file(directory):
@@ -206,21 +274,24 @@ def _parse_key_results(result_file, bench_type):
with result_file.open() as csv_file:
reader = csv.reader(csv_file)
for (test_name, value) in reader:
for test_name, value in reader:
try:
params, display_name, operator = get_parameters(test_name)
except Exception as err:
parsing_failures.append((test_name, f"failed to get parameters: {err}"))
continue
result_values.append({
"value": int(value),
"test": test_name,
"name": display_name,
"class": "keygen",
"type": bench_type,
"operator": operator,
"params": params})
result_values.append(
{
"value": int(value),
"test": test_name,
"name": display_name,
"class": "keygen",
"type": bench_type,
"operator": operator,
"params": params,
}
)
return result_values, parsing_failures
@@ -288,7 +359,7 @@ def compute_ops_per_second(data_point):
:return: number of operations per second
"""
return 1E9 / data_point
return 1e9 / data_point
def _parse_file_to_json(directory, filename):
@@ -337,9 +408,16 @@ def check_mandatory_args(input_args):
missing_args = []
for arg_name in vars(input_args):
if arg_name in ["results_dir", "output_file", "name_suffix",
"append_results", "walk_subdirs", "key_sizes",
"key_gen", "throughput"]:
if arg_name in [
"results_dir",
"output_file",
"name_suffix",
"append_results",
"walk_subdirs",
"key_sizes",
"key_gen",
"throughput",
]:
continue
if not getattr(input_args, arg_name):
missing_args.append(arg_name)
@@ -354,7 +432,7 @@ if __name__ == "__main__":
args = parser.parse_args()
check_mandatory_args(args)
#failures = []
# failures = []
raw_results = pathlib.Path(args.results)
if args.key_sizes or args.key_gen:
if args.key_sizes:
@@ -370,7 +448,8 @@ if __name__ == "__main__":
if args.throughput:
print("Throughput computation enabled")
ec2_costs = json.loads(
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8"))
pathlib.Path("ci/ec2_products_cost.json").read_text(encoding="utf-8")
)
try:
hardware_cost = abs(ec2_costs[args.hardware])
print(f"Hardware hourly cost: {hardware_cost} $/h")
@@ -378,8 +457,13 @@ if __name__ == "__main__":
print(f"Cannot find hardware hourly cost for '{args.hardware}'")
sys.exit(1)
results, failures = recursive_parse(raw_results, args.walk_subdirs, args.name_suffix,
args.throughput, hardware_cost)
results, failures = recursive_parse(
raw_results,
args.walk_subdirs,
args.name_suffix,
args.throughput,
hardware_cost,
)
print("Parsing results done")

View File

@@ -3,5 +3,6 @@
"hpc7a.96xlarge": 7.7252,
"p3.2xlarge": 3.06,
"p4d.24xlarge": 32.7726,
"p5.48xlarge": 98.32
"p5.48xlarge": 98.32,
"rtx4090": 0.04
}

View File

@@ -33,6 +33,18 @@ def check_security(filename):
print(f"\t{param.tag}...\t", end= "")
is_n_size_too_low = param.n <= 450
is_noise_level_too_low = param.Xe.stddev < 4.0
if is_n_size_too_low :
reason = f"n size is too low {param.n} minimum is 450"
elif is_noise_level_too_low:
reason = f"noise level is too low {round(param.Xe.stddev,3)} minimum is 4.0"
if is_n_size_too_low or is_noise_level_too_low:
print(f"FAIL\t{reason}")
to_update.append((param, reason))
continue
try:
# The lattice estimator is not able to manage such large dimension.
# If we have the security for smaller `n` then we have security for larger ones.
@@ -70,7 +82,7 @@ if __name__ == "__main__":
print("Some parameters need update")
print("----------------------------")
for param, reason in params_to_update:
print(f"[{param.tag}] reason: {reason} (param)")
print(f"[{param.tag}] reason: {reason} (param: {param})")
sys.exit(int(1)) # Explicit conversion is needed to make this call work
else:
print("All parameters passed the security check")

View File

@@ -20,7 +20,7 @@ instance_type = "hpc7a.96xlarge"
[profile.gpu-test]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
image_id = "ami-0c0bf195ca4c175b6"
instance_type = "p3.2xlarge"
# One spawn attempt every 30 seconds for 1 hour
spawn_retry_attempts = 120
@@ -28,7 +28,7 @@ spawn_retry_duration = 60
[profile.gpu-bench]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
image_id = "ami-0c0bf195ca4c175b6"
instance_type = "p4d.24xlarge"
# One spawn attempt every 30 seconds for 6 hours
spawn_retry_attempts = 720
@@ -37,7 +37,7 @@ max_spot_hourly_price = "100.0"
[profile.gpu-bench-big]
region = "us-east-1"
image_id = "ami-05b4b37bcbb24dc48"
image_id = "ami-0c0bf195ca4c175b6"
instance_type = "p5.48xlarge"
spawn_retry_attempts = 720
spawn_retry_duration = 360
@@ -63,11 +63,6 @@ workflow = "aws_tfhe_wasm_tests.yml"
profile = "cpu-small"
check_run_name = "CPU AWS WASM Tests"
[command.cpu_fast_test]
workflow = "aws_tfhe_fast_tests.yml"
profile = "cpu-big"
check_run_name = "CPU AWS Fast Tests"
[command.gpu_test]
workflow = "aws_tfhe_gpu_tests.yml"
profile = "gpu-test"
@@ -133,10 +128,15 @@ workflow = "boolean_benchmark.yml"
profile = "bench"
check_run_name = "Boolean CPU AWS Benchmarks"
[command.pbs_bench]
workflow = "pbs_benchmark.yml"
[command.core_crypto_bench]
workflow = "core_crypto_benchmark.yml"
profile = "bench"
check_run_name = "PBS CPU AWS Benchmarks"
check_run_name = "Core crypto CPU AWS Benchmarks"
[command.core_crypto_gpu_bench]
workflow = "core_crypto_gpu_benchmark.yml"
profile = "gpu-test"
check_run_name = "Core crypto GPU AWS Benchmarks"
[command.wasm_client_bench]
workflow = "wasm_client_benchmark.yml"

Some files were not shown because too many files have changed in this diff Show More