Compare commits

..

173 Commits

Author SHA1 Message Date
Baptiste Roux
51401dcf24 feat(hpu): Add Shift/Rot/Min/Max operation for Hpu in hlapi bench 2025-06-20 09:24:11 +02:00
Baptiste Roux
a271cedb05 fix(hpu): Remove some hardcoded filenames in tandem
Also enhance error handling related to user misconfiguration,
and fix a bug with ami devn reading
2025-06-20 09:04:22 +02:00
Arthur Meyre
9eb0e831f5 chore: fix wasm bench to use the proper parameter
javascript and their nonsensical fallbacks be damned to eternal suffering
2025-06-19 19:34:04 +02:00
Enzo Di Maria
7e4abfa4ff refactor(gpu): moving extend_radix_with_sign_msb_async to backend 2025-06-19 14:51:02 +02:00
Nicolas Sarlin
ce7c15585e chore(zk): refactor hashes to reuse code between proof and verify 2025-06-19 13:48:20 +02:00
Nicolas Sarlin
58f7457660 chore(zk): rename verify_inner to verify_impl to match the proof 2025-06-19 13:48:20 +02:00
David Testé
2d224e75a1 chore(ci): set pull-requests permission to write in commit checks
This is mandatory according to the action documentation,
notably to be able to write an issue comment within the pull-request.
2025-06-19 13:45:44 +02:00
Agnes Leroy
e5a9145cce fix(gpu): fix perf regression introduced in 1936ec6d84 2025-06-19 13:34:36 +02:00
tmontaigu
f5f7213289 feat: improve division for 2_2 parameters
The improvement is to compute the quotient digit by digit and
not bit by bit.

This could also probably work for 3_3 and 4_4 parameters, but it is not a priority.

This brings 64-bit division down to ~5.5s from 8.6s
2025-06-19 13:03:40 +02:00
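A plain-integer sketch of the digit-by-digit idea, assuming nothing about the FHE internals: with 2_2 parameters each block carries a 2-bit digit, so restoring long division can consume the dividend one base-4 digit per iteration instead of one bit, halving the loop count at the cost of trying a few candidate multiples per digit.

```rust
/// Schoolbook long division over clear u64 values, consuming DIGIT_BITS bits
/// of the numerator per iteration. DIGIT_BITS = 1 is classic bit-by-bit
/// restoring division; DIGIT_BITS = 2 mirrors a 2_2-style digit and runs the
/// outer loop half as many times.
fn long_division<const DIGIT_BITS: u32>(numerator: u64, divisor: u64) -> (u64, u64) {
    assert!(divisor != 0 && DIGIT_BITS >= 1 && 64 % DIGIT_BITS == 0);
    let base = 1u128 << DIGIT_BITS;
    let divisor = divisor as u128;
    let (mut quotient, mut remainder) = (0u128, 0u128);
    for i in (0..64 / DIGIT_BITS).rev() {
        // Shift the next digit of the numerator into the running remainder.
        let digit = ((numerator >> (i * DIGIT_BITS)) as u128) & (base - 1);
        remainder = (remainder << DIGIT_BITS) | digit;
        // Largest q_digit in 0..base such that q_digit * divisor <= remainder.
        let mut q_digit = 0u128;
        for candidate in 1..base {
            if candidate * divisor <= remainder {
                q_digit = candidate;
            }
        }
        remainder -= q_digit * divisor;
        quotient = (quotient << DIGIT_BITS) | q_digit;
    }
    (quotient as u64, remainder as u64)
}

fn main() {
    assert_eq!(long_division::<2>(860, 7), (860 / 7, 860 % 7));
}
```

With DIGIT_BITS = 2 a 64-bit dividend needs 32 outer iterations instead of 64, which is the flavor of saving behind the reported ~8.6s to ~5.5s drop.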
tmontaigu
b917cf4530 feat(core): plug XofSeed 2025-06-19 09:57:29 +02:00
Mayeul@Zama
1873b627d6 chore: add TODO for Glwe MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
cb8d753ea6 refactor(core): cleanup unused function 2025-06-18 16:54:12 +02:00
Mayeul@Zama
88e8fa6da9 refactor(core): separate BR and MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
0ea7c29dbd refactor(core): separate BR and MS 2025-06-18 16:54:12 +02:00
Mayeul@Zama
d90bd8bf89 feat(core): add grouping_factor to MultiBitModulusSwitchedCt trait 2025-06-18 16:54:12 +02:00
Mayeul@Zama
bf5e4474a2 feat(core): add ModulusSwitchedCt trait 2025-06-18 16:54:12 +02:00
Mayeul@Zama
7fd5321b78 refactor(core): std_multi_bit_blind_rotate_assign takes msed input 2025-06-18 16:54:12 +02:00
Mayeul@Zama
c168dea284 refactor(core): rename MultiBitModulusSwitchedCt to MultiBitModulusSwitchedLweCiphertext 2025-06-18 16:54:12 +02:00
Beka Barbakadze
1936ec6d84 refactor(gpu): refactor and optimize sum_ciphertext in cuda backend 2025-06-18 16:44:20 +02:00
Agnes Leroy
9864dba009 fix(gpu): fix degrees after scalar bitxor 2025-06-18 15:50:59 +02:00
Nicolas Sarlin
8c1ece4fd9 refactor(shortint): improve handling of empty compressed ct list 2025-06-18 11:08:59 +02:00
Nicolas Sarlin
343cad641c chore: TFHE-rs 1.3.0 2025-06-18 10:20:49 +02:00
David Testé
39d77299ed chore(bench): harmonize dex benchmark function names 2025-06-18 09:47:57 +02:00
Arthur Meyre
c841e3be6e chore: add new codeowners for HPU and CUDA code 2025-06-17 11:07:38 +02:00
Nicolas Sarlin
aaeb46f074 feat(shortint): add compression for squashed noise ciphertexts 2025-06-16 18:08:51 +02:00
Nicolas Sarlin
8f7281c219 fix(shortint): handle empty list in compression 2025-06-16 18:08:51 +02:00
tmontaigu
11e86e6162 chore(csprng): bump to 0.6.0
Some (breaking) changes were made to a trait in CSPRNG
2025-06-16 14:05:47 +02:00
David Testé
41c92e06a8 chore(ci): add missing env variable in cuda release workflow 2025-06-16 12:55:31 +02:00
Pedro Alves
f25f394763 feat(gpu): add support for GPU-accelerated expand to HL's CompactCiphertextList
- Drops integer's CudaCompactCiphertextList
2025-06-13 19:18:11 +02:00
pgardratzama
3230c4bb97 chore(hpu): devo was still there, tool compiled in release mode 2025-06-13 16:46:31 +02:00
Baptiste Roux
159a85fc8c chore(hpu): Fix some lint in hpu-v80 ffi 2025-06-13 16:46:31 +02:00
Helder Campos
4cec6fb247 chore(hpu): Fixed some README.md typos.
Also renamed pdi_mgmt to hpu_archive_mgmt to match the new naming
2025-06-13 16:46:31 +02:00
Baptiste Roux
16c997d686 feat(hpu): Add support for tandem pdi
Add support for hpu archive and Fpga reloading.
Rely on Tandem implementation for hot-reloading of FPGA.
Add reload procedure inside ffi/v80 backend.

Now, when starting an application on HpuV80, a version check is done first.
If the versions mismatch, a pdi reload is triggered
2025-06-13 16:46:31 +02:00
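The startup flow described above, as a hedged Rust sketch; the helper names are hypothetical stand-ins for the ffi/v80 backend calls, not its real API:

```rust
fn read_fpga_version() -> Result<u32, String> {
    Ok(0) // placeholder: would query the version of the running V80 bitstream
}

fn reload_pdi_from_archive() -> Result<(), String> {
    Ok(()) // placeholder: would hot-reload the pdi from the hpu archive via Tandem
}

/// On HpuV80 startup, check the running version first and trigger a pdi
/// reload on mismatch, per the commit message.
fn ensure_fpga_version(expected: u32) -> Result<(), String> {
    if read_fpga_version()? != expected {
        reload_pdi_from_archive()?;
    }
    Ok(())
}
```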
David Testé
dcd1af72d4 chore(ci): fix missing env variable for 4090 benchmark 2025-06-13 11:43:16 +02:00
Agnes Leroy
9bf9107e9e feat(gpu): add memory tracking functions for rand/rand bounded 2025-06-12 17:50:15 +02:00
Andrei Stoian
7986e0bf1d chore(gpu): skip packing ks test if it needs more ram than available 2025-06-12 17:47:10 +02:00
Agnes Leroy
55179c52a7 chore(gpu): fix ci on H100 2025-06-12 16:22:57 +02:00
Emmanuel Ferdman
c103f0380c fix(hpu): modernize logger interface
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2025-06-12 09:04:31 +02:00
Nicolas Sarlin
8024753be0 fix(zk): test failed with trivial ct equal to 0 2025-06-11 18:40:32 +02:00
Nicolas Sarlin
506fdfbdd1 chore(zk): use Shake256 XoF instead of rand to generate gamma values 2025-06-11 18:03:12 +02:00
David Testé
a0e08b80e6 chore(ci): use gh cli instead of api calls done with curl
This refactor simplifies reading of the data_pr_close workflow.
2025-06-11 16:55:37 +02:00
Enzo Di Maria
d06c3e3926 refactor(gpu): moving sub_assign_async to backend 2025-06-11 16:34:46 +02:00
Arthur Meyre
2bf9d25402 feat: add multi bit pbs 128 with GGSW preparation in FFT domain 2025-06-11 14:47:05 +02:00
Arthur Meyre
fe73f101cc test: add test for an adapted parameter set for std multi bit pbs 128 2025-06-11 14:47:05 +02:00
Arthur Meyre
d12de58284 feat: add std pbs 128 2025-06-11 14:47:05 +02:00
Arthur Meyre
d4ea8cd85f feat: add parallel conversion to Fourier 128 for LweMultiBitBootstrappingKey 2025-06-11 14:47:05 +02:00
Arthur Meyre
64f2befd6c feat: add Fourier 128 variant of LweMultiBitBootstrappingKey 2025-06-11 14:47:05 +02:00
Arthur Meyre
ce372dcea9 refactor(core): split the LWE multi bit bsk entity file
- to prepare for fft128 variant addition
2025-06-11 14:47:05 +02:00
Agnes Leroy
46b4958c9c feat(gpu): add memory tracking functions for booleans 2025-06-11 13:32:06 +02:00
Agnes Leroy
b25bcbc607 feat(gpu): add mem tracking for eq/ne 2025-06-11 13:32:06 +02:00
Agnes Leroy
5dfacc7975 feat(gpu): add memory tracking for compression and decompression 2025-06-11 11:49:09 +02:00
Guillermo Oyarzun
3d857f62cc refactor(gpu): return trivial indexes after ms noise reduction 2025-06-11 11:28:10 +02:00
Nicolas Sarlin
54c314cd71 chore(ci): always run zk tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
38a9853140 chore(zk): check crs conformance in backward compat test 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
360097d70e chore(zk): use random seed in tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
c94a76a85a fix(zk): overflow in noise tests 2025-06-11 10:29:53 +02:00
Nicolas Sarlin
be1ade6dd2 chore(zk)!: use an 8-byte dsep and a 128-bit SID in hash functions
BREAKING_CHANGE:
- PublicParams::from_vec methods have been updated to take an 8-byte dsep and an
  SID. CRS generated before this PR are still supported.
2025-06-11 10:29:53 +02:00
Pedro Alves
53845b298a fix(gpu): fix the packing keyswitch buffer not being allocated on large parameter sets 2025-06-11 08:58:09 +02:00
David Testé
11c0340eca chore(bench): plug server-side proof in zk benchmarks 2025-06-10 18:00:39 +02:00
Baptiste Roux
5e966a3d78 chore(hpu): changes based on code review 2025-06-10 17:43:35 +02:00
Baptiste Roux
443e02215f feat(hpu): Add recent IOp in integer benchmarks 2025-06-10 17:43:35 +02:00
Baptiste Roux
3c632c06ba chore(hpu): Fix/Changes to be compliant with CI 2025-06-10 17:43:35 +02:00
Baptiste Roux
833b593845 feat(hpu): Add support for Lead/Trail/Count/ilog2 in high-level API 2025-06-10 17:43:35 +02:00
JJ-hw
a20c90b090 feat(hpu): Add ILOG2/COUNT0/COUNT1/LEAD0/LEAD1/TRAIL0/TRAIL1 IOp.
Those IOps are tested within the new bitcnt category
2025-06-10 17:43:35 +02:00
Baptiste Roux
71e86f0522 feat(hpu): Add support for Shift/Rotate in high-level API
The scalar version is not supported yet
2025-06-10 17:43:35 +02:00
Baptiste Roux
cb45f7f429 feat(hpu): Add Rot/Shift IOp
A proper implementation of the scalar version needs an update in the firmware,
and thus wasn't done yet.
2025-06-10 17:43:35 +02:00
Baptiste Roux
05a51d47fa feat(hpu): Add check with Pbs gid definition
Currently it's a runtime check, but it prevents assigning the same Gid to two different luts
2025-06-10 17:43:35 +02:00
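One way such a runtime guard can look, sketched with illustrative names (not the tfhe-hpu-backend types): a registry that refuses to bind one Gid to two different LUTs.

```rust
use std::collections::HashMap;

#[derive(Default)]
struct GidRegistry {
    luts: HashMap<u32, Vec<u8>>, // Gid -> LUT table contents (illustrative)
}

impl GidRegistry {
    /// Re-registering the same LUT under its Gid is fine; a different LUT
    /// under an already-used Gid is the bug the runtime check catches.
    fn register(&mut self, gid: u32, lut: Vec<u8>) -> Result<(), String> {
        match self.luts.get(&gid) {
            Some(existing) if *existing != lut => {
                Err(format!("Gid {gid} is already bound to a different LUT"))
            }
            _ => {
                self.luts.insert(gid, lut);
                Ok(())
            }
        }
    }
}
```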
Baptiste Roux
2eb1ccd128 feat(hpu): Add support for DivRem in high-level API 2025-06-10 17:43:35 +02:00
JJ-hw
b7a518b9ee chore(hpu): Cleanup code following clippy advice
Also applied cargo fmt
2025-06-10 17:43:35 +02:00
JJ-hw
39faca219f feat(hpu): Add modulo. Note: not an optimized version; it uses the same algorithm as the division. 2025-06-10 17:43:35 +02:00
JJ-hw
fe7a8915bc feat(hpu): Add DIV/Divs IOp support
These IOps output the quotient and remainder for a numerator and divisor of the same size.
2025-06-10 17:43:35 +02:00
Baptiste Roux
96c8c44c71 feat(hpu): Enable some erc20 impl
With the support of overflowing ops, those implementations are now available on Hpu
2025-06-10 17:43:35 +02:00
Baptiste Roux
24d581afeb feat(hpu): Add support for Neg operation 2025-06-10 17:43:35 +02:00
Baptiste Roux
3c383ba18f feat(hpu): Add support for overflowing ops in the high-level-api 2025-06-10 17:43:35 +02:00
Baptiste Roux
7622122d90 feat(hpu): add support for overflowing iop
Add new Hpu IOps with overflowing flags. Currently there is only a simple Ilp implementation;
it should be extended to the llt one in the future.
2025-06-10 17:43:35 +02:00
Baptiste Roux
949d3e2153 feat(hpu): Add support for Min/Max in hl-api 2025-06-10 17:43:35 +02:00
Baptiste Roux
fb82c652e2 feat(hpu): Remove nops feature
This feature was superseded by the trivial one;
the nops option wasn't relevant or useful anymore.
2025-06-10 17:43:35 +02:00
Baptiste Roux
3af1937250 feat(hpu): add trivial execution in mockup
Enhances FW debugging with quicker execution times.
Also adds a trivial option to the hpu regression test for quick regression
testing of the FW generation with the mockup.
2025-06-10 17:43:35 +02:00
Baptiste Roux
05f7869c88 feat(mockup): Add support for trivial ciphertext display
This is here to enhance the IOp firmware debugging experience
2025-06-10 17:43:35 +02:00
Nicolas Sarlin
ab0ec4a238 chore(zk): mark non-pke proofs as experimental 2025-06-10 17:07:33 +02:00
dependabot[bot]
167329c52a chore(deps): bump dtolnay/rust-toolchain
Bumps [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) from 888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 to b3b07ba8b418998c39fb20f53e8b695cdcc8de1b.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](888c2e1ea6...b3b07ba8b4)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-version: b3b07ba8b418998c39fb20f53e8b695cdcc8de1b
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-06-10 17:04:02 +02:00
Arthur Meyre
8a280642a7 chore(shortint): alias correct parameters for KS PKE to compute
- has no impact today as the parameters were the same; however, given the
comment, it is best to expose the proper KS from PKE to compute parameters
2025-06-10 17:03:41 +02:00
Arthur Meyre
b29e82b96e test: add noise test check for High Level API 2025-06-10 17:03:41 +02:00
Arthur Meyre
9bda365691 chore(core): add noise distribution test tooling 2025-06-10 17:03:41 +02:00
Arthur Meyre
b686d5cb6a chore: update .gitignore 2025-06-10 17:03:41 +02:00
David Testé
2829c9cc92 chore(ci): parse all pbs counts files for dex benchmarks 2025-06-10 14:19:24 +02:00
Guillermo Oyarzun
0d81623a23 feat(gpu): add squash noise in the hlapi 2025-06-10 13:14:29 +02:00
David Testé
13d797fe9b chore(ci): add bench type to zk benchmarks artifact name
This is done to avoid name collision when both latency and throughput benchmarks are executed within the same workflow run.
2025-06-10 12:30:42 +02:00
Baptiste Roux
0ba2e5c6fd chore(hpu): fix issue with linter
The issue arose following a version update.
This function is clearly unused but is kept for huge memory API coherency
and should be used in the future.
2025-06-06 17:55:53 +02:00
Enzo Di Maria
ad3edf3cc3 refactor(gpu): moving scalar_mul_high_async to backend 2025-06-06 17:55:53 +02:00
Pedro Alves
16ff092ce4 fix(gpu): fix race condition on expand when on multi-gpu 2025-06-06 09:34:09 +02:00
Pedro Alves
f511bdc279 chore(gpu): add HL test for GPU expand and fix an issue with exception handling 2025-06-06 09:34:09 +02:00
Agnes Leroy
d3f36b1d86 chore(gpu): fallback to a6000 in case L40's are out of stock 2025-06-05 16:55:10 +02:00
Mayeul@Zama
fbb6b9ace3 refactor(shortint): rename apply_blind_rotate to apply_ms_blind_rotate 2025-06-05 14:56:09 +02:00
Mayeul@Zama
d1d1579187 refactor(core): separate modulus_switch from multi_bit_blind_rotate_assign 2025-06-05 14:56:09 +02:00
Mayeul@Zama
a268dced53 refactor(core): refactor MultiBitModulusSwitchedCt 2025-06-05 14:56:09 +02:00
Mayeul@Zama
13b2372624 refactor(core): refactor packing 2025-06-05 14:56:09 +02:00
Agnes Leroy
e2d622b186 feat(gpu): add memory tracking for neg and scalar div 2025-06-05 13:35:01 +02:00
Agnes Leroy
f55007c6fb feat(gpu): add mem tracking for div 2025-06-05 13:35:01 +02:00
Agnes Leroy
3ca85bf904 feat(boolean): add move_to_current_device for booleans 2025-06-05 11:45:54 +02:00
Andrei Stoian
ec78318af3 chore(gpu): prevent nvToolsExt inclusion when not profiling

fix(gpu): stdint
2025-06-05 11:45:32 +02:00
David Testé
8a312afbb7 chore(ci): fix benchmark matrix parameters generation
The previous implementation was done to please Zizmor and avoid
template-injection findings during analysis. This had a downside:
using the env directive implies a double interpolation that messes with
fromJSON() later and builds badly formatted matrix parameters.
2025-06-05 11:15:39 +02:00
David Testé
b61f1d864c chore(ci): check ks32 parameters with lattice estimator
A small refactoring has been done to handle ciphertext modulus in a more convenient way.
2025-06-04 17:19:17 +02:00
Agnes Leroy
15983a0718 feat(gpu): add memory tracking for mul/scalar mul 2025-06-04 16:42:51 +02:00
David Testé
856fc1a709 chore(ci): ignore stale action refs on rust-toolchain action
This action doesn't create releases, so the action refs don't point to a known tag.
If this zizmor finding is not ignored, the continuous integration pipeline breaks.
2025-06-04 11:48:01 +02:00
Pedro Alves
fe0a195630 chore(gpu): switches from the TBC PBS to the other variants for many inputs 2025-06-04 05:45:53 -03:00
tmontaigu
aca7e79585 feat(csprng): add Xof random generation
This adds a new kind of seed to the csprng

When created with such a seed, the AES-CTR random generator
initialization changes:
- The AES key used is initialized differently
- The AES-CTR starts with a CTR that may not be 0

The changes make it so that the counter still goes from 0..MAX,
but the AES-CTR now encrypts the counter plus some offset, keeping
both the regular behavior and the new one
2025-06-04 09:57:18 +02:00
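A schematic of the described counter scheme; `aes_encrypt_block` is a stand-in, not the tfhe-csprng API, and only the control flow is meaningful:

```rust
/// Placeholder mixing function, NOT AES: the real generator encrypts the
/// block with an AES key that is derived differently for Xof seeds.
fn aes_encrypt_block(key: u128, block: u128) -> u128 {
    block.wrapping_mul(0x9e37_79b9_7f4a_7c15_f39c_c060_5ced_c834 | 1) ^ key
}

/// i-th 16-byte output block. With ctr_offset == 0 this is the regular
/// AES-CTR behavior; an Xof-style seed supplies a non-zero starting offset
/// while the index itself still runs over 0..MAX.
fn output_block(key: u128, ctr_offset: u128, index: u128) -> [u8; 16] {
    aes_encrypt_block(key, index.wrapping_add(ctr_offset)).to_le_bytes()
}
```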
tmontaigu
c0e89a53ef fix(csprng): fix the endianness of the counter
This commit fixes the endianness (little) of the counter
representation used in the AES-CTR counter.

This is so that the random bytes generated are the same no matter
the endianness of the system.

A test case with known answers is added, as well as a make command
to run the test on an emulated big-endian arch using the `cross`
utility.

This also includes a small refactor where the block cipher
no longer encrypts `AesIndex`. This makes more sense
(AES encrypts bytes, not numbers) and allows moving and centralizing
the concept of endianness, as well as centralizing where batches are created.
2025-06-04 09:57:18 +02:00
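The endianness point boils down to serializing the counter to bytes explicitly before it reaches the cipher, e.g. (illustrative):

```rust
/// Fixing the counter-to-block conversion to little-endian makes the random
/// stream identical on little- and big-endian hosts: the in-memory layout of
/// u128 never reaches the block cipher, only these bytes do.
fn counter_to_block(counter: u128) -> [u8; 16] {
    counter.to_le_bytes()
}

#[test]
fn known_answer() {
    // A known-answer test like this passes unchanged when run with `cross`
    // on an emulated big-endian target.
    assert_eq!(counter_to_block(1)[0], 1);
    assert_eq!(counter_to_block(1)[15], 0);
}
```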
David Testé
312952007f chore(ci): lock zizmor version to avoid breaking ci pipelines
Newer versions of Zizmor can trigger errors due to new findings in workflows. To avoid breaking any ongoing pull-request due to this unhandled update, the zizmor version is locked.
2025-06-03 12:29:36 +02:00
Enzo Di Maria
ff51ed3f34 refactor(gpu): moving trim_radix_blocks_lsb_async to backend 2025-06-03 11:42:18 +02:00
Agnes Leroy
9737bdcb98 fix(gpu): fix degrees after bitxor 2025-06-03 08:47:12 +02:00
tmontaigu
87a43a4900 chore(integer): add determinism check for sum 2025-06-02 17:37:21 +02:00
Agnes Leroy
345bdbf17f feat(gpu): add memory tracking function for cmux 2025-06-02 17:29:17 +02:00
Agnes Leroy
cc54ba2236 chore(gpu): fix overflow in div in long run tests 2025-06-02 17:05:09 +02:00
David Testé
11df6c69ee chore(ci): fix workflow security warnings
Since Zizmor v1.9.0, new pedantic warnings are detected, especially
regarding template-injection patterns.
2025-06-02 14:46:14 +02:00
Guillermo Oyarzun
b76f4dbfe0 fix(gpu): fix hardcoded use of message modulus 2025-06-02 10:43:14 +02:00
Enzo Di Maria
be21c15c80 refactor(gpu): moving extend_radix_with_trivial_zero_blocks_msb to backend 2025-06-02 09:19:51 +02:00
tmontaigu
aa51b25313 chore(ci): fix test_user_docs run and add hpu
Due to the #[cfg] before the test_user_docs module, the module would
not actually be compiled (and thus run the user doc tests) unless all required
features were activated when running.

So we remove these cfgs, as each hardware doc supports its own set of
features and it's better to have a test fail because a feature is
missing rather than silently not run anything.

Also, add commands and CI plumbing to check the HPU docs
2025-05-30 16:36:56 +02:00
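The failure mode is easy to picture: a `#[cfg]`-gated module silently disappears when a feature is off, so its tests report nothing instead of failing. A minimal sketch (feature names illustrative):

```rust
// Before: the module (and every test in it) only exists when ALL listed
// features are on, so a partial feature set runs zero doc tests, silently.
#[cfg(all(feature = "gpu", feature = "hpu"))]
mod test_user_docs_gated {
    // ... doc-snippet tests ...
}

// After: the module always compiles; a snippet needing a missing feature now
// fails loudly at build/test time instead of being skipped.
mod test_user_docs {
    // ... one doc-snippet test per hardware backend ...
}
```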
tmontaigu
300c95fe3d fix(doc): finish HPU example fix 2025-05-30 16:36:56 +02:00
pgardratzama
524adda8f6 fix(doc): hpu example was not compiling 2025-05-30 16:36:56 +02:00
tmontaigu
dedcf205b4 feat(integer): improve default neg 2025-05-30 15:02:35 +02:00
tmontaigu
2c8d4c0fb0 feat(hlapi): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
3370fb5b7e feat(gpu): add overflowing_neg 2025-05-30 15:02:35 +02:00
tmontaigu
cd77eac42b feat(integer): add overflowing_neg 2025-05-30 15:02:35 +02:00
Baptiste Roux
40f20b4ecb fix(hpu): Rewrite hpu_bench iteration loop
The hpu_bench example was wrong for iter > 1 following clippy modifications.
NB: the vector is collected, but intermediate values are explicitly dropped to enable long-running stress tests.
2025-05-28 14:45:45 +02:00
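The shape of the fix, sketched on plain buffers rather than HPU ciphertexts: collect the values the iteration logic needs, but drop each large intermediate explicitly so memory stays flat across a long stress run.

```rust
fn run_iterations(iters: usize) -> Vec<u64> {
    (0..iters)
        .map(|i| {
            // Hypothetical expensive intermediate (stands in for a ciphertext).
            let intermediate = vec![0u8; 1 << 20];
            let result = intermediate.len() as u64 + i as u64;
            // Release the big buffer before any further per-iteration work,
            // so large `iters` values don't accumulate live allocations.
            drop(intermediate);
            result
        })
        .collect()
}
```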
Agnes Leroy
59a78c76a9 fix(gpu): fix build after shift/rotate mem tracking merge 2025-05-28 12:08:09 +02:00
Pedro Alves
1025246b17 fix(gpu): fix a linking problem on Hopper GPUs 2025-05-28 09:27:33 +02:00
Agnes Leroy
338e9eaeef feat(gpu): add memory tracking functions for shift/rotate 2025-05-28 09:26:27 +02:00
David Testé
0bec4d2ba1 chore(ci): pin rust-toolchain action to v1 2025-05-27 17:31:33 +02:00
David Testé
c5fab98900 chore(ci): add token to do online workflow security checks 2025-05-27 17:31:33 +02:00
Nicolas Sarlin
14e1ee5bd3 fix(gpu): build with hpu and zk features 2025-05-27 16:10:38 +02:00
Pedro Alves
52bc778629 feat(gpu): completely remove the internal CUDA_STREAMS in the HL API
- From now on, the streams stored in the available cuda server key are the ones to be used
2025-05-27 10:29:34 -03:00
Pedro Alves
10405c9836 feat(gpu): improve test_specific_gpu_selection() so it always tests all possible GPU configurations 2025-05-27 10:29:34 -03:00
Pedro Alves
5eaf6cec55 feat(gpu): reintroduce the feature that allows a user to perform computation on multi-gpu using a custom selection of GPUs
This reverts commit a7d8d2b1d4.
2025-05-27 10:29:34 -03:00
Agnes Leroy
3bfacc1e9d chore(bench): add swap throughput benchmark 2025-05-27 12:08:31 +02:00
Agnes Leroy
a47a418d41 chore(gpu): rework dex bench to prepare throughput benchmark 2025-05-27 12:08:31 +02:00
David Testé
75b3141e19 chore(ci): fix command parsing for gpu benchmark common workflow
Quote escaping was flawed and would generate an array containing a single string instead of several ones separated by commas.
2025-05-27 10:14:06 +02:00
Agnes Leroy
d01328e0fe fix(gpu): fix overflow error in clear inputs remainder in long run tests 2025-05-26 22:51:18 +02:00
Agnes Leroy
6e102b5fa1 chore(gpu): fix oom error in ci 2025-05-26 22:50:55 +02:00
Pedro Alves
8aa6fa514e fix(gpu): add missing error checks after some kernels 2025-05-26 16:29:23 -03:00
Nicolas Sarlin
21a19cd3c5 chore(shortint): modswitch noise reduction key upgrade without clone 2025-05-26 16:53:35 +02:00
Nicolas Sarlin
f51c70d536 feat(shortint): adds generic client key for atomic pattern support 2025-05-26 16:53:35 +02:00
Agnes Leroy
66e3c02838 feat(gpu): add memory tracking functions for comparisons 2025-05-23 14:37:39 +02:00
Pedro Alves
408e81c45a feat(gpu): add support for GPU-accelerated expand on the HL Api
- includes documentation about GPU-accelerated expand on the HL API
- reworks CudaKeySwitchingKey
- cloning the key is no longer necessary on the HL API
2025-05-23 11:54:29 +02:00
dependabot[bot]
4152906c5d chore(deps): bump actions/upload-artifact from 4.6.0 to 4.6.2
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.6.0 to 4.6.2.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v4.6.0...ea165f8d65b6e75b540449e92b4886f43607fa02)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-version: 4.6.2
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:23:02 +02:00
dependabot[bot]
9fc8a0b5bc chore(deps): bump codecov/codecov-action from 5.4.2 to 5.4.3
Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5.4.2 to 5.4.3.
- [Release notes](https://github.com/codecov/codecov-action/releases)
- [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md)
- [Commits](ad3126e916...18283e04ce)

---
updated-dependencies:
- dependency-name: codecov/codecov-action
  dependency-version: 5.4.3
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:22:55 +02:00
dependabot[bot]
5dc3e59d13 chore(deps): bump zgosalvez/github-actions-ensure-sha-pinned-actions
Bumps [zgosalvez/github-actions-ensure-sha-pinned-actions](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions) from 3.0.23 to 3.0.25.
- [Release notes](https://github.com/zgosalvez/github-actions-ensure-sha-pinned-actions/releases)
- [Commits](4830be28ce...fc87bb5b5a)

---
updated-dependencies:
- dependency-name: zgosalvez/github-actions-ensure-sha-pinned-actions
  dependency-version: 3.0.25
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-05-23 11:22:48 +02:00
Nicolas Sarlin
b40996a7e5 chore(shortint): prepare the v1.3 params folder 2025-05-23 10:57:56 +02:00
Pedro Alves
b066ef19fa fix(gpu): fix the internal benchmark 2025-05-23 10:32:24 +02:00
Nicolas Sarlin
25d008bae8 fix(bench): add missing internal keycache feature 2025-05-22 16:14:30 +02:00
David Testé
2749c1088c chore(ci): handle multi directories for parameters records 2025-05-22 15:03:02 +02:00
Guillermo Oyarzun
c19cd9f021 fix(gpu): add indexes to modulus switch noise reduction 2025-05-22 10:50:51 +02:00
Nicolas Sarlin
45fdba04b1 fix(gpu): allow to build with hpu feature enabled 2025-05-22 10:21:35 +02:00
youben11
69d46810b8 feat(core): chunked seeded_lwe_ksk generation 2025-05-21 18:06:58 +01:00
youben11
a16eeb983f feat(core): chunked lwe_ksk generation 2025-05-21 18:06:58 +01:00
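Schematically, chunked generation trades one huge allocation for a stream of fixed-size pieces; this sketch is illustrative, not the core_crypto API:

```rust
/// Generate a long key chunk by chunk and hand each chunk to a consumer
/// (e.g. a serializer), so the peak allocation is one chunk, not the key.
fn generate_key_chunked(
    total: usize,
    chunk: usize,
    mut gen_element: impl FnMut(usize) -> u64,
    mut consume: impl FnMut(&[u64]),
) {
    let mut start = 0;
    while start < total {
        let end = (start + chunk).min(total);
        let buf: Vec<u64> = (start..end).map(&mut gen_element).collect();
        consume(&buf); // buf is dropped before the next chunk is generated
        start = end;
    }
}
```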
Agnes Leroy
8278a9373c fix(gpu): fix degrees after abs 2025-05-21 15:46:18 +02:00
Arthur Meyre
e2a2768484 chore: fix typos
Co-authored-by: crStiv <cryptostiv7@gmail.com>
2025-05-21 13:06:42 +02:00
Arthur Meyre
57cfc38b66 chore: some more CODEOWNERS 2025-05-21 11:30:35 +02:00
Pedro Alves
259d125434 fix(gpu): fix pbs and ks benchmarks 2025-05-20 17:37:48 +02:00
Arthur Meyre
2571196b41 chore: fix ambiguous decrypt 2025-05-20 17:32:05 +02:00
Arthur Meyre
9f3dc6167d chore: remove raw decomposition
- this was left in by mistake
2025-05-20 17:32:05 +02:00
Agnes Leroy
59c17692a3 feat(gpu): add memory tracking functions for bitops 2025-05-20 16:16:22 +02:00
David Testé
e29d615b9d chore(bench): add suitable heuristic for zk throughput
The heuristic based on PBS count was flawed, since a ZK verification operation will eat up to 32 threads on the machine. The previous heuristic could generate an input data vector way bigger than the total number of threads divided by 32. This in turn led to long execution times for the benchmark and generated bad results.
2025-05-20 15:02:59 +02:00
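The corrected heuristic amounts to capping the generated workload by threads-per-verification rather than by PBS count; a sketch under the 32-threads-per-verification assumption stated above:

```rust
/// Number of ZK verification inputs worth generating for a throughput run:
/// each verification may saturate up to 32 threads, so anything beyond
/// total_threads / 32 only inflates execution time without adding load.
fn zk_throughput_elements(total_threads: usize) -> usize {
    const THREADS_PER_VERIFICATION: usize = 32;
    (total_threads / THREADS_PER_VERIFICATION).max(1)
}
```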
tmontaigu
8caff604ed chore: use wrapping div in long_run 2025-05-20 14:36:22 +02:00
Agnes Leroy
16badf0c00 chore(gpu): add degree prints in long run tests in case of failure 2025-05-20 14:13:59 +02:00
Nicolas Sarlin
99a27c1cbe chore(hpu): fix Cargo.toml for release 2025-05-19 17:47:40 +02:00
Nicolas Sarlin
9131aaa383 fix(doc): uniformize readme file names 2025-05-19 15:22:34 +02:00
Nicolas Sarlin
a01949e630 fix(bench): compilation error without the internal-keycache feature 2025-05-19 09:50:29 +02:00
Arthur Meyre
30a58cdd1a chore: update version in docs to 1.2.0 2025-05-16 17:10:12 +02:00
Agnes Leroy
03325bf94e feat(gpu): add memory tracking functions for add/sub and scalar add/sub 2025-05-16 16:39:34 +02:00
Nicolas Sarlin
786fe66495 chore(zk): check that crs group element at index n is 0 2025-05-16 16:38:27 +02:00
Baptiste Roux
9ee8259002 feat(hpu): Add Hpu backend implementation
This backend abstracts communication with the Hpu Fpga hardware.
It defines its own entities to prevent circular dependencies with
tfhe-rs.
Object lifetime is handled through an Arc<Mutex<T>> wrapper, enforcing
that all objects currently alive in Hpu Hw are also kept valid on the
host side.

It contains the second version of the HPU instruction set (HIS_V2.0):
* DOp have the following properties:
  + Templates as first-class citizens
  + Support of Immediate templates
  + Direct parsing and conversion between Asm/Hex
  + Replace deku (and its associated endianness limitation) with
    bitfield_struct and manual parsing

* IOp have the following properties:
  + Support a variable number of Destinations
  + Support a variable number of Sources
  + Support a variable number of Immediate values
  + Support of multiple bitwidths (not implemented yet in the Fpga
    firmware)

Details can be viewed in `backends/tfhe-hpu-backend/Readme.md`
2025-05-16 16:30:23 +02:00
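A minimal sketch of the stated lifetime rule, with illustrative types rather than the backend's own: every device-resident object sits behind Arc<Mutex<T>>, and the host keeps a clone of the Arc for as long as the HW may touch the object, so nothing alive on the Fpga can be freed host-side.

```rust
use std::sync::{Arc, Mutex};

/// Stand-in for a buffer mirrored on the Hpu Fpga.
struct HpuBuffer {
    bytes: Vec<u8>,
}

/// Host-side handle: each clone keeps the underlying object alive, which is
/// how "alive in Hpu Hw implies valid on the host" gets enforced.
#[derive(Clone)]
struct HpuHandle(Arc<Mutex<HpuBuffer>>);

impl HpuHandle {
    fn new(bytes: Vec<u8>) -> Self {
        HpuHandle(Arc::new(Mutex::new(HpuBuffer { bytes })))
    }

    /// All access goes through the lock, serializing host and HW views.
    fn with<R>(&self, f: impl FnOnce(&mut HpuBuffer) -> R) -> R {
        f(&mut self.0.lock().unwrap())
    }
}
```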
Agnes Leroy
a7d8d2b1d4 feat(gpu): revert "enable the user to perform computation on multi-gpu using a custom selection of GPUs"
This reverts commit 0280dbeb41.
2025-05-15 18:01:17 +02:00
734 changed files with 81680 additions and 7112 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
*.hpu filter=lfs diff=lfs merge=lfs -text

View File

@@ -6,6 +6,7 @@ self-hosted-runner:
- large_windows_16_latest
- large_ubuntu_16
- large_ubuntu_16-22.04
- v80-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

View File

@@ -33,7 +33,9 @@ runs:
if: inputs.github-instance == 'true'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${CUDA_VERSION} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${env.CUDA_KEYRING_PACKAGE}
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -126,9 +126,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -174,7 +174,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -182,9 +182,11 @@ jobs:
if: needs.should-run.outputs.csprng_test == 'true'
run: |
make test_tfhe_csprng
make test_tfhe_csprng_big_endian
- name: Run tfhe-zk-pok tests
if: needs.should-run.outputs.zk_pok_test == 'true'
# Always run it to catch non deterministic bugs earlier
# if: needs.should-run.outputs.zk_pok_test == 'true'
run: |
make test_zk_pok
@@ -272,9 +274,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

View File

@@ -114,7 +114,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -142,9 +142,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -115,7 +115,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -147,9 +147,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -185,7 +185,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -254,9 +254,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -68,7 +68,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -123,9 +123,10 @@ jobs:
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -114,8 +117,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -58,14 +58,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -95,15 +98,27 @@ jobs:
env:
REF_NAME: ${{ github.ref_name }}
- name: Parse swap request PBS counts
- name: Parse swap request update PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim PBS counts
- name: Parse swap request finalize PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_finalize_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim prepare PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_prepare_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim update PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_update_dex_balance_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
@@ -116,8 +131,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -59,14 +59,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -111,8 +114,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -46,15 +46,18 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
echo "FAST_BENCH=TRUE";
} >> "${GITHUB_ENV}"
echo "FAST_BENCH=TRUE" >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -93,8 +96,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
@@ -124,14 +130,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -159,7 +168,8 @@ jobs:
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs \
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
@@ -170,8 +180,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -84,7 +84,7 @@ jobs:
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\\", \\"/g')
PARSED_COMMAND=$(echo "${INPUTS_COMMAND}" | sed 's/[[:space:]]*,[[:space:]]*/\", \"/g')
echo "COMMAND=[\"${PARSED_COMMAND}\"]" >> "${GITHUB_ENV}"
- name: Set single operations flavor
@@ -120,25 +120,24 @@ jobs:
env:
INPUTS_PARAMS_TYPE: ${{ inputs.params_type }}
- name: Set command output
id: set_command
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "command=${{ toJSON(env.COMMAND) }}" >> "${GITHUB_OUTPUT}"
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
- name: Set parameters types output
id: set_params_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "params_type=${{ toJSON(env.PARAMS_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -227,6 +226,8 @@ jobs:
include:
- cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -237,18 +238,20 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
# Re-export environment variables as dependencies setup perform this task in the previous job.
# Local env variables are cleaned at the end of each job.
- name: Export CUDA variables
shell: bash
run: |
CUDA_PATH=/usr/local/cuda-${{ matrix.cuda }}
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "PATH=$PATH:$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib64:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
@@ -258,13 +261,15 @@ jobs:
shell: bash
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -317,8 +322,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -119,14 +119,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -167,8 +170,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -120,14 +120,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -168,8 +171,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
slack-notify:
name: Slack Notification

View File

@@ -0,0 +1,94 @@
# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Integer Benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
permissions: {}
jobs:
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make bench_integer_hpu
make bench_hlapi_erc20_hpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "hpu_x1" \
--backend hpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}

View File

@@ -78,12 +78,12 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -128,14 +128,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -193,8 +196,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -47,7 +47,7 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -89,14 +89,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -150,8 +153,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -78,12 +78,12 @@ jobs:
- name: Set operation flavor output
id: set_op_flavor
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "op_flavor=${{ toJSON(env.OP_FLAVOR) }}" >> "${GITHUB_OUTPUT}"
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -128,14 +128,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -185,8 +188,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -61,11 +61,14 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
@@ -107,8 +110,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -98,14 +98,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -141,7 +144,7 @@ jobs:
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_tfhe_zk_pok
name: ${{ github.sha }}_tfhe_zk_pok_${{ env.BENCH_TYPE }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
@@ -155,8 +158,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -96,14 +96,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -136,12 +139,16 @@ jobs:
- name: Install web resources
run: |
make install_${{ matrix.browser }}_browser
make install_${{ matrix.browser }}_web_driver
make install_"${BROWSER}"_browser
make install_"${BROWSER}"_web_driver
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_${{ matrix.browser }}_ci
make bench_web_js_api_parallel_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
@@ -188,8 +195,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -92,7 +92,7 @@ jobs:
- name: Set benchmark types output
id: set_bench_type
run: |
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
setup-instance:
@@ -140,14 +140,17 @@ jobs:
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
@@ -191,7 +194,7 @@ jobs:
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_zk
name: ${{ github.sha }}_integer_zk_${{ matrix.bench_type }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
@@ -205,8 +208,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}

View File

@@ -35,7 +35,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -94,5 +94,10 @@ jobs:
run: |
make build_tfhe_coverage
- name: Run Hpu pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc_hpu
# The wasm build check is a bit annoying to set-up here and is done during the tests in
# aws_tfhe_tests.yml

View File

@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ubuntu-latest, macos-latest, windows-latest]
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -82,7 +82,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ubuntu-latest, macos-latest, windows-latest]
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -77,7 +77,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ ubuntu-latest, macos-latest, windows-latest ]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -3,14 +3,15 @@ name: Check commit and PR compliance
on:
pull_request:
permissions:
contents: read
pull-requests: read # Permission needed to scan commits in a pull-request
permissions: {}
jobs:
check-commit-pr:
name: Check commit and PR
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write # Permission needed to scan commits in a pull-request and write issue comment
steps:
- name: Check first line
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee

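This permissions change follows the least-privilege layout: the workflow-level block is emptied so the default token carries no scopes at all, and each job then requests only what it needs. A sketch of the resulting shape (the job name comes from the hunk, the step is illustrative):

```yaml
permissions: {}            # workflow level: drop all default token scopes
jobs:
  check-commit-pr:
    runs-on: ubuntu-latest
    permissions:
      contents: read       # needed to check out the repository
      pull-requests: write # scan commits and post an issue comment
    steps:
      - run: echo "the job token now carries only the scopes listed above"
```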
View File

@@ -25,10 +25,10 @@ jobs:
- name: Get actionlint
run: |
wget "https://github.com/rhysd/actionlint/releases/download/v${{ env.ACTIONLINT_VERSION }}/actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz"
echo "${{ env.ACTIONLINT_CHECKSUM }} actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz" > checksum
wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
sha256sum -c checksum
tar -xf actionlint_${{ env.ACTIONLINT_VERSION }}_linux_amd64.tar.gz actionlint
tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
ln -s "$(pwd)/actionlint" /usr/local/bin/
- name: Lint workflows
@@ -38,9 +38,11 @@ jobs:
- name: Check workflows security
run: |
make check_workflow_security
env:
GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@4830be28ce81da52ec70d65c552a7403821d98d4 # v3.0.23
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
with:
allowlist: |
slsa-framework/slsa-github-generator

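The actionlint hunk combines two patterns from this changeset: versions are routed through plain environment variables instead of `${{ env.… }}` interpolation, and the downloaded tarball must match a pinned sha256 digest before it is unpacked. A sketch with placeholder version and checksum values:

```yaml
- name: Get actionlint
  run: |
    wget "https://github.com/rhysd/actionlint/releases/download/v${ACTIONLINT_VERSION}/actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz"
    echo "${ACTIONLINT_CHECKSUM} actionlint_${ACTIONLINT_VERSION}_linux_amd64.tar.gz" > checksum
    sha256sum -c checksum   # fails the job if the digest does not match
    tar -xf actionlint_"${ACTIONLINT_VERSION}"_linux_amd64.tar.gz actionlint
  env:
    ACTIONLINT_VERSION: "1.0.0"                             # placeholder
    ACTIONLINT_CHECKSUM: "<sha256 of the release tarball>"  # placeholder
```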
View File

@@ -54,7 +54,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -90,7 +90,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -104,7 +104,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@ad3126e916f78f00edff4ed0317cf185271ccc2d
uses: codecov/codecov-action@18283e04ce6e62d37312384ff67231eb8fd56d24
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -66,7 +66,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -3,7 +3,7 @@ name: Close or Merge corresponding PR on the data repo
# When a PR with the data_PR tag is closed or merged, this will close the corresponding PR in the data repo.
env:
TARGET_REPO_API_URL: ${{ github.api_url }}/repos/zama-ai/tfhe-backward-compat-data
DATA_REPO: zama-ai/tfhe-backward-compat-data
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
@@ -16,117 +16,43 @@ on:
pull_request:
types: [ closed ]
# The same pattern is used for jobs that use the github api:
# - save the result of the API call in the env var "GH_API_RES". Since the var is multiline
# we use this trick: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-of-a-multiline-string
# - "set +e" will make sure we reach the last "echo EOF" even in case of error
# - "set -o" pipefail makes one line piped command return the error of the first failure
# - 'RES="$?"' and 'exit $RES' are used to return the error code if a command failed. Without it, with "set +e"
# the script will always return 0 because of the "echo EOF".
permissions: {}
jobs:
auto_close_job:
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') }}
if: ${{ contains(github.event.pull_request.labels.*.name, 'data_PR') && github.repository == 'zama-ai/tfhe-rs' }}
runs-on: ubuntu-latest
env:
GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }} # Needed for gh CLI commands
steps:
- name: Find corresponding Pull Request in the data repo
- name: Fetch PR number
run: |
{
set +e
set -o pipefail
echo 'TARGET_REPO_PR<<EOF'
curl --fail-with-body --no-progress-meter -L -X GET \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_API_URL}"/pulls\?head="${REPO_OWNER}":"${PR_BRANCH}" | jq -e '.[0]' | sed 's/null/{ "message": "corresponding PR not found" }/'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
REPO_OWNER: ${{ github.repository_owner }}
PR_NUMBER=$(gh pr view "${PR_BRANCH}" --repo "${DATA_REPO}" --json number | jq '.number')
echo "DATA_REPO_PR_NUMBER=${PR_NUMBER}" >> "${GITHUB_ENV}"
- name: Comment on the PR to indicate the reason for the close
run: |
BODY="'{ \"body\": \"PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}\" }'"
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${COMMENTS_URL}" \
-d "${BODY}"
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
gh pr comment "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--body "PR ${CLOSE_TYPE}d because the corresponding PR in main repo was ${CLOSE_TYPE}d: ${REPO}#${EVENT_NUMBER}"
env:
REPO: ${{ github.repository }}
EVENT_NUMBER: ${{ github.event.number }}
COMMENTS_URL: ${{ fromJson(env.TARGET_REPO_PR).comments_url }}
- name: Merge the Pull Request in the data repo
if: ${{ github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PUT \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}"/merge \
-d '{ "merge_method": "rebase" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
gh pr merge "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--rebase \
--delete-branch
- name: Close the Pull Request in the data repo
if: ${{ !github.event.pull_request.merged }}
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X PATCH \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_PR_URL}" \
-d '{ "state": "closed" }'
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
env:
TARGET_REPO_PR_URL: ${{ fromJson(env.TARGET_REPO_PR).url }}
- name: Delete the associated branch in the data repo
run: |
{
set +e
set -o pipefail
echo 'GH_API_RES<<EOF'
curl --fail-with-body --no-progress-meter -L -X DELETE \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.FHE_ACTIONS_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"${TARGET_REPO_API_URL}"/git/refs/heads/"${PR_BRANCH}"
RES="$?"
echo EOF
} >> "${GITHUB_ENV}"
exit $RES
gh pr close "${PR_BRANCH}" \
--repo "${DATA_REPO}" \
--delete-branch
- name: Slack Notification
if: ${{ always() && job.status == 'failure' }}
@@ -134,4 +60,4 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: ${{ fromJson(env.GH_API_RES || env.TARGET_REPO_PR).message }}"
SLACK_MESSAGE: "Failed to auto-${{ env.CLOSE_TYPE }} PR on data repo: https://github.com/${{ env.DATA_REPO }}/pull/${{ env.DATA_REPO_PR_NUMBER }}"

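The net effect of this rewrite: four hand-rolled curl calls against the REST API, each wrapped in the fragile multiline GITHUB_ENV capture that the deleted comment documents, collapse into gh CLI one-liners that handle authentication (via GH_TOKEN), error reporting, and JSON output natively. A sketch of the new shape; where PR_BRANCH comes from is an assumption here:

```yaml
- name: Close the corresponding data-repo PR
  env:
    GH_TOKEN: ${{ secrets.FHE_ACTIONS_TOKEN }}
    DATA_REPO: zama-ai/tfhe-backward-compat-data
    PR_BRANCH: ${{ github.event.pull_request.head.ref }}  # assumed source of the branch name
  run: |
    gh pr comment "${PR_BRANCH}" --repo "${DATA_REPO}" \
      --body "PR closed because the corresponding PR in the main repo was closed"
    gh pr close "${PR_BRANCH}" --repo "${DATA_REPO}" --delete-branch
```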
View File

@@ -45,7 +45,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -172,9 +172,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -124,7 +124,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -79,7 +79,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -149,7 +149,7 @@ jobs:
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=FALSE make test_high_level_api_gpu
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
@@ -161,9 +161,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -72,7 +72,7 @@ jobs:
gcc-version: ${{ matrix.gcc }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -1,4 +1,4 @@
# Perfom tfhe-cuda-backend post-commit checks on an AWS instance
# Perform tfhe-cuda-backend post-commit checks on an AWS instance
name: Cuda - Post-commit Checks
env:
@@ -81,16 +81,20 @@ jobs:
if: env.SECRETS_AVAILABLE == 'false'
shell: bash
run: |
TOOLKIT_VERSION="$(echo ${{ matrix.cuda }} | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
# Use sed to extract a value from a string; this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/"${CUDA_KEYRING_PACKAGE}"
echo "${CUDA_KEYRING_SHA} ${CUDA_KEYRING_PACKAGE}" > checksum
sha256sum -c checksum
sudo dpkg -i "${CUDA_KEYRING_PACKAGE}"
sudo apt update
sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}" cmake-format
env:
CUDA_VERSION: ${{ matrix.cuda }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -100,17 +104,21 @@ jobs:
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc" >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Run fmt checks
run: |
@@ -120,12 +128,17 @@ jobs:
run: |
make pcc_gpu
- name: Check build with hpu enabled
run: |
make clippy_gpu_hpu
- name: Set pull-request URL
if: ${{ failure() && github.event_name == 'pull_request' }}
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Slack Notification
if: ${{ failure() && env.SECRETS_AVAILABLE == 'true' }}

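The same env indirection is applied to matrix values here, with one extra wrinkle: apt names CUDA packages with a dash (e.g. `cuda-toolkit-12-4`), so the dotted matrix version is rewritten with sed. The capture-group form replaces only the final dot, which bash's `${CUDA_VERSION//./-}` cannot express (it replaces every dot), which is likely why the comment rules it out. A sketch with the version as a placeholder:

```yaml
- name: Derive the apt package name
  run: |
    # "12.4" -> "12-4"; only the final '.' becomes '-'
    # shellcheck disable=SC2001
    TOOLKIT_VERSION="$(echo "${CUDA_VERSION}" | sed 's/\(.*\)\.\(.*\)/\1-\2/')"
    sudo apt -y install "cuda-toolkit-${TOOLKIT_VERSION}"
  env:
    CUDA_VERSION: ${{ matrix.cuda }}
```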
View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -126,7 +126,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -144,9 +144,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -140,7 +140,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -158,9 +158,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

View File

@@ -130,7 +130,7 @@ jobs:
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -156,9 +156,10 @@ jobs:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${{ github.event.pull_request.number }}), " >> "${GITHUB_ENV}"
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'

.github/workflows/hpu_hlapi_tests.yml vendored Normal file
View File

@@ -0,0 +1,73 @@
# Test the high-level API on HPU
name: Cargo Test HLAPI HPU
on:
pull_request:
push:
branches:
- main
env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions: { }
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: read
outputs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
with:
files_yaml: |
hpu:
- tfhe/Cargo.toml
- Makefile
- backends/tfhe-hpu-backend/**
- mockups/tfhe-hpu-mockup/**
cargo-tests-hpu:
needs: should-run
if: needs.should-run.outputs.hpu_test == 'true'
runs-on: large_ubuntu_16
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: stable
override: true
- name: Install Just
run: |
cargo install just
- name: Test HLAPI HPU
run: |
source setup_hpu.sh
just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
make HPU_CONFIG=sim test_high_level_api_hpu
make HPU_CONFIG=sim test_user_doc_hpu

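The final step of this new workflow deserves a gloss: the HPU mockup is a software simulator standing in for the FPGA. It is launched in the background, and `HPU_CONFIG=sim` then points the high-level API and documentation tests at it. The step, annotated:

```yaml
- name: Test HLAPI HPU
  run: |
    source setup_hpu.sh        # export the HPU environment variables
    # launch the software mockup in the background; it plays the role of the FPGA
    just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
    # HPU_CONFIG=sim routes the tests to the simulator instead of real hardware
    make HPU_CONFIG=sim test_high_level_api_hpu
    make HPU_CONFIG=sim test_user_doc_hpu
```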
View File

@@ -57,7 +57,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -46,7 +46,7 @@ jobs:
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

View File

@@ -67,7 +67,7 @@ jobs:
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -78,19 +78,24 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Prepare package
run: |
cargo package -p tfhe-cuda-backend
@@ -129,7 +134,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Install latest stable
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
@@ -140,19 +145,23 @@ jobs:
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
echo "CUDACXX=/usr/local/cuda-${CUDA_VERSION}/bin/nvcc";
} >> "${GITHUB_ENV}"
env:
CUDA_VERSION: ${{ matrix.cuda }}
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CC=/usr/bin/gcc-${GCC_VERSION}";
echo "CXX=/usr/bin/g++-${GCC_VERSION}";
echo "CUDAHOSTCXX=/usr/bin/g++-${GCC_VERSION}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
env:
GCC_VERSION: ${{ matrix.gcc }}
- name: Publish crate.io package
env:

.github/workflows/make_release_hpu.yml vendored Normal file
View File

@@ -0,0 +1,105 @@
name: Publish HPU release
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
runs-on: ubuntu-latest
needs: verify_tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Prepare package
run: |
cargo package -p tfhe-hpu-backend
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: crate
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
# Needed to detect the GitHub Actions environment
actions: read
# Needed to create the provenance via GitHub OIDC
id-token: write
# Needed to upload assets/artifacts
contents: write
with:
# SHA-256 hashes of the Crate package.
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-hpu-backend Release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# The DRY_RUN expansion must not be double-quoted: when the variable holds an empty string, a quoted
# expansion would pass an empty argument and cargo publish would fail. This is safe since DRY_RUN is set in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
- name: Generate hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
- name: Slack notification (hashes comparison)
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: failure
SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"

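Two details of this new release workflow are worth spelling out. The SLSA flow hashes the crate twice, once at package time and once after publishing, and any mismatch trips the Slack alert, so a tampered upload cannot pass silently. And the dry-run toggle uses the `&&`/`||` ternary idiom with a deliberately unquoted expansion, as the in-file comment explains:

```yaml
- name: Publish crate.io package
  env:
    CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
    # '&&'/'||' acts as a ternary: "--dry-run" when dry_run is true, "" otherwise
    DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
  run: |
    # ${DRY_RUN} stays unquoted on purpose: a quoted empty value would be
    # passed to cargo as an empty argument and break the publish
    # shellcheck disable=SC2086
    cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
```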
.gitignore vendored
View File

@@ -40,3 +40,6 @@ __pycache__
# First directive is to ignore symlinks
tests/tfhe-backward-compat-data
ci/
# In case someone clones the lattice-estimator locally to verify security
/lattice-estimator

.lfsconfig Normal file
View File

@@ -0,0 +1,2 @@
[lfs]
fetchexclude = *

View File

@@ -1,12 +1,28 @@
# Specifying a path without code owners means that path won't have owners and is akin to a negation
# i.e. the `core_crypto` dir is owned and needs owner approval/review, but not the `gpu` sub dir
# See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#example-of-a-codeowners-file
/backends/tfhe-cuda-backend/ @agnesLeroy
/backends/tfhe-hpu-backend/ @zama-ai/hardware
/tfhe/examples/hpu @zama-ai/hardware
/tfhe/src/core_crypto/ @IceTDrinker
/tfhe/src/core_crypto/gpu
/tfhe/src/core_crypto/gpu @agnesLeroy
/tfhe/src/core_crypto/hpu @zama-ai/hardware
/tfhe/src/shortint/ @mayeul-zama
/tfhe/src/integer/ @tmontaigu
/tfhe/src/integer/gpu
/tfhe/src/integer/gpu @agnesLeroy
/tfhe/src/integer/hpu @zama-ai/hardware
/tfhe/src/high_level_api/ @tmontaigu
/Makefile @IceTDrinker @soonum
/mockups/tfhe-hpu-mockup @zama-ai/hardware
/.github/ @soonum
/CODEOWNERS @IceTDrinker

View File

@@ -9,10 +9,12 @@ members = [
"tasks",
"tfhe-csprng",
"backends/tfhe-cuda-backend",
"backends/tfhe-hpu-backend",
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
"utils/param_dedup",
"tests",
"mockups/tfhe-hpu-mockup",
]
exclude = [

Makefile
View File

@@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash)
OS:=$(shell uname)
RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
CARGO_BUILD_JOBS=default
CPU_COUNT=$(shell ./scripts/cpu_count.sh)
RS_BUILD_TOOLCHAIN:=stable
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -55,6 +56,9 @@ REGEX_PATTERN?=''
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# tfhe-hpu-backend
HPU_CONFIG=v80
# Exclude these files from coverage reports
define COVERAGE_EXCLUDED_FILES
--exclude-files apps/trivium/src/trivium/* \
@@ -166,9 +170,13 @@ install_typos_checker: install_rs_build_toolchain
.PHONY: install_zizmor # Install zizmor workflow security checker
install_zizmor: install_rs_build_toolchain
@zizmor --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install zizmor --version ~1.9 || \
( echo "Unable to install zizmor, unknown error." && exit 1 )
.PHONY: install_cargo_cross # Install cross for cross-compilation tests
install_cargo_cross: install_rs_build_toolchain
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install cross
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
python3 -m venv venv
@@ -290,7 +298,7 @@ check_typos: install_typos_checker
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types \
--features=boolean,shortint,integer,internal-keycache,gpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -301,6 +309,20 @@ check_gpu: install_rs_check_toolchain
--all-targets \
-p $(TFHE_SPEC)
.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
clippy_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_gpu_hpu # Run clippy lints on tfhe with "gpu" and "hpu" enabled
clippy_gpu_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,gpu,hpu,pbs-stats,extended-types,zk-pok \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
linelint -a .
@@ -440,6 +462,8 @@ clippy_tfhe_csprng: install_rs_check_toolchain
clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok --features=experimental -- --no-deps -D warnings
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
clippy_versionable: install_rs_check_toolchain
@@ -473,6 +497,11 @@ clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
clippy_hpu_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -702,6 +731,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --release --bin hpu_mockup; \
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --profile devo --bin hpu_mockup; \
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -854,9 +905,25 @@ test_high_level_api: install_rs_build_toolchain
test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*gpu.*/)"
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
endif
.PHONY: test_strings # Run the tests for strings ci
test_strings: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -874,9 +941,21 @@ test_user_doc: install_rs_build_toolchain
.PHONY: test_user_doc_gpu # Run tests for GPU from the .md documentation
test_user_doc_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=boolean,shortint,integer,internal-keycache,gpu,zk-pok -p $(TFHE_SPEC) \
--features=internal-keycache,integer,zk-pok,gpu -p $(TFHE_SPEC) \
-- test_user_docs::
.PHONY: test_user_doc_hpu # Run tests for HPU from the .md documentation
test_user_doc_hpu: install_rs_build_toolchain
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu,hpu-v80 -p $(TFHE_SPEC) \
-- test_user_docs::
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) --doc \
--features=internal-keycache,integer,hpu -p $(TFHE_SPEC) \
-- test_user_docs::
endif
.PHONY: test_regex_engine # Run tests for regex_engine example
@@ -907,10 +986,16 @@ test_tfhe_csprng: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng
.PHONY: test_tfhe_csprng_big_endian # Run tfhe-csprng tests on an emulated big endian system
test_tfhe_csprng_big_endian: install_rs_build_toolchain install_cargo_cross
RUSTFLAGS="" cross $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-csprng --target=powerpc64-unknown-linux-gnu
.PHONY: test_zk_pok # Run tfhe-zk-pok tests
test_zk_pok: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-zk-pok
-p tfhe-zk-pok --features experimental
.PHONY: test_zk_wasm_x86_compat_ci
test_zk_wasm_x86_compat_ci: check_nvm_installed
@@ -1012,7 +1097,7 @@ check_compile_tests: install_rs_build_toolchain
.PHONY: check_compile_tests_benches_gpu # Build tests in debug without running them
check_compile_tests_benches_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
--features=experimental,boolean,shortint,integer,internal-keycache,gpu \
--features=experimental,boolean,shortint,integer,internal-keycache,gpu,zk-pok \
-p $(TFHE_SPEC)
mkdir -p "$(TFHECUDA_BUILD)" && \
cd "$(TFHECUDA_BUILD)" && \
@@ -1100,6 +1185,12 @@ clippy_bench_gpu: install_rs_check_toolchain
--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark
clippy_bench_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=hpu,shortint,integer,internal-keycache,pbs-stats \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
print_doc_bench_parameters:
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1133,6 +1224,14 @@ bench_signed_integer_gpu: install_rs_check_toolchain
--bench integer-signed-bench \
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1146,7 +1245,7 @@ bench_integer_compression_gpu: install_rs_check_toolchain
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_zk_gpu
bench_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1324,6 +1423,14 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
--bench hlapi-dex \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
@@ -1384,7 +1491,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
.PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
write_params_to_file: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run \
--example write_params_to_file --features=boolean,shortint,internal-keycache
--example write_params_to_file --features=boolean,shortint,hpu,internal-keycache
.PHONY: clone_backward_compat_data # Clone the data repo needed for backward compatibility tests
clone_backward_compat_data:
@@ -1423,6 +1530,9 @@ tfhe_lints
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested clippy_fast check_compile_tests

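The new mockup CI targets in this Makefile all share one recipe shape: source the simulator environment, build the mockup, run it as a bash coprocess, run the tests against it, then kill the coprocess. A condensed sketch of what one recipe executes, assuming the default `tfhe` crate for `$(TFHE_SPEC)`:

```sh
source ./setup_hpu.sh --config sim
cargo build --profile devo --bin hpu_mockup
# coproc runs the simulator as a background coprocess; 'kill %1' stops it once the tests pass
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log
HPU_TEST_ITER=1 cargo test --profile devo -p tfhe --features hpu --test hpu -- u32 && kill %1
```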
View File

@@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [
# Example with string replacing "hello" with "herlo"
"herlo",
# Example in trivium
"C9217BA0D762ACA1"
"C9217BA0D762ACA1",
"0x[0-9a-fA-F]+"
]
[files]
extend-exclude = [
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
]

View File

@@ -129,7 +129,7 @@ Other sizes than 64 bit are expected to be available in the future.
# FHE shortint Trivium implementation
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
The same implementation is also available for generic Ciphertexts representing bits (meant to be used with parameters `V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128`).
It uses a lower-level API of tfhe-rs, so the syntax is a little different. It also implements the `TransCiphering` trait. For optimization purposes, it does not internally run
on the same cryptographic parameters as the high-level API of tfhe-rs. As such, it requires a casting key to switch from one parameter space to another, which makes
its setup a little more intricate.
@@ -137,10 +137,10 @@ its setup a little more intricate.
Example code:
```rust
use tfhe::shortint::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{ConfigBuilder, generate_keys, FheUint64};
use tfhe::prelude::*;
@@ -148,17 +148,17 @@ use tfhe_trivium::TriviumStreamShortint;
fn test_shortint() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let (client_key, server_key): (ClientKey, ServerKey) = gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{KreyviumStreamShortint, TransCiphering};
pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -64,19 +64,19 @@ pub fn kreyvium_shortint_warmup(c: &mut Criterion) {
pub fn kreyvium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();
@@ -112,19 +112,19 @@ pub fn kreyvium_shortint_gen(c: &mut Criterion) {
pub fn kreyvium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use criterion::Criterion;
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::shortint::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint64};
@@ -11,19 +11,19 @@ use tfhe_trivium::{TransCiphering, TriviumStreamShortint};
pub fn trivium_shortint_warmup(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -64,19 +64,19 @@ pub fn trivium_shortint_warmup(c: &mut Criterion) {
pub fn trivium_shortint_gen(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();
@@ -112,19 +112,19 @@ pub fn trivium_shortint_gen(c: &mut Criterion) {
pub fn trivium_shortint_trans(c: &mut Criterion) {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{KreyviumStream, KreyviumStreamByte, KreyviumStreamShortint, TransCiphering};
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo renaud1239/Kreyvium,
@@ -221,19 +221,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn kreyvium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB000000000000".to_string();

View File

@@ -1,9 +1,9 @@
use crate::{TransCiphering, TriviumStream, TriviumStreamByte, TriviumStreamShortint};
use tfhe::prelude::*;
use tfhe::shortint::parameters::v1_2::{
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
use tfhe::shortint::parameters::current_params::{
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
};
use tfhe::{generate_keys, ConfigBuilder, FheBool, FheUint64, FheUint8};
// Values for these tests come from the github repo cantora/avr-crypto-lib, commit 2a5b018,
@@ -357,19 +357,19 @@ use tfhe::shortint::prelude::*;
#[test]
fn trivium_test_shortint_long() {
let config = ConfigBuilder::default()
.use_custom_parameters(V1_2_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.use_custom_parameters(V1_3_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128)
.build();
let (hl_client_key, hl_server_key) = generate_keys(config);
let underlying_ck: tfhe::shortint::ClientKey = (*hl_client_key.as_ref()).clone().into();
let underlying_sk: tfhe::shortint::ServerKey = (*hl_server_key.as_ref()).clone().into();
let (client_key, server_key): (ClientKey, ServerKey) =
gen_keys(V1_2_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
gen_keys(V1_3_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128);
let ksk = KeySwitchingKey::new(
(&client_key, Some(&server_key)),
(&underlying_ck, &underlying_sk),
V1_2_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
V1_3_PARAM_KEYSWITCH_1_1_KS_PBS_TO_2_2_KS_PBS_GAUSSIAN_2M128,
);
let key_string = "0053A6F94C9FF24598EB".to_string();

View File

@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
double r_sigma, double bound, uint32_t log_modulus);
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus);
void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,

View File

@@ -24,7 +24,15 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
return std::get<Torus *>(variant); \
} \
}()
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
return std::get<std::vector<uint64_t *>>(variant)[index]; \
} else { \
return std::get<uint64_t *>(variant); \
} \
}()
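
A minimal C++ sketch of the visitor logic behind GET_VARIANT_ELEMENT_64BIT, assuming only what the macro shows: the variant carries either a single pointer shared by every block or a per-block vector of pointers, and the index is only meaningful in the vector case.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <variant>
#include <vector>

using LweArrayVariant = std::variant<std::vector<uint64_t *>, uint64_t *>;

// Same dispatch as the macro, written as a plain function.
uint64_t *get_variant_element(const LweArrayVariant &variant, size_t index) {
  if (std::holds_alternative<std::vector<uint64_t *>>(variant))
    return std::get<std::vector<uint64_t *>>(variant)[index];
  return std::get<uint64_t *>(variant);
}

int main() {
  uint64_t a = 1, b = 2;
  LweArrayVariant single = &a; // one pointer used for every block
  LweArrayVariant per_block = std::vector<uint64_t *>{&a, &b};
  assert(get_variant_element(single, 5) == &a);    // index ignored
  assert(get_variant_element(per_block, 1) == &b); // index applied
  return 0;
}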
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

View File

@@ -400,8 +400,9 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
@@ -414,7 +415,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array);
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -538,5 +540,100 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes);
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes);
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars);
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_glwe_dimension,
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t num_original_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array);
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H

View File

@@ -6,6 +6,8 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
@@ -249,7 +251,6 @@ template <typename Torus> struct int_radix_lut {
h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus));
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
@@ -528,10 +529,10 @@ template <typename Torus> struct int_radix_lut {
}
// Return a pointer to idx-ith degree
Torus *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
uint64_t *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
// Return a pointer to idx-ith max degree
Torus *get_max_degree(size_t idx) { return &max_degrees[idx]; }
uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
// Return a pointer to idx-ith lut indexes at gpu_index's global memory
Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
@@ -580,6 +581,7 @@ template <typename Torus> struct int_radix_lut {
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
cuda_set_device(gpu_indexes[0]);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -646,6 +648,206 @@ template <typename Torus> struct int_radix_lut {
free(max_degrees);
}
};
template <typename InputTorus> struct int_noise_squashing_lut {
int_radix_params params;
uint32_t input_glwe_dimension;
uint32_t input_polynomial_size;
uint32_t input_big_lwe_dimension;
uint32_t num_blocks;
// Tracks the degree of each LUT and the max degree on CPU.
// The max degree is (message_modulus * carry_modulus - 1), except for
// many-LUT, for which it's different
uint64_t *degrees;
uint64_t *max_degrees;
int active_gpu_count;
// There will be one buffer on each GPU in multi-GPU computations
// (same for tmp lwe arrays)
std::vector<int8_t *> pbs_buffer;
std::vector<__uint128_t *> lut_vec;
uint32_t *gpu_indexes;
CudaRadixCiphertextFFI *tmp_lwe_before_ks;
// All tmp lwe arrays and index arrays for lwe contain the total
// number of blocks to be computed on; there is no split between GPUs
// for the moment
InputTorus *lwe_indexes_in;
InputTorus *h_lwe_indexes_in;
InputTorus *h_lwe_indexes_out;
InputTorus *lwe_trivial_indexes;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<InputTorus *> lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec;
std::vector<__uint128_t *> lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec;
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
int_radix_params params,
uint32_t input_glwe_dimension,
uint32_t input_polynomial_size,
uint32_t num_radix_blocks,
uint32_t original_num_blocks,
bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->num_blocks = num_radix_blocks;
gpu_memory_allocated = allocate_gpu_memory;
// These are the glwe dimension and polynomial size before squashing
this->input_glwe_dimension = input_glwe_dimension;
this->input_polynomial_size = input_polynomial_size;
uint32_t input_big_lwe_dimension =
input_glwe_dimension * input_polynomial_size;
this->input_big_lwe_dimension = input_big_lwe_dimension;
uint32_t lut_buffer_size = (params.glwe_dimension + 1) *
params.polynomial_size * sizeof(__uint128_t);
gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(i);
auto num_radix_blocks_on_gpu =
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
int8_t *gpu_pbs_buffer;
uint64_t size = 0;
execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
params.small_lwe_dimension, params.glwe_dimension,
params.polynomial_size, params.pbs_level,
num_radix_blocks_on_gpu, allocate_gpu_memory,
params.allocate_ms_array, &size);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
if (i == 0 && size_tracker != nullptr) {
*size_tracker += size;
}
pbs_buffer.push_back(gpu_pbs_buffer);
}
lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
h_lwe_indexes_in =
(InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
lwe_array_in_vec, num_radix_blocks,
params.big_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1, size_tracker,
allocate_gpu_memory);
multi_gpu_alloc_array_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, allocate_gpu_memory);
if (allocate_gpu_memory) {
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
for (uint i = 0; i < active_gpu_count; i++) {
auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async(
lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
allocate_gpu_memory);
lut_vec.push_back(lut);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<InputTorus>(
streams[0], gpu_indexes[0], tmp_lwe_before_ks, original_num_blocks,
input_big_lwe_dimension, size_tracker, allocate_gpu_memory);
degrees = (uint64_t *)malloc(sizeof(uint64_t));
max_degrees = (uint64_t *)malloc(sizeof(uint64_t));
// lut for the squashing
auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };
// Generate the identity LUT, for now we only use one GPU
for (uint i = 0; i < active_gpu_count; i++) {
auto squash_lut = lut_vec[i];
generate_device_accumulator<__uint128_t>(
streams[i], gpu_indexes[i], squash_lut, degrees, max_degrees,
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f_squash, allocate_gpu_memory);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
free(this->gpu_indexes);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lut_vec[i], streams[i], gpu_indexes[i],
gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
lut_vec.clear();
free(h_lwe_indexes_in);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
tmp_lwe_before_ks, gpu_memory_allocated);
for (int i = 0; i < pbs_buffer.size(); i++) {
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
for (uint i = 0; i < active_gpu_count; i++)
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
delete tmp_lwe_before_ks;
pbs_buffer.clear();
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
@@ -1076,9 +1278,10 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_or_propagate, gpu_memory_allocated);
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], luts_borrow_propagation_sum->get_lut(0, 0),
@@ -1116,18 +1319,123 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
bool mem_reuse = false;
uint32_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint32_t chunk_size;
uint64_t *size_tracker;
bool gpu_memory_allocated;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
bool allocated_luts_message_carry;
void setup_index_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, gpu_memory_allocated);
auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns =
[num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes](uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t *size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0],
gpu_indexes[0], size_tracker, gpu_memory_allocated);
if (gpu_memory_allocated) {
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0]);
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
void setup_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
uint32_t message_modulus = params.message_modulus;
if (!mem_reuse) {
uint32_t pbs_count = std::max(2 * (max_total_blocks_in_vec / chunk_size),
2 * num_blocks_in_radix);
if (max_total_blocks_in_vec > 0) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, pbs_count,
gpu_memory_allocated, size_tracker);
} else {
allocated_luts_message_carry = false;
}
}
if (allocated_luts_message_carry) {
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
}
int_sum_ciphertexts_vec_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
@@ -1136,103 +1444,87 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
bool allocate_gpu_memory,
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->allocated_luts_message_carry = true;
setup_index_buffers(streams, gpu_indexes);
setup_lookup_tables(streams, gpu_indexes, gpu_count);
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->size_tracker = size_tracker;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->allocated_luts_message_carry = true;
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
this->luts_message_carry = reused_lut;
setup_index_buffers(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
if (allocated_luts_message_carry) {
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete luts_message_carry;
}
delete current_blocks;
delete small_lwe_vector;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
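
A worked instance of the chunk_size formula set by both constructors above, as a sketch assuming default 2_2-style parameters (message_modulus = carry_modulus = 4): a block saturates at message_modulus * carry_modulus - 1 = 15 and each summand contributes at most message_modulus - 1 = 3, so 15 / 3 = 5 ciphertexts can be accumulated per column before message/carry extraction.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t message_modulus = 4, carry_modulus = 4; // 2_2-style parameters
  // Same formula as int_sum_ciphertexts_vec_memory::chunk_size.
  uint32_t chunk_size =
      (message_modulus * carry_modulus - 1) / (message_modulus - 1);
  assert(chunk_size == 5); // five blocks can be summed without overflow
  return 0;
}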
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
@@ -2549,7 +2841,7 @@ template <typename Torus> struct int_mul_memory {
// radix_lwe_left except the last blocks of each shift
int msb_vector_block_count = num_radix_blocks * (num_radix_blocks - 1) / 2;
int total_block_count = lsb_vector_block_count + msb_vector_block_count;
int total_block_count = num_radix_blocks * num_radix_blocks;
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
@@ -2562,13 +2854,13 @@ template <typename Torus> struct int_mul_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, total_block_count,
streams[0], gpu_indexes[0], small_lwe_vector, 2 * total_block_count,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 2, total_block_count,
params, 2, 2 * total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -2595,16 +2887,17 @@ template <typename Torus> struct int_mul_memory {
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default lut_indexes_vec is fine
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
luts_array->broadcast_lut(streams, gpu_indexes, 0);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
2 * num_radix_blocks, vector_result_sb, small_lwe_vector, luts_array,
allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
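
A quick consistency check for the total_block_count change above. It assumes lsb_vector_block_count = num_radix_blocks * (num_radix_blocks + 1) / 2, which is defined outside this hunk; under that assumption, the old lsb + msb sum equals the new num_radix_blocks * num_radix_blocks value for every block count.

#include <cassert>

int main() {
  for (int n = 1; n <= 128; n++) {
    int lsb = n * (n + 1) / 2;  // assumed lsb_vector_block_count
    int msb = n * (n - 1) / 2;  // msb count from the hunk above
    assert(lsb + msb == n * n); // matches the new total_block_count
  }
  return 0;
}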
@@ -2750,9 +3043,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
tmp_rotated = pre_allocated_buffer;
reuse_memory = true;
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
tmp_rotated, 0,
tmp_rotated->num_radix_blocks);
if (allocate_gpu_memory)
set_zero_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], tmp_rotated, 0,
tmp_rotated->num_radix_blocks);
uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
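
The allocate_gpu_memory guard added above follows the size-tracking dry-run pattern used throughout these buffers: when allocation is disabled, sizes are still accumulated but no kernel may touch the never-allocated memory. A host-only sketch with hypothetical stand-ins for the cuda_* helpers:

#include <cstdint>
#include <cstdlib>

// Stand-in for cuda_malloc_with_size_tracking_async: counts bytes even in
// dry-run mode, allocates only when asked (host malloc as a placeholder).
void *malloc_with_size_tracking(uint64_t size, uint64_t *size_tracker,
                                bool allocate) {
  if (size_tracker != nullptr)
    *size_tracker += size;
  return allocate ? std::malloc(size) : nullptr;
}

// Stand-in for a launch such as set_zero_radix_ciphertext_slice_async.
void set_zero(void *ptr, uint64_t size, bool allocated) {
  if (!allocated)
    return; // the guard above: never touch unallocated buffers
  for (uint64_t i = 0; i < size; i++)
    static_cast<char *>(ptr)[i] = 0;
}

int main() {
  uint64_t size_tracker = 0;
  bool allocate_gpu_memory = false; // dry run: measure, do not allocate
  void *buf =
      malloc_with_size_tracking(1024, &size_tracker, allocate_gpu_memory);
  set_zero(buf, 1024, allocate_gpu_memory);
  // size_tracker == 1024 even though buf == nullptr
  std::free(buf); // free(nullptr) is a no-op
  return 0;
}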
@@ -3918,7 +4212,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 2, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
0);
generate_device_accumulator_bivariate_with_factor<Torus>(
@@ -3927,7 +4222,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, 3, gpu_memory_allocated);
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
0);
@@ -3954,7 +4250,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[0]->get_degree(0),
zero_out_if_overflow_happened[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 2, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
@@ -3962,7 +4259,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[1]->get_degree(0),
zero_out_if_overflow_happened[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, 3, gpu_memory_allocated);
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0);
// merge_overflow_flags_luts
@@ -4378,26 +4676,28 @@ template <typename Torus> struct int_scalar_mul_buffer {
int_sc_prop_memory<Torus> *sc_prop_mem;
bool anticipated_buffers_drop;
bool gpu_memory_allocated;
uint32_t num_ciphertext_bits;
int_scalar_mul_buffer(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, bool anticipated_buffer_drop,
uint64_t *size_tracker) {
uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->anticipated_buffers_drop = anticipated_buffer_drop;
uint32_t msg_bits = (uint32_t)std::log2(params.message_modulus);
size_t num_ciphertext_bits = msg_bits * num_radix_blocks;
num_ciphertext_bits = msg_bits * num_scalar_bits;
//// Contains all shifted values of lhs for shift in range (0..msg_bits)
//// The idea is that with these we can create all other shifts that are
//// in range (0..total_bits) for free (block rotation)
preshifted_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], preshifted_buffer, num_ciphertext_bits,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
streams[0], gpu_indexes[0], preshifted_buffer,
msg_bits * num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
all_shifted_buffer = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -4414,9 +4714,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
streams, gpu_indexes, gpu_count, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_ciphertext_bits, allocate_gpu_memory, size_tracker);
}
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
@@ -4428,9 +4730,11 @@ template <typename Torus> struct int_scalar_mul_buffer {
uint32_t gpu_count) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
all_shifted_buffer, gpu_memory_allocated);
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem->release(streams, gpu_indexes, gpu_count);
delete sum_ciphertexts_vec_mem;
}
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sum_ciphertexts_vec_mem;
delete sc_prop_mem;
delete all_shifted_buffer;
if (!anticipated_buffers_drop) {
@@ -4686,6 +4990,169 @@ template <typename Torus> struct int_div_rem_memory {
}
};
template <typename Torus> struct int_scalar_mul_high {
int_radix_params params;
bool allocate_gpu_memory;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
CudaRadixCiphertextFFI *tmp;
int_scalar_mul_high(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, const bool allocate_gpu_memory,
SHIFT_OR_ROTATE_TYPE shift_type, uint32_t num_scalar_bits,
bool anticipated_buffer_drop, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, shift_type, params,
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, 2 * num_radix_blocks,
num_scalar_bits, allocate_gpu_memory, anticipated_buffer_drop,
size_tracker);
this->tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], tmp, 2 * num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
logical_scalar_shift_mem->release(streams, gpu_indexes, gpu_count);
delete logical_scalar_shift_mem;
scalar_mul_mem->release(streams, gpu_indexes, gpu_count);
delete scalar_mul_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], tmp,
allocate_gpu_memory);
delete tmp;
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_radix_lut<Torus> *lut;
CudaRadixCiphertextFFI *last_block;
CudaRadixCiphertextFFI *padding_block;
int_extend_radix_with_sign_msb_buffer(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t num_additional_blocks,
const bool allocate_gpu_memory, uint64_t *size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->lut = nullptr;
this->last_block = nullptr;
this->padding_block = nullptr;
if (num_additional_blocks != 0) {
this->lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
},
allocate_gpu_memory);
this->last_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], last_block, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->padding_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], padding_block, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (lut != nullptr) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
if (last_block != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], last_block,
allocate_gpu_memory);
delete last_block;
}
if (padding_block != nullptr) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], padding_block,
allocate_gpu_memory);
delete padding_block;
}
}
};
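
A standalone check of the sign-extension LUT generated in the constructor above, for 2-bit blocks (message_modulus = 4): the padding block is all ones exactly when the sign bit of the most significant block is set, after reducing the block mod message_modulus.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t msg_modulus = 4;
  const uint32_t bits_per_block = 2; // log2(msg_modulus)
  // Same function as the lambda passed to generate_device_accumulator.
  auto lut = [&](uint64_t x) -> uint64_t {
    const auto xm = x % msg_modulus;
    const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
    return (msg_modulus - 1) * sign_bit;
  };
  assert(lut(0b01) == 0);     // sign bit clear: pad with zero blocks
  assert(lut(0b10) == 0b11);  // sign bit set: pad with all-ones blocks
  assert(lut(0b111) == 0b11); // carry bits are reduced mod msg_modulus first
  return 0;
}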
void update_degrees_after_bitand(uint64_t *output_degrees,
uint64_t *lwe_array_1_degrees,
uint64_t *lwe_array_2_degrees,

View File

@@ -0,0 +1,13 @@
#ifndef CUDA_BOOTSTRAP_128_H
#define CUDA_BOOTSTRAP_128_H
#include "pbs_enums.h"
#include <stdint.h>
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array);
#endif // CUDA_BOOTSTRAP_128_H

View File

@@ -240,14 +240,13 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
}
};
template <PBS_TYPE pbs_type> struct pbs_buffer_128;
template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
int8_t *d_mem;
__uint128_t *global_accumulator;
double *global_join_buffer;
__uint128_t *temp_lwe_array_in;
InputTorus *temp_lwe_array_in;
uint64_t *trivial_indexes;
PBS_VARIANT pbs_variant;
bool uses_noise_reduction;
@@ -263,11 +262,25 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;
this->uses_noise_reduction = allocate_ms_array;
this->temp_lwe_array_in =
(__uint128_t *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(__uint128_t),
stream, gpu_index, size_tracker, allocate_ms_array);
if (allocate_ms_array) {
this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
stream, gpu_index);
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
size_tracker, allocate_ms_array);
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
h_trivial_indexes[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
trivial_indexes, h_trivial_indexes,
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
allocate_gpu_memory);
cuda_synchronize_stream(stream, gpu_index);
delete[] h_trivial_indexes;
}
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
@@ -404,9 +417,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);
if (uses_noise_reduction)
if (uses_noise_reduction) {
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
gpu_memory_allocated);
}
}
};
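
The trivial_indexes buffer allocated above is simply the identity permutation; it lets kernels that expect per-ciphertext index arrays process an unpermuted batch without a separate code path. A host-side sketch of the fill:

#include <cstdint>
#include <vector>

int main() {
  const uint32_t input_lwe_ciphertext_count = 8;
  std::vector<uint64_t> h_trivial_indexes(input_lwe_ciphertext_count);
  for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
    h_trivial_indexes[i] = i; // identity: the i-th LWE stays the i-th LWE
  // In the real buffer this array is copied to the GPU once and reused.
  return 0;
}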
@@ -502,7 +518,12 @@ template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count);
uint32_t level_count,
uint32_t max_shared_memory);
bool has_support_to_cuda_programmable_bootstrap_128_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,

View File

@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);

View File

@@ -112,15 +112,15 @@ template <typename Torus> struct zk_expand_mem {
// Hint for future readers: if message_modulus == 4 then
// packed_messages_per_lwe becomes 2
auto packed_messages_per_lwe = log2_int(params.message_modulus);
auto num_packed_msgs = log2_int(params.message_modulus);
// Adjust indexes to permute the output and access the correct LUT
auto h_indexes_in = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_indexes_out = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_lut_indexes = static_cast<Torus *>(
malloc(packed_messages_per_lwe * num_lwes * sizeof(Torus)));
malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
auto h_body_id_per_compact_list =
static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
auto h_lwe_compact_input_indexes =
@@ -138,6 +138,10 @@ template <typename Torus> struct zk_expand_mem {
auto compact_list_id = 0;
auto idx = 0;
auto count = 0;
// During flattening, all num_lwes LWEs from all compact lists are stored
// sequentially in a Torus array. h_lwe_compact_input_indexes stores the
// index of the first LWE of the compact list that contains the i-th
// LWE
for (int i = 0; i < num_lwes; i++) {
h_lwe_compact_input_indexes[i] = idx;
count++;
@@ -148,6 +152,8 @@ template <typename Torus> struct zk_expand_mem {
}
}
// Stores, for the k-th compact list, the index of the i-th LWE within
// that list.
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
@@ -159,46 +165,75 @@ template <typename Torus> struct zk_expand_mem {
offset += num_lwes_in_kth_compact_list;
}
/*
* Each LWE contains encrypted data in both carry and message spaces
* that needs to be extracted.
*
* The loop processes each compact list (k) and for each LWE within that
* list:
* 1. Sets input indexes to read each LWE twice (for carry and message
* extraction)
* 2. Creates output indexes to properly reorder the results
* 3. Selects appropriate LUT index based on whether boolean sanitization is
* needed
*
* We want the output to have always first the content of the message part
* and then the content of the carry part of each LWE.
*
* i.e. msg_extract(LWE_0), carry_extract(LWE_0), msg_extract(LWE_1),
* carry_extract(LWE_1), ...
*
* Aiming that behavior, with 4 LWEs we would have:
*
* // Each LWE is processed twice
* h_indexes_in = {0, 1, 2, 3, 0, 1, 2, 3}
*
* // First 4 use message LUT, last 4 use carry LUT
* h_lut_indexes = {0, 0, 0, 0, 1, 1, 1, 1}
*
* // Reorders output so message and carry for each LWE appear together
* h_indexes_out = {0, 2, 4, 6, 1, 3, 5, 7}
*
* If an LWE contains a boolean value, its LUT index is shifted by
* num_packed_msgs to use the sanitization LUT (which ensures output is
* exactly 0 or 1).
*/
offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
for (int i = 0;
i < packed_messages_per_lwe * num_lwes_in_kth_compact_list; i++) {
Torus j = i % num_lwes_in_kth_compact_list;
h_indexes_in[i + packed_messages_per_lwe * offset] = j + offset;
h_indexes_out[i + packed_messages_per_lwe * offset] =
packed_messages_per_lwe * (j + offset) +
(i / num_lwes_in_kth_compact_list);
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
// If the input relates to a boolean, shift the LUT so the correct one
// with sanitization is used
h_lut_indexes[i + packed_messages_per_lwe * offset] =
(is_boolean_array[h_indexes_out[i +
packed_messages_per_lwe * offset]]
? packed_messages_per_lwe
: 0) +
i / num_lwes_in_kth_compact_list;
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth_compact_list;
offset += num_lwes_in_kth;
}
message_and_carry_extract_luts->set_lwe_indexes(
streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes,
packed_messages_per_lwe * num_lwes * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
d_body_id_per_compact_list, h_body_id_per_compact_list,
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes, 0);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),
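
The loop above generalizes the 4-LWE example from the comment to several compact lists. A host-only C++ sketch reproducing the same index computation for two compact lists of two LWEs each (num_packed_msgs = 2, no booleans); it prints h_indexes_in = 0 1 0 1 2 3 2 3, h_indexes_out = 0 2 1 3 4 6 5 7 and h_lut_indexes = 0 0 1 1 0 0 1 1:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int num_packed_msgs = 2;           // log2(message_modulus) for 2_2
  std::vector<int> lwes_per_list = {2, 2}; // two compact lists of two LWEs
  const int num_lwes = 4;
  std::vector<uint64_t> in(num_packed_msgs * num_lwes);
  std::vector<uint64_t> out(in.size()), lut(in.size());
  std::vector<bool> is_boolean(in.size(), false);

  int offset = 0;
  for (int num_in_list : lwes_per_list) {
    for (int i = 0; i < num_packed_msgs * num_in_list; i++) {
      int lwe_index = i + num_packed_msgs * offset;
      in[lwe_index] = i % num_in_list + offset;
      out[lwe_index] = num_packed_msgs * in[lwe_index] + i / num_in_list;
      int boolean_offset = is_boolean[out[lwe_index]] ? num_packed_msgs : 0;
      lut[lwe_index] = i / num_in_list + boolean_offset;
    }
    offset += num_in_list;
  }
  for (auto v : in) std::cout << v << ' ';  // 0 1 0 1 2 3 2 3
  std::cout << '\n';
  for (auto v : out) std::cout << v << ' '; // 0 2 1 3 4 6 5 7
  std::cout << '\n';
  for (auto v : lut) std::cout << v << ' '; // 0 0 1 1 0 0 1 1
  std::cout << '\n';
  return 0;
}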

View File

@@ -84,15 +84,19 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
static_cast<uint64_t *>(lwe_array_out), size, log_modulus);
}
// This entry point is used only for testing purposes;
// its output always follows trivial ordering
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
double r_sigma, double bound, uint32_t log_modulus) {
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus) {
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_array_indexes),
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
num_zeros, input_variance, r_sigma, bound, log_modulus);
}

View File

@@ -172,14 +172,14 @@ __host__ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe(
// allocate at least LWE-mask times two: to keep both decomposition state and
// decomposed intermediate value
int memory_unit = glwe_accumulator_size > lwe_dimension * 2
? glwe_accumulator_size
: lwe_dimension * 2;
uint64_t memory_unit = glwe_accumulator_size > lwe_dimension * 2
? glwe_accumulator_size
: lwe_dimension * 2;
uint64_t size_tracker;
uint64_t buffer_size = 2 * num_lwes * memory_unit * sizeof(Torus);
*fp_ks_buffer = (int8_t *)cuda_malloc_with_size_tracking_async(
2 * num_lwes * memory_unit * sizeof(Torus), stream, gpu_index,
&size_tracker, allocate_gpu_memory);
buffer_size, stream, gpu_index, &size_tracker, allocate_gpu_memory);
return size_tracker;
}
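
The widening of memory_unit above matters because, with 32-bit arithmetic, the element count 2 * num_lwes * memory_unit can wrap before sizeof(Torus) promotes the product to 64 bits, silently under-sizing the buffer. A minimal illustration:

#include <cstdint>
#include <iostream>

int main() {
  uint32_t num_lwes = 1u << 16;
  uint32_t memory_unit = 1u << 16; // e.g. a large glwe_accumulator_size
  // The 32-bit product wraps modulo 2^32 before any widening:
  uint32_t wrapped = 2 * num_lwes * memory_unit;           // 0
  uint64_t widened = 2 * num_lwes * (uint64_t)memory_unit; // 2^33
  std::cout << wrapped << " vs " << widened << "\n";
  return 0;
}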

View File

@@ -178,11 +178,12 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
// Each thread processes two elements of the lwe array
template <typename Torus>
__global__ void __launch_bounds__(512)
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
const uint64_t *indexes, const Torus *zeros,
int lwe_size, int num_zeros,
double input_variance, double r_sigma,
double bound, uint32_t log_modulus) {
// First we will assume size is less than the number of threads per block
// I should switch this to dynamic shared memory
@@ -198,13 +199,14 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
// These are probably not needed because we set the values below
sum_mask_errors[threadIdx.x] = 0.f;
sum_squared_mask_errors[threadIdx.x] = 0.f;
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
// We use modulus switch to gather the output in trivial order
auto this_block_lwe_out = array_out + blockIdx.x * lwe_size;
Torus input_element1 = this_block_lwe_in[threadIdx.x];
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
? this_block_lwe_in[threadIdx.x + blockDim.x]
: 0;
// Base noise is only handled by thread 0
double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +220,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
__syncthreads();
if (found)
this_block_lwe_out[threadIdx.x] = input_element1;
if (found && (threadIdx.x + blockDim.x) < lwe_size)
this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
__syncthreads();
// If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +254,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
// Assumption we always have at least 512 elements
// If we find a useful zero encryption we replace the lwe by lwe + zero
if (found)
this_block_lwe_out[threadIdx.x] = zero_element1;
if (found && (threadIdx.x + blockDim.x) < lwe_size)
this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
__syncthreads();
// If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +270,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
template <typename Torus>
__host__ void host_improve_noise_modulus_switch(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
const double input_variance, const double r_sigma, const double bound,
uint32_t log_modulus) {
if (lwe_size < 512) {
PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +290,8 @@ __host__ void host_improve_noise_modulus_switch(
int num_threads = 512, num_blocks = num_lwes;
improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
r_sigma, bound, log_modulus);
check_cuda_error(cudaGetLastError());
}
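The added `indexes` argument lets each block read its input LWE from an arbitrary position while the output stays in trivial order, which is the classic gather pattern; a minimal sketch with hypothetical names:
template <typename Torus>
__global__ void gather_lwe(Torus *array_out, const Torus *array_in,
const uint64_t *indexes, int lwe_size) {
// block b writes output row b from input row indexes[b]
const Torus *src = array_in + indexes[blockIdx.x] * lwe_size;
Torus *dst = array_out + blockIdx.x * lwe_size;
for (int i = threadIdx.x; i < lwe_size; i += blockDim.x)
dst[i] = src[i];
}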

View File

@@ -492,6 +492,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
batch_convert_u128_to_f128_as_integer<params>
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
d_standard);
check_cuda_error(cudaGetLastError());
// call negacyclic 128 bit forward fft.
if (full_sm) {
@@ -503,6 +504,7 @@ __host__ void host_fourier_transform_forward_as_integer_f128(
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
}
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
gpu_index);

View File

@@ -63,7 +63,7 @@ void update_degrees_after_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if ((max | j) > result) {
result = max | j;
}
}
@@ -82,7 +82,7 @@ void update_degrees_after_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
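The added parentheses matter because `>` binds tighter than `|` and `^` in C++, so the old condition OR-ed the degree with a boolean instead of comparing the OR; a minimal demonstration:
#include <cassert>
int main() {
unsigned max = 4, j = 2, result = 4;
// old parse: max | (j > result) == 4 | 0 == 4, so the branch never fired
assert((max | j > result) == 4u);
// fixed parse: (max | j) == 6 is correctly compared against result
assert((max | j) > result);
}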

View File

@@ -36,7 +36,7 @@ __host__ void host_integer_radix_bitop_kb(
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
lwe_array_2->degrees,
lwe_array_1->num_radix_blocks);

View File

@@ -0,0 +1,62 @@
#include "cast.cuh"
void extend_radix_with_trivial_zero_blocks_msb_64(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
void *const *streams, uint32_t const *gpu_indexes) {
host_extend_radix_with_trivial_zero_blocks_msb<uint64_t>(
output, input, (cudaStream_t *)streams, gpu_indexes);
}
void trim_radix_blocks_lsb_64(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
void *const *streams,
uint32_t const *gpu_indexes) {
host_trim_radix_blocks_lsb<uint64_t>(output, input, (cudaStream_t *)streams,
gpu_indexes);
}
uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t num_additional_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_extend_radix_with_sign_msb_buffer<uint64_t> **)mem_ptr, params,
num_blocks, num_additional_blocks, allocate_gpu_memory);
}
void cuda_extend_radix_with_sign_msb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr, uint32_t num_additional_blocks, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
host_extend_radix_with_sign_msb<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
}
void cleanup_cuda_extend_radix_with_sign_msb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_extend_radix_with_sign_msb_buffer<uint64_t> *mem_ptr =
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -0,0 +1,94 @@
#ifndef CAST_CUH
#define CAST_CUH
#include "device.h"
#include "integer.cuh"
#include "integer/integer_utilities.h"
template <typename Torus>
__host__ void host_extend_radix_with_trivial_zero_blocks_msb(
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams, uint32_t const *gpu_indexes) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input->num_radix_blocks, input, 0,
input->num_radix_blocks);
}
template <typename Torus>
__host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
cudaStream_t const *streams,
uint32_t const *gpu_indexes) {
const uint32_t input_start_lwe_index =
input->num_radix_blocks - output->num_radix_blocks;
if (input->num_radix_blocks <= output->num_radix_blocks) {
PANIC("Cuda error: input num blocks should be greater than output num "
"blocks");
}
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
input_start_lwe_index, input->num_radix_blocks);
}
template <typename Torus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
num_additional_blocks, allocate_gpu_memory, &size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_extend_radix_with_sign_msb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
if (num_additional_blocks == 0) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], output,
input);
return;
}
const uint32_t input_blocks = input->num_radix_blocks;
if (input_blocks == 0) {
PANIC("Cuda error: input blocks cannot be zero");
}
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
0, input_blocks, input, 0,
input_blocks);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->last_block, 0, 1, input,
input_blocks - 1, input_blocks);
host_apply_univariate_lut_kb(
streams, gpu_indexes, gpu_count, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks, ms_noise_reduction_key, bsks);
for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
dst_block_idx, dst_block_idx + 1,
mem_ptr->padding_block, 0, 1);
}
}
#endif
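The padding block computed through host_apply_univariate_lut_kb replicates the sign of the most significant block into every added block. A plausible shape for that LUT function, assuming the usual sign convention for radix integers (the real table is built inside int_extend_radix_with_sign_msb_buffer, which is not shown here):
// the sign of a radix integer is the top bit of its last block;
// negative values extend with all-ones message blocks, positive with zeros
uint64_t sign_padding_lut(uint64_t last_block, uint64_t message_modulus) {
bool is_negative = last_block >= message_modulus / 2;
return is_negative ? message_modulus - 1 : 0;
}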

View File

@@ -456,7 +456,7 @@ __host__ void tree_sign_reduction(
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -477,16 +477,17 @@ __host__ void tree_sign_reduction(
auto last_lut = tree_buffer->tree_last_leaf_lut;
auto block_selector_f = tree_buffer->block_selector_f;
std::function<Torus(Torus)> f;
auto num_bits_in_message = log2_int(params.message_modulus);
if (partial_block_count == 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, partial_block_count,
message_modulus);
f = [block_selector_f, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
Torus final_sign = block_selector_f(msb, lsb);
return sign_handler_f(final_sign);
};
} else {
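The hard-coded 4 and (x >> 2) & 3 silently assumed message_modulus == 4; the generalized pack/unpack arithmetic they are replaced with, as a minimal sketch:
#include <cassert>
#include <cstdint>
int main() {
uint64_t message_modulus = 4;      // e.g. 2_2 parameters
uint64_t num_bits_in_message = 2;  // log2_int(message_modulus)
uint64_t msb = 3, lsb = 1;
// pack_blocks merges two blocks into one: msb * modulus + lsb
uint64_t packed = msb * message_modulus + lsb;
// the LUT body recovers both halves with a shift and a mask
assert(((packed >> num_bits_in_message) & (message_modulus - 1)) == msb);
assert((packed & (message_modulus - 1)) == lsb);
// for message_modulus == 4 this reduces exactly to the old (x >> 2) & 3
}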

View File

@@ -386,8 +386,9 @@ __host__ void host_unsigned_integer_div_rem_kb(
subtraction_overflowed,
at_least_one_upper_block_is_non_zero, 1);
auto message_modulus = radix_params.message_modulus;
int factor = (i) ? message_modulus - 1 : message_modulus - 2;
int factor_lut_id = (i) ? 1 : 0;
for (size_t k = 0;
k < cleaned_merged_interesting_remainder->num_radix_blocks; k++) {
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],

View File

@@ -386,3 +386,69 @@ void reverseArray(uint64_t arr[], size_t n) {
end--;
}
}
uint64_t scratch_cuda_apply_noise_squashing_mem(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
uint64_t scratch_cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_glwe_dimension,
uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
PUSH_RANGE("scratch noise squashing")
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
uint64_t size_tracker = scratch_cuda_apply_noise_squashing_mem(
streams, gpu_indexes, gpu_count, params,
(int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
input_polynomial_size, num_radix_blocks, original_num_blocks,
allocate_gpu_memory);
POP_RANGE()
return size_tracker;
}
void cuda_apply_noise_squashing_kb(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
POP_RANGE()
}
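These three entry points follow the scratch/apply/cleanup lifecycle used throughout the CUDA backend; a sketch of the calling sequence (the elided arguments stand for the parameter list above, not real values):
int8_t *mem_ptr = nullptr;
// scratch: allocates device buffers and returns the tracked size in bytes
uint64_t size_tracker = scratch_cuda_apply_noise_squashing_kb(
streams, gpu_indexes, gpu_count, &mem_ptr, /* ...params... */);
// apply: can be called repeatedly while the buffer is alive
cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
output_radix_lwe, input_radix_lwe, mem_ptr,
ksks, ms_noise_reduction_key, bsks);
// cleanup: releases everything the scratch call allocated
cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
&mem_ptr);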

View File

@@ -9,6 +9,7 @@
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
@@ -520,8 +521,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
if (num_radix_blocks > lut->num_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of lut radix blocks")
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
PANIC("Cuda error: num radix blocks on which lut is applied should be "
"smaller or equal to the number of input & output radix blocks")
@@ -866,7 +866,7 @@ uint64_t generate_lookup_table_with_encoding(
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
auto body = &acc[glwe_dimension * polynomial_size];
Torus degree = 0;
// This accumulator extracts the carry bits
for (int i = 0; i < input_modulus_sup; i++) {
@@ -886,7 +886,7 @@ uint64_t generate_lookup_table_with_encoding(
}
rotate_left<Torus>(body, half_box_size, polynomial_size);
return (uint64_t)degree;
}
template <typename Torus>
@@ -1291,7 +1291,7 @@ void host_compute_prefix_sum_hillis_steele(
}
// This function is used to perform step 2 of Thomas' new propagation algorithm
// Consists of three steps:
// - propagates the carry within each group with cheap LWE operations stored in
// simulators
// - calculates the propagation state of each group
@@ -1616,10 +1616,12 @@ __host__ void reduce_signs(
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto num_bits_in_message = log2_int(message_modulus);
std::function<Torus(Torus)> reduce_two_orderings_function =
[diff_buffer, sign_handler_f, num_bits_in_message,
message_modulus](Torus x) -> Torus {
Torus msb = (x >> num_bits_in_message) & (message_modulus - 1);
Torus lsb = x & (message_modulus - 1);
return diff_buffer->tree_buffer->block_selector_f(msb, lsb);
};
@@ -1640,7 +1642,7 @@ __host__ void reduce_signs(
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, num_sign_blocks / 2);
@@ -1669,7 +1671,8 @@ __host__ void reduce_signs(
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes, 0);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
ms_noise_reduction_key, lut, 1);
@@ -1677,8 +1680,8 @@ __host__ void reduce_signs(
} else {
std::function<Torus(Torus)> final_lut_f =
[mem_ptr, sign_handler_f, message_modulus](Torus x) -> Torus {
return sign_handler_f(x & (message_modulus - 1));
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
@@ -1831,9 +1834,6 @@ void host_propagate_single_carry(
PUSH_RANGE("propagate sc")
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;
auto lut_stride = mem->lut_stride;
auto num_many_lut = mem->num_many_lut;
CudaRadixCiphertextFFI output_flag;
@@ -1849,6 +1849,7 @@ void host_propagate_single_carry(
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
input_carries, 1);
}
// Step 1
host_compute_shifted_blocks_and_states<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, mem->shifted_blocks_state_mem,
@@ -2197,4 +2198,110 @@ void host_single_borrow_propagate(
}
}
/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower or equal
/// to the input and output numbers of blocks
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
InputTorus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
PUSH_RANGE("apply noise squashing")
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
if (lwe_array_out->num_radix_blocks !=
(lwe_array_in->num_radix_blocks + 1) / 2)
PANIC("Cuda error: num output radix blocks should be "
"half ceil the number input radix blocks")
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec =
lut->lwe_trivial_indexes_vec;
// We know carry is empty so we can pack two blocks in one
pack_blocks<InputTorus>(streams[0], gpu_indexes[0], lwe_array_pbs_in,
lwe_array_in, lwe_array_in->num_radix_blocks,
params.message_modulus);
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_gpu_count =
get_active_gpu_count(lwe_array_out->num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
lwe_array_out->num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
lwe_trivial_indexes_vec, lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, lwe_array_out->num_radix_blocks);
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, lwe_array_out->num_radix_blocks);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
(__uint128_t *)lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < lut->num_blocks; i++) {
lwe_array_out->degrees[i] = lut->degrees[0];
lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
}
POP_RANGE()
}
#endif // TFHE_RS_INTERNAL_INTEGER_CUH
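The half-ceil relation enforced at the top of this function follows from packing pairs of blocks; a minimal sketch:
// with empty carries, two radix blocks fit into one packed block,
// so n input blocks squash into ceil(n / 2) output blocks
uint32_t squashed_block_count(uint32_t num_input_blocks) {
return (num_input_blocks + 1) / 2;
}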

View File

@@ -226,72 +226,68 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec,
bool reduce_degrees_for_single_carry_propagation, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, radix_lwe_out,
radix_lwe_vec, reduce_degrees_for_single_carry_propagation, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
radix_lwe_out->num_radix_blocks,
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -20,28 +20,11 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
template <typename Torus, class params>
__global__ void
all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
@@ -94,33 +77,155 @@ all_shifted_lhs_rhs(Torus const *radix_lwe_left, Torus *lsb_ciphertext,
}
}
__global__ inline void radix_vec_to_columns(uint32_t *const *const columns,
uint32_t *const columns_counter,
const uint64_t *const degrees,
const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec) {
const uint32_t idx = threadIdx.x;
size_t cnt = 0;
for (int i = 0; i < num_radix_in_vec; i++) {
size_t ct_id = i * num_radix_blocks + idx;
if (degrees[ct_id] != 0) {
columns[idx][cnt] = ct_id;
++cnt;
}
}
columns_counter[idx] = cnt;
}
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t *const *const columns,
const uint32_t *const columns_counter, const uint32_t chunk_size) {
__shared__ uint32_t counter;
if (threadIdx.x == 0) {
counter = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
__syncthreads();
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
}
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
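// Worked example (assuming num_radix_blocks = 4 and 2 * num_radix_blocks
// threads):
//   idx:             0 1 2 3 | 4 5 6 7
//   pbs_indexes_in:  0 1 2 3 | 0 1 2 3   (idx % num_radix_blocks)
//   pbs_indexes_out: 0 1 2 3 | 5 6 7 8   (idx + idx / num_radix_blocks)
//   lut_indexes:     0 0 0 0 | 1 1 1 1   (0 = message LUT, 1 = carry LUT)
// Each keyswitched block feeds two PBSes: the message LUT writes block i
// in place while the carry LUT writes into block i + 1.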
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
@@ -167,6 +272,113 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
struct radix_columns {
std::vector<size_t> columns_counter;
size_t num_blocks;
size_t num_radix_in_vec;
size_t chunk_size;
radix_columns(const uint64_t *const input_degrees, size_t num_blocks,
size_t num_radix_in_vec, size_t chunk_size,
bool &needs_processing)
: num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
chunk_size(chunk_size) {
needs_processing = false;
columns_counter.resize(num_blocks, 0);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns_counter[j] += 1;
}
}
for (size_t i = 0; i < num_blocks; ++i) {
if (columns_counter[i] > chunk_size) {
needs_processing = true;
break;
}
}
}
void next_accumulation(size_t &total_ciphertexts, size_t &message_ciphertexts,
bool &needs_processing) {
message_ciphertexts = 0;
total_ciphertexts = 0;
needs_processing = false;
for (int i = num_blocks - 1; i > 0; --i) {
size_t cur_count = columns_counter[i];
size_t prev_count = columns_counter[i - 1];
size_t new_count = 0;
// accumulated blocks from the current column
new_count += cur_count / chunk_size;
// all accumulated message blocks need a pbs
message_ciphertexts += new_count;
// carry blocks from the previous column
new_count += prev_count / chunk_size;
// both carry and message blocks that need a pbs
total_ciphertexts += new_count;
// now add the remaining non-accumulated blocks that do not require a pbs
new_count += cur_count % chunk_size;
columns_counter[i] = new_count;
if (new_count > chunk_size)
needs_processing = true;
}
// now do it for 0th block
size_t new_count = columns_counter[0] / chunk_size;
message_ciphertexts += new_count;
total_ciphertexts += new_count;
new_count += columns_counter[0] % chunk_size;
columns_counter[0] = new_count;
if (new_count > chunk_size) {
needs_processing = true;
}
}
};
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}
for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}
for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
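// Worked example (assuming 2_2 parameters): message_modulus = 4 and
// carry_modulus = 4, so chunk_size = (4 * 4 - 1) / (4 - 1) = 5: up to five
// blocks of degree 3 can be accumulated into one column entry before the
// sum (15) would overflow the message + carry space, at which point a PBS
// splits it back into a message block and a carry block.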
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -185,11 +397,14 @@ template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms,
bool reduce_degrees_for_single_carry_propagation, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
if (terms->lwe_dimension != radix_lwe_out->lwe_dimension)
PANIC("Cuda error: output and input radix ciphertexts should have the same "
@@ -199,22 +414,29 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
if (num_radix_in_vec == 0)
return;
auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto chunk_size =
(mem_ptr->params.message_modulus * mem_ptr->params.carry_modulus - 1) /
(mem_ptr->params.message_modulus - 1);
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
// In the case of extracting a single LWE these parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +450,195 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}
if (mem_ptr->mem_reuse) {
mem_ptr->setup_lookup_tables(streams, gpu_indexes, gpu_count);
}
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
radix_vec_to_columns<<<1, num_radix_blocks, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
num_radix_in_vec);
bool needs_processing = false;
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
check_cuda_error(cudaGetLastError());
while (needs_processing) {
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0, streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0), d_columns,
d_columns_counter, chunk_size);
size_t total_ciphertexts;
size_t total_messages;
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
auto active_gpu_count = get_active_gpu_count(total_ciphertexts, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks,
current_blocks, bsks, ksks, ms_noise_reduction_key,
luts_message_carry, total_ciphertexts);
}
cuda_set_device(gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
}
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
if (reduce_degrees_for_single_carry_propagation) {
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
auto active_gpu_count =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, active_gpu_count, current_blocks, radix_lwe_out,
bsks, ksks, ms_noise_reduction_key, luts_message_carry,
num_blocks_in_apply_lut);
}
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
num_radix_blocks, num_radix_in_vec, chunk_size,
mem_ptr->params.message_modulus);
cuda_set_device(gpu_indexes[0]);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice,
num_radix_blocks);
}
}
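The multi-GPU branch above stages the LUT indexes through page-locked (pinned) host buffers: cudaMemcpyAsync only runs truly asynchronously when the host side is pinned, and the stream must be synchronized before the host reads the copied data back. A minimal sketch of that round-trip, with illustrative names and sizes rather than the backend's helpers:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t n = 8;
  uint64_t *d_indexes;
  cudaMalloc(&d_indexes, n * sizeof(uint64_t));
  cudaMemset(d_indexes, 0, n * sizeof(uint64_t));

  // Page-locked host memory lets the async copy overlap with the stream.
  uint64_t *h_pinned;
  cudaMallocHost((void **)&h_pinned, n * sizeof(uint64_t));

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(h_pinned, d_indexes, n * sizeof(uint64_t),
                  cudaMemcpyDeviceToHost, stream);
  // The data is only valid on the host after the stream synchronizes,
  // mirroring the cuda_synchronize_stream call in the branch above.
  cudaStreamSynchronize(stream);

  for (uint32_t i = 0; i < n; i++)
    printf("%llu ", (unsigned long long)h_pinned[i]);
  printf("\n");

  cudaFreeHost(h_pinned);
  cudaFree(d_indexes);
  cudaStreamDestroy(stream);
  return 0;
}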
template <typename Torus, class params>
@@ -600,9 +773,9 @@ __host__ void host_integer_mult_radix_kb(
terms_degree_msb[i] = (b_id > r_id) ? message_modulus - 2 : 0;
}
host_integer_partial_sum_ciphertexts_vec_kb<Torus, params>(
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, bsks,
ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
2 * num_blocks, mem_ptr->luts_array);
streams, gpu_indexes, gpu_count, radix_lwe_out, vector_result_sb, true,
bsks, ksks, ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem,
num_blocks, 2 * num_blocks);
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;

View File

@@ -34,7 +34,7 @@ void update_degrees_after_scalar_bitor(uint64_t *output_degrees,
auto result = max;
for (uint j = 0; j < min + 1; j++) {
if (max | j > result) {
if ((max | j) > result) {
result = max | j;
}
}
@@ -52,7 +52,7 @@ void update_degrees_after_scalar_bitxor(uint64_t *output_degrees,
// Try every possibility to find the worst case
for (uint j = 0; j < min + 1; j++) {
if (max ^ j > result) {
if ((max ^ j) > result) {
result = max ^ j;
}
}
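Both fixes above address the same C++ precedence bug: '>' binds tighter than '|' and '^', so 'max | j > result' parses as 'max | (j > result)' and ORs a boolean into max instead of comparing the OR. A small self-contained check that the two readings genuinely disagree:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t max = 1, j = 0, result = 2;
  bool old_form = max | (j > result); // 1 | 0 == 1 -> true (spurious)
  bool new_form = (max | j) > result; // 1 > 2      -> false (intended)
  assert(old_form && !new_form);
  return 0;
}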

View File

@@ -6,7 +6,8 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory, bool allocate_ms_array) {
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool allocate_gpu_memory,
bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
@@ -17,7 +18,28 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
return scratch_cuda_integer_radix_scalar_mul_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_buffer<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
num_scalar_bits, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
allocate_ms_array);
return scratch_cuda_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_scalar_mul_high<uint64_t> **)mem_ptr, num_blocks, params,
num_scalar_bits, anticipated_buffer_drop, allocate_gpu_memory);
}
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
@@ -83,6 +105,21 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
}
}
void cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *ct, int8_t *mem_ptr, void *const *ksks,
uint64_t rhs, uint64_t const *decomposed_scalar,
uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
host_integer_radix_scalar_mul_high_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, ct,
(int_scalar_mul_high<uint64_t> *)mem_ptr, (uint64_t **)ksks, rhs,
decomposed_scalar, has_at_least_one_set, ms_noise_reduction_key, bsks,
num_scalars);
}
void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
@@ -93,3 +130,13 @@ void cleanup_cuda_integer_radix_scalar_mul(void *const *streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_scalar_mul_high<uint64_t> *mem_ptr =
(int_scalar_mul_high<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}

View File

@@ -6,6 +6,7 @@
#include <cuda_runtime.h>
#endif
#include "cast.cuh"
#include "device.h"
#include "integer/integer_utilities.h"
#include "multiplication.cuh"
@@ -32,12 +33,12 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, true, &size_tracker);
num_scalar_bits, allocate_gpu_memory, true, &size_tracker);
return size_tracker;
}
@@ -115,13 +116,10 @@ __host__ void host_integer_scalar_mul_radix(
set_zero_radix_ciphertext_slice_async<T>(streams[0], gpu_indexes[0],
lwe_array, 0, num_radix_blocks);
} else {
for (int i = 0; i < j * num_radix_blocks; i++) {
all_shifted_buffer->degrees[i] = message_modulus - 1;
}
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, bsks,
ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j, nullptr);
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer, true,
bsks, ksks, ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
@@ -170,4 +168,109 @@ __host__ void host_integer_small_scalar_mul_radix(
output_lwe_array->degrees[i] = input_lwe_array->degrees[i] * scalar;
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_scalar_mul_high<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool anticipated_buffer_drop,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_high<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
allocate_gpu_memory, LEFT_SHIFT, num_scalar_bits, anticipated_buffer_drop,
&size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void host_integer_radix_scalar_mul_high_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high<Torus> *mem_ptr, Torus *const *ksks, uint64_t rhs,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_scalars) {
if (rhs == (uint64_t)0) {
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], ct,
0, ct->num_radix_blocks);
return;
}
CudaRadixCiphertextFFI *tmp_ffi = mem_ptr->tmp;
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(tmp_ffi, ct, streams,
gpu_indexes);
if (rhs != (uint64_t)1 || tmp_ffi->num_radix_blocks != 0) {
if ((rhs & (rhs - 1)) == 0) {
uint32_t shift = std::log2(rhs);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, tmp_ffi, shift,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
} else {
switch (mem_ptr->params.polynomial_size) {
case 512:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<512>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 1024:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<1024>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 2048:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<2048>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 4096:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<4096>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 8192:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<8192>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
case 16384:
host_integer_scalar_mul_radix<uint64_t, AmortizedDegree<16384>>(
streams, gpu_indexes, gpu_count, tmp_ffi, decomposed_scalar,
has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks,
(uint64_t **)ksks, ms_noise_reduction_key,
mem_ptr->params.message_modulus, num_scalars);
break;
default:
PANIC(
"Cuda error (scalar multiplication): unsupported polynomial size. "
"Only N = 512, 1024, 2048, 4096, 8192, 16384 are supported.")
}
}
}
host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams, gpu_indexes);
}
#endif
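host_integer_radix_scalar_mul_high_kb above takes a fast path when rhs is a power of two: '(rhs & (rhs - 1)) == 0' is the standard test (rhs == 0 is handled earlier in the function), and the multiplication then degenerates to a logical left shift. std::log2 on a double is exact for 64-bit powers of two, but an integer count-trailing-zeros gives the same shift without floating point; a quick sketch of the equivalence:

#include <bit>
#include <cassert>
#include <cstdint>

static bool is_pow2(uint64_t x) { return x != 0 && (x & (x - 1)) == 0; }

int main() {
  assert(is_pow2(1) && std::countr_zero(uint64_t{1}) == 0);
  assert(is_pow2(1024) && std::countr_zero(uint64_t{1024}) == 10);
  assert(!is_pow2(12));
  // Multiplying by 2^k is a left shift by k.
  uint64_t ct = 5;
  assert(ct * 1024 == ct << std::countr_zero(uint64_t{1024}));
  return 0;
}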

View File

@@ -0,0 +1,46 @@
#include "subtraction.cuh"
uint64_t scratch_cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, uint32_t requested_flag,
bool allocate_gpu_memory, bool allocate_ms_array) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, allocate_ms_array);
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sub_and_propagate<uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_kb_64_inplace(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *lhs_array, const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_sub_and_propagate_single_carry<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lhs_array, rhs_array,
carry_out, carry_in, (int_sub_and_propagate<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, requested_flag, uses_carry);
}
void cleanup_cuda_sub_and_propagate_single_carry(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sub_and_propagate<uint64_t> *mem_ptr =
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)streams, gpu_indexes, gpu_count);
}

View File

@@ -8,7 +8,46 @@
#include "device.h"
#include "integer/integer.h"
#include "linear_algebra.h"
#include "integer/integer_utilities.h"
#include "negation.cuh"
#include "pbs/pbs_enums.h"
template <typename Torus>
uint64_t scratch_cuda_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_sub_and_propagate<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks, requested_flag,
allocate_gpu_memory, &size_tracker);
return size_tracker;
}
template <typename Torus>
void host_sub_and_propagate_single_carry(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, lhs_array, mem->neg_rhs_array, carry_out,
input_carries, mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
}
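host_sub_and_propagate_single_carry computes lhs - rhs by negating rhs blockwise and reusing the add-with-carry path. Arithmetically, over k radix blocks with message modulus B this is just a - b ≡ a + (B^k - b) (mod B^k); a plain-integer model of that identity:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t B = 4; // message modulus per block
  const uint32_t k = 4; // number of radix blocks
  uint64_t M = 1;       // M = B^k = 256
  for (uint32_t i = 0; i < k; i++)
    M *= B;
  uint64_t a = 200, b = 77;
  uint64_t neg_b = (M - b) % M;           // the "negate" step
  assert((a + neg_b) % M == (a - b) % M); // add-with-carry recovers a - b
  return 0;
}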
template <typename Torus>
__host__ void host_integer_radix_subtraction(

View File

@@ -261,6 +261,8 @@ void cuda_fourier_polynomial_mul(void *stream_v, uint32_t gpu_index,
default:
break;
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(buffer, stream, gpu_index);
}

View File

@@ -279,6 +279,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
PANIC("Cuda error (convert KSK): unsupported polynomial size. Supported "
"N's are powers of two in the interval [256..16384].")
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
@@ -315,6 +316,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
// convert u128 into 4 x double
batch_convert_u128_to_f128_strided_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_bsk, d_standard);
check_cuda_error(cudaGetLastError());
// call negacyclic 128 bit forward fft.
if (full_sm) {
@@ -326,6 +328,7 @@ void convert_u128_to_f128_and_forward_fft_128(cudaStream_t stream,
<<<grid_size, block_size, shared_memory_size, stream>>>(d_bsk, d_bsk,
buffer);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(buffer, stream, gpu_index);
}
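The check_cuda_error(cudaGetLastError()) lines added above close a silent-failure gap: kernel launches are asynchronous and return no status, so a bad launch configuration is only visible by polling the last error afterwards. A minimal illustration using the plain CUDA runtime rather than the project's check_cuda_error macro:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void noop() {}

int main() {
  noop<<<1, 0>>>(); // deliberately invalid: zero threads per block
  cudaError_t err = cudaGetLastError(); // the only place the failure shows up
  if (err != cudaSuccess)
    printf("launch failed: %s\n", cudaGetErrorString(err));
  return 0;
}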

View File

@@ -194,7 +194,8 @@ void execute_pbs_async(
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams[i], gpu_indexes[i], current_lwe_array_out,

View File

@@ -0,0 +1,46 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"
static void
execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array,
uint64_t *size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
*size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
static void execute_pbs_128_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
const std::vector<Torus *> lut_vector,
const LweArrayVariant<uint64_t> &lwe_array_in,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
for (uint32_t i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
}
#endif
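execute_pbs_128_async splits num_samples across GPUs via get_num_inputs_on_gpu. Assuming that helper spreads samples as evenly as possible (the usual convention; its real definition lives elsewhere in the backend), a host-side model showing that every sample lands on exactly one GPU:

#include <cassert>
#include <cstdint>

// Assumed behaviour of get_num_inputs_on_gpu; illustrative only.
static uint32_t inputs_on_gpu(uint32_t num_samples, uint32_t i,
                              uint32_t gpu_count) {
  return num_samples / gpu_count + (i < num_samples % gpu_count ? 1 : 0);
}

int main() {
  uint32_t total = 0;
  for (uint32_t i = 0; i < 3; i++)
    total += inputs_on_gpu(10, i, 3); // 4 + 3 + 3
  assert(total == 10);
  return 0;
}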

View File

@@ -46,7 +46,7 @@ __global__ void device_programmable_bootstrap_cg(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t num_many_lut,
uint32_t lut_stride) {
uint32_t lut_stride, bool uses_noise_reduction) {
grid_group grid = this_grid();
@@ -80,7 +80,9 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
uses_noise_reduction
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
@@ -263,7 +265,9 @@ __host__ void host_programmable_bootstrap_cg(
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
void *kernel_args[16];
bool uses_noise_reduction = buffer->uses_noise_reduction;
void *kernel_args[17];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
@@ -279,6 +283,7 @@ __host__ void host_programmable_bootstrap_cg(
kernel_args[12] = &d_mem;
kernel_args[14] = &num_many_lut;
kernel_args[15] = &lut_stride;
kernel_args[16] = &uses_noise_reduction;
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
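The new uses_noise_reduction flag switches the kernels above between two input layouts: when the noise-reduction pass has already gathered the ciphertexts into a dense temporary buffer, blocks index it trivially by blockIdx.x; otherwise they go through lwe_input_indexes. A host-side model of that addressing choice, on illustrative data:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint32_t lwe_size = 3; // lwe_dimension + 1
  std::vector<uint64_t> in = {10, 11, 12, 20, 21, 22, 30, 31, 32};
  std::vector<uint64_t> indexes = {2, 0, 1};
  bool uses_noise_reduction = false;
  uint32_t block = 0; // stands in for blockIdx.x
  const uint64_t *row = uses_noise_reduction
                            ? &in[block * lwe_size]           // dense input
                            : &in[indexes[block] * lwe_size]; // gathered
  assert(row[0] == 30); // indexes[0] == 2 selects the third ciphertext
  return 0;
}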

View File

@@ -660,22 +660,17 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key != nullptr) {
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
buffer->temp_lwe_array_in,
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
}
if (buffer->uses_noise_reduction) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_input_indexes),
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
@@ -846,4 +841,7 @@ template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
bool allocate_ms_array);
template bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
#endif
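The 64-bit path now branches on buffer->uses_noise_reduction rather than re-testing num_zeros, and the else arm keeps its zero-copy trick: temp_lwe_array_in is simply aliased to the caller's input when no preprocessing runs. A reduced model of that pointer discipline, with illustrative names rather than the backend's types:

#include <cassert>
#include <cstdint>

struct Buffer {
  bool uses_noise_reduction;
  uint64_t *temp_lwe_array_in; // staging area for improved samples
};

const uint64_t *select_input(Buffer &b, const uint64_t *lwe_array_in) {
  if (b.uses_noise_reduction)
    return b.temp_lwe_array_in; // real code fills this via the MS-noise pass
  // No preprocessing: alias the input instead of copying it.
  b.temp_lwe_array_in = const_cast<uint64_t *>(lwe_array_in);
  return b.temp_lwe_array_in;
}

int main() {
  uint64_t staged[4] = {0}, input[4] = {1, 2, 3, 4};
  Buffer b{false, staged};
  assert(select_input(b, input) == input); // aliased, not copied
  return 0;
}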

View File

@@ -27,7 +27,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
double2 *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
uint64_t device_memory_size_per_block, bool uses_noise_reduction) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -55,7 +55,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
uses_noise_reduction
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
@@ -397,7 +399,8 @@ __host__ void execute_step_one(
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
bool uses_noise_reduction) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
@@ -410,20 +413,21 @@ __host__ void execute_step_one(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
full_dm);
full_dm, uses_noise_reduction);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm);
partial_dm, uses_noise_reduction);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0);
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
uses_noise_reduction);
}
check_cuda_error(cudaGetLastError());
}
@@ -504,6 +508,7 @@ __host__ void host_programmable_bootstrap(
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
bool uses_noise_reduction = pbs_buffer->uses_noise_reduction;
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
@@ -512,14 +517,16 @@ __host__ void host_programmable_bootstrap(
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
uses_noise_reduction);
} else {
execute_step_one<Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
uses_noise_reduction);
}
if (i == lwe_dimension - 1) {
execute_step_two<Torus, params, true>(

View File

@@ -8,124 +8,67 @@ bool has_support_to_cuda_programmable_bootstrap_128_cg(
max_shared_memory);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
return scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
stream, gpu_index,
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> **)pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;
if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 512:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 1024:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 2048:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 4096:
return scratch_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 512:
return scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 1024:
return scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 2048:
return scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
case 4096:
return scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
return scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lut_vector, Torus *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<AmortizedDegree<256>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_128<AmortizedDegree<512>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_128<AmortizedDegree<1024>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_128<AmortizedDegree<2048>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
host_programmable_bootstrap_128<AmortizedDegree<4096>>(
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
@@ -137,41 +80,42 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
}
}
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lut_vector, Torus *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
host_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
@@ -183,6 +127,57 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
}
}
template <typename InputTorus>
void host_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
__uint128_t const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void const *ms_noise_reduction_ptr,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<InputTorus>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<InputTorus const *>(lwe_array_in),
static_cast<uint64_t const *>(buffer->trivial_indexes),
static_cast<const InputTorus *>(ms_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in =
const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
}
switch (buffer->pbs_variant) {
case DEFAULT:
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
case CG:
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
InputTorus>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts, storing the
* result in the same index for each ciphertext.
*
@@ -237,56 +232,22 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void *streams, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
(pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *)mem_ptr;
pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;
// If the parameters contain noise reduction key, then apply it
if (ms_noise_reduction_key->num_zeros != 0) {
uint32_t log_modulus = log2(polynomial_size) + 1;
host_improve_noise_modulus_switch<__uint128_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<__uint128_t const *>(lwe_array_in),
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
ms_noise_reduction_key->ms_input_variance,
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
log_modulus);
} else {
buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
static_cast<const __uint128_t *>(lwe_array_in));
}
switch (buffer->pbs_variant) {
case DEFAULT:
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lut_vector),
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
case CG:
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
__uint128_t>(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const __uint128_t *>(lut_vector),
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
}
host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
streams, gpu_index, lwe_array_out,
static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
num_samples);
}
/*
@@ -295,6 +256,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
*/
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
int8_t **buffer) {
auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
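The 128-bit PBS entry points above follow the usual scratch/use/cleanup lifecycle. A hedged usage sketch using only the signatures visible in this diff; it assumes the backend headers' C linkage, an available CUDA device, and illustrative (not recommended) parameter values:

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

extern "C" {
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array);
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
                                             int8_t **buffer);
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int8_t *buffer = nullptr;
  // The scratch call reports how many bytes it placed on the GPU.
  uint64_t bytes = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, /*gpu_index=*/0, &buffer, /*lwe_dimension=*/742,
      /*glwe_dimension=*/1, /*polynomial_size=*/2048, /*level_count=*/1,
      /*input_lwe_ciphertext_count=*/4, /*allocate_gpu_memory=*/true,
      /*allocate_ms_array=*/true);
  printf("PBS128 buffer: %llu bytes on GPU\n", (unsigned long long)bytes);
  // ... cuda_programmable_bootstrap_lwe_ciphertext_vector_128(...) ...
  cleanup_cuda_programmable_bootstrap_128(stream, /*gpu_index=*/0, &buffer);
  cudaStreamDestroy(stream);
  return 0;
}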

View File

@@ -74,16 +74,17 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_128(
__syncthreads();
}
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, class params, sharedMemDegree SMD,
bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one_128(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
double *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -100,22 +101,22 @@ __global__ void __launch_bounds__(params::degree / params::opt)
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
__uint128_t *accumulator = (__uint128_t *)selected_memory;
double *accumulator_fft =
(double *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
(ptrdiff_t)(sizeof(__uint128_t) * polynomial_size / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
const Torus *block_lut_vector = lut_vector;
const __uint128_t *block_lut_vector = lut_vector;
Torus *global_slice =
__uint128_t *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
@@ -127,12 +128,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if constexpr (first_iter) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
InputTorus b_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
@@ -146,20 +147,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
InputTorus a_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree +
1); // 2 * params::log2_degree + 1);
__syncthreads();
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
__uint128_t, params::opt, params::degree / params::opt>(
global_slice, accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
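modulus_switch above maps a full-width torus coefficient into [0, 2N) using log2_degree + 1 bits. One common way to realize that is to keep the top log_modulus bits with rounding; the backend's device function may differ in details, so the following only illustrates the arithmetic:

#include <cassert>
#include <cstdint>

// Hedged model: round a 64-bit torus value to its top `log_modulus` bits,
// reducing the result mod 2^log_modulus (= 2N when log_modulus = log2(N)+1).
static uint64_t modulus_switch_model(uint64_t input, uint32_t log_modulus) {
  uint32_t shift = 64 - log_modulus;
  uint64_t mask = (uint64_t{1} << log_modulus) - 1;
  return (((input >> (shift - 1)) + 1) >> 1) & mask; // round to nearest
}

int main() {
  const uint32_t log2N = 10;              // N = 1024
  const uint32_t log_modulus = log2N + 1; // target modulus 2N = 2048
  // Half way around the torus lands on N.
  assert(modulus_switch_model(uint64_t{1} << 63, log_modulus) == 1024);
  assert(modulus_switch_model(0, log_modulus) == 0);
  return 0;
}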
@@ -168,7 +170,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
// We are using the same memory space for accumulator_fft and
@@ -314,10 +317,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, class params, sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg_128(
Torus *lwe_array_out, const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lwe_array_in,
__uint128_t *lwe_array_out, const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lwe_array_in,
const double *__restrict__ bootstrapping_key, double *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
@@ -342,23 +345,22 @@ __global__ void device_programmable_bootstrap_cg_128(
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)(polynomial_size);
__uint128_t *accumulator = (__uint128_t *)selected_memory;
__uint128_t *accumulator_rotated =
(__uint128_t *)accumulator + (ptrdiff_t)(polynomial_size);
double *accumulator_fft =
(double *)(accumulator_rotated) +
(ptrdiff_t)(polynomial_size * sizeof(Torus) / sizeof(double));
(ptrdiff_t)(polynomial_size * sizeof(__uint128_t) / sizeof(double));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
const Torus *block_lut_vector =
&lut_vector[blockIdx.x * params::degree * (glwe_dimension + 1)];
const __uint128_t *block_lut_vector = lut_vector;
double *block_join_buffer =
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
@@ -368,11 +370,11 @@ __global__ void device_programmable_bootstrap_cg_128(
// rotated array is not in use anymore by the time we perform the fft
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
InputTorus b_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
@@ -381,17 +383,18 @@ __global__ void device_programmable_bootstrap_cg_128(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
InputTorus a_hat = 0;
modulus_switch<InputTorus>(block_lwe_array_in[i], a_hat,
params::log2_degree + 1);
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
__uint128_t, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
@@ -400,8 +403,8 @@ __global__ void device_programmable_bootstrap_cg_128(
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
@@ -420,8 +423,9 @@ __global__ void device_programmable_bootstrap_cg_128(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
__syncthreads();
add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo,
accumulator);
}
auto block_lwe_array_out =
@@ -433,17 +437,20 @@ __global__ void device_programmable_bootstrap_cg_128(
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<__uint128_t, params>(block_lwe_array_out,
accumulator);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
0);
}
}
}
template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -457,33 +464,34 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}
template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -504,37 +512,37 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
@@ -580,17 +588,122 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
}
uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}
template <class params, bool first_iter>
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128 bits inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
template <typename InputTorus>
uint64_t scratch_cuda_programmable_bootstrap_128_vector(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **)pbs_buffer;
if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
}
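As its comment says, this scratch entry point sizes and allocates every PBS-128 temporary and reports the GPU footprint through size_tracker. A hedged caller sketch; the parameter values are illustrative and the cleanup call is a placeholder for the backend's matching release function, which this diff does not show:

pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer = nullptr;
uint64_t gpu_bytes = scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
    stream, gpu_index, &buffer,
    /*lwe_dimension=*/879, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    /*level_count=*/1, /*input_lwe_ciphertext_count=*/4,
    /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);
// ... run bootstraps that reuse `buffer` ...
release_pbs_buffer_128(stream, gpu_index, buffer); // hypothetical cleanup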
template <typename InputTorus, class params, bool first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
- __uint128_t *lwe_array_in, double const *bootstrapping_key,
+ InputTorus *lwe_array_in, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -603,21 +716,21 @@ __host__ void execute_step_one_128(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, partial_dm);
} else {
- device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
+ device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
@@ -670,11 +783,12 @@ __host__ void execute_step_two_128(
/*
* Host wrapper to the programmable bootstrap 128
*/
- template <class params>
+ template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
- __uint128_t const *lut_vector, __uint128_t *lwe_array_in,
- double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
+ __uint128_t const *lut_vector, InputTorus *lwe_array_in,
+ double const *bootstrapping_key,
+ pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -704,14 +818,14 @@ __host__ void host_programmable_bootstrap_128(
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
- execute_step_one_128<params, true>(
+ execute_step_one_128<InputTorus, params, true>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one,
full_dm_step_one);
} else {
- execute_step_one_128<params, false>(
+ execute_step_one_128<InputTorus, params, false>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
@@ -736,11 +850,12 @@ __host__ void host_programmable_bootstrap_128(
}
}
- template <class params>
+ template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
- __uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
- double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
+ __uint128_t const *lut_vector, InputTorus const *lwe_array_in,
+ double const *bootstrapping_key,
+ pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -783,20 +898,20 @@ __host__ void host_programmable_bootstrap_cg_128(
if (max_shared_memory < partial_sm) {
kernel_args[10] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
- (void *)device_programmable_bootstrap_cg_128<__uint128_t, params, NOSM>,
+ (void *)device_programmable_bootstrap_cg_128<InputTorus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[10] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
- (void *)device_programmable_bootstrap_cg_128<__uint128_t, params,
-                                              PARTIALSM>,
+ (void *)
+     device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[10] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)
- device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
+ device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}
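Both host wrappers above use the backend's three-tier shared-memory dispatch: compare the device's dynamic shared-memory budget against each variant's needs and launch NOSM, PARTIALSM or FULLSM accordingly. A self-contained sketch of the idiom with a toy kernel (all names and sizes are illustrative, not the repo's code; assumes full_sm and partial_sm are at least blockDim.x * sizeof(double)):

#include <cuda_runtime.h>

enum SharedMemoryMode { SM_NOSM, SM_PARTIALSM, SM_FULLSM };

// The mode decides whether the working set lives in dynamic shared memory or
// in a preallocated global scratch buffer (the role d_mem plays above).
template <SharedMemoryMode mode>
__global__ void demo_kernel(double *out, double *global_scratch) {
  extern __shared__ double smem[];
  double *work = (mode == SM_NOSM)
                     ? global_scratch + blockIdx.x * blockDim.x
                     : smem;
  work[threadIdx.x] = static_cast<double>(threadIdx.x);
  __syncthreads();
  out[blockIdx.x * blockDim.x + threadIdx.x] = work[threadIdx.x];
}

void launch_best_variant(double *out, double *d_mem, int full_sm,
                         int partial_sm, int gpu_index) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  if (max_sm < partial_sm) {
    demo_kernel<SM_NOSM><<<1, 256, 0>>>(out, d_mem);
  } else if (max_sm < full_sm) {
    demo_kernel<SM_PARTIALSM><<<1, 256, partial_sm>>>(out, d_mem);
  } else {
    // Opt in to the larger dynamic allocation first, as the scratch code does.
    cudaFuncSetAttribute(demo_kernel<SM_FULLSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
    demo_kernel<SM_FULLSM><<<1, 256, full_sm>>>(out, d_mem);
  }
}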


@@ -398,20 +398,32 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
+ bool supports_cg =
+     supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
+         glwe_dimension, polynomial_size, level_count,
+         input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
- if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
+ // On H100s we should be using TBC until num_samples < num_sms / 2.
+ // After that we switch to CG until not supported anymore.
+ // At this point we return to TBC.
+ int num_sms = 0;
+ check_cuda_error(cudaDeviceGetAttribute(
+     &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+ bool supports_tbc =
+     has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
- level_count, cuda_get_max_shared_memory(gpu_index)))
+ level_count, cuda_get_max_shared_memory(gpu_index));
+ if (supports_tbc &&
+     !(input_lwe_ciphertext_count > num_sms / 2 && supports_cg))
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
- if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
-     uint64_t>(glwe_dimension, polynomial_size, level_count,
-               input_lwe_ciphertext_count,
-               cuda_get_max_shared_memory(gpu_index)))
+ if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,

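The new dispatch boils down to a small predicate: prefer TBC for small batches, hand over to CG once the batch exceeds half the SM count, and otherwise fall back. A standalone restatement of that selection logic (the enum and function name are illustrative; supports_tbc and supports_cg stand in for the capability checks above):

#include <cstdint>

enum class MultiBitVariant { Tbc, Cg, Default };

MultiBitVariant choose_multi_bit_variant(bool supports_tbc, bool supports_cg,
                                         uint32_t num_samples, int num_sms) {
  // TBC wins while the batch is small relative to the SM count,
  // or whenever CG is not an option.
  if (supports_tbc &&
      !(num_samples > static_cast<uint32_t>(num_sms / 2) && supports_cg))
    return MultiBitVariant::Tbc;
  if (supports_cg)
    return MultiBitVariant::Cg;
  return MultiBitVariant::Default;
}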
Some files were not shown because too many files have changed in this diff.