Compare commits

...

172 Commits

Author SHA1 Message Date
Agnes Leroy
47e847dab4 Attempt 2024-10-03 18:26:19 +02:00
Pedro Alves
c6ff00c500 fix(gpu): fix the indexes used in compression
- also general minor fixes to compression
2024-10-03 16:50:37 +02:00
Agnes Leroy
54a08afb46 chore(doc): add compression tutorial on GPU 2024-10-03 13:53:52 +02:00
Arthur Meyre
6c8591dc21 chore(doc): add a bit more substance to the array documentation 2024-10-03 13:53:52 +02:00
Arthur Meyre
876cde1f6a chore(doc): add make command to print parameters used in doc benchmarks 2024-10-03 13:53:52 +02:00
Arthur Meyre
ee938797c3 chore(docs): improve getting started page following feedback
- add more details on setting up a Rust project from scratch and adding
TFHE-rs as a dependency
2024-10-03 13:53:52 +02:00
Agnes Leroy
2311087a64 chore(hl): fix clippy error in test 2024-10-03 13:49:02 +02:00
Beka Barbakadze
7dfabdd4b5 feat(cuda): modify double to torus 2024-10-03 13:48:54 +02:00
David Testé
212af17538 style(global): fix typos 2024-10-03 11:47:00 +02:00
David Testé
c7f4de9a21 chore(ci): add makefile target for typos checker
This target performs a typos check, with some exceptions, to ensure
correct spelling throughout the codebase
2024-10-03 11:47:00 +02:00
David Testé
2b25b20aeb chore(ci): fix wasm benchmark results parsing for object sizes 2024-10-03 09:17:55 +02:00
Agnes Leroy
4a930264f5 chore(gpu): reset all test thread values 2024-10-02 15:32:44 +02:00
Guillermo Oyarzun
2498087610 fix(gpu): remove extra single carry propagation from partial sum 2024-10-02 15:26:02 +02:00
tmontaigu
375481c66e fix(hlapi): pub use HlCompressible,HlExpandable
Publicly re-export the `HlCompressible` and `HlExpandable`
traits, as users may need them to write generic code
that manipulates CompressedCiphertextList/Builder
2024-10-02 10:47:45 +02:00
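For illustration, the kind of generic helper this unblocks might look like the following sketch; the trait and builder names come from the commit message, while the exact method signatures are assumptions:

```rust
use tfhe::{CompressedCiphertextList, CompressedCiphertextListBuilder, HlCompressible};

// Generic over anything the HL API considers compressible; without the
// re-export, this bound could not be written outside the crate.
fn compress_all<T: HlCompressible>(items: Vec<T>) -> CompressedCiphertextList {
    let mut builder = CompressedCiphertextListBuilder::new();
    for item in items {
        builder.push(item);
    }
    builder.build().expect("building the compressed list failed")
}
```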
Agnes Leroy
cb9dac6eed chore(gpu): add ks/pbs benchmarks in the documentation 2024-10-02 09:37:14 +02:00
Nicolas Sarlin
04c6f18d42 feat(versionable): impl Versionize for Vec<Vec<T>> 2024-10-01 13:32:41 +02:00
dependabot[bot]
75d2457a6f chore(deps): bump actions/checkout from 4.1.7 to 4.2.0
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.7 to 4.2.0.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](692973e3d9...d632683dd7)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-01 11:48:35 +02:00
Arthur Meyre
dedb3e94e5 feat(integer): evaluate unpacking luts during casting to improve perf
- allows avoiding some LUT evaluations during expansion of the various
CompactCiphertextList
2024-09-30 21:03:09 +02:00
Arthur Meyre
766809afe4 chore(doc): fix slightly broken docstring which ended as a comment 2024-09-30 21:03:09 +02:00
Arthur Meyre
22728b9156 chore(ci): allow the redundant closure lint which brings no value 2024-09-30 21:03:09 +02:00
tmontaigu
59380fcacb chore(js): add test for compact pk conformance 2024-09-30 20:15:25 +02:00
Nicolas Sarlin
b50029fcff feat(js): add safe_serialize_conformant for public keys 2024-09-30 20:15:25 +02:00
tmontaigu
7f9ba6ed28 feat(js): add constructors for PublicKeyParams 2024-09-30 20:15:25 +02:00
Mayeul@Zama
9f6e7cd3fc feat(all): add ProvenCompactCiphertextList conformance 2024-09-30 20:15:25 +02:00
Mayeul@Zama
b14db1e3fd feat(all): add CompactPublicKey conformance 2024-09-30 20:15:25 +02:00
tmontaigu
3b4cb6b1fc feat(hlapi): Add initial structure of NdArray types 2024-09-30 17:37:56 +02:00
Guillermo Oyarzun
81c16e7915 chore(gpu): add module loading info 2024-09-30 17:05:50 +02:00
Agnes Leroy
0fc24127a2 chore(gpu): refactor lwe_chunk_size 2024-09-30 17:04:47 +02:00
Nicolas Sarlin
e9d3e21b93 chore(all)!: use a builder pattern for safe serialization API
BREAKING CHANGES:
- `safe_serialize` and `safe_deserialize` are replaced by
  `SerializationConfig::serialize_into` and
  `DeserializationConfig::deserialize_from`.
- C API: the `XXX_safe_serialize_versioned` functions are deprecated; `XXX_safe_serialize`
  is now versioned by default
- JS API: the `safe_serialize` method now versionizes the data before
  serialization.

This is *NOT* a serialization breaking change for data serialized in previous
versions with `safe_serialize_versioned`.
2024-09-30 15:58:25 +02:00
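In practice the new API reads like this minimal round-trip sketch of a `ClientKey`; the size limit is arbitrary and the exact signatures follow the commit message rather than a checked build:

```rust
use tfhe::safe_serialization::{DeserializationConfig, SerializationConfig};
use tfhe::{ClientKey, ConfigBuilder};

fn main() {
    let client_key = ClientKey::generate(ConfigBuilder::default().build());

    // Refuse to (de)serialize objects larger than this many bytes.
    const SIZE_LIMIT: u64 = 1 << 30;

    let mut buffer = Vec::new();
    SerializationConfig::new(SIZE_LIMIT)
        .serialize_into(&client_key, &mut buffer)
        .unwrap();

    // Deserialization is versioned and size-checked; conformance checking
    // is explicitly opted out of here since no parameter set is supplied.
    let _key: ClientKey = DeserializationConfig::new(SIZE_LIMIT)
        .disable_conformance()
        .deserialize_from(buffer.as_slice())
        .unwrap();
}
```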
Nicolas Sarlin
53c4850d11 feat(zk): impl Named for zk pke proof and Params 2024-09-30 15:58:25 +02:00
Agnes Leroy
03154d5db8 fix(gpu): fix end index in gpu compression 2024-09-30 15:56:51 +02:00
yuxizama
576bc5782e chore(docs): benchmark regrouping and visualization 2024-09-30 15:38:51 +02:00
Nicolas Sarlin
8256e76f74 refactor(zk): remove dependency to ark_serialize 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
835cc6d9b0 refactor(zk): handle compression without canonical serialize 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
c9be958d1a chore(backward): adds a test for proven list versioning 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
5183c1fb3e fix(backward): fix backward data clone script with multiple branches 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
0d49d19a13 refactor(hl)!: use a trait for common ciphertext lists methods
BREAKING CHANGE:
- The `CiphertextList` trait needs to be in scope to use the common methods of
the `CompressedCiphertextList` and `CompactCiphertextListExpander`
- The `.get` of the `CompactCiphertextListExpander` now returns a
`Result<Option>` instead of an `Option<Result>`
2024-09-30 13:18:18 +02:00
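A sketch of the resulting call site (the import path for the trait is an assumption; the `Result<Option<_>>` shape is from the notes above):

```rust
// Bringing the trait into scope is now required to call the shared methods.
use tfhe::prelude::*;
use tfhe::{CompactCiphertextList, FheUint32};

fn first_item(
    list: &CompactCiphertextList,
) -> Result<Option<FheUint32>, tfhe::Error> {
    let expander = list.expand()?;
    // `get` now reports a bad entry through the outer Result and a
    // missing index through the inner Option.
    expander.get::<FheUint32>(0)
}
```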
Nicolas Sarlin
e91d532a36 chore(zk): enable versionable lint for zk 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
1c2a0e82f9 feat(zk): Versionize ProvenCompactCiphertextList and PkePublicParams 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
e76503984a refactor(zk): convert ark types to custom types before serialization 2024-09-30 13:18:18 +02:00
Nicolas Sarlin
5cfc57f51a refactor(zk): explicitly state endianness in to_bytes functions 2024-09-30 13:18:18 +02:00
Agnes Leroy
840498977c chore(gpu): fix l40 hardware name in bench workflow 2024-09-30 13:05:46 +02:00
David Testé
77a34a952e chore: bump version for tfhe, tfhe-cuda-backend, tfhe-zk-pok
tfhe bumped to v0.8.0
tfhe-cuda-backend bumped to v0.4.0
tfhe-zk-pok bumped to v0.3.0
2024-09-30 13:00:46 +02:00
Agnes Leroy
d9e9a5bb3f chore(gpu): add gpu compression in the hl api 2024-09-30 09:33:12 +02:00
Pedro Alves
03431e41a9 chore(gpu): change index array type in decompression 2024-09-27 15:36:50 -03:00
Nicolas Sarlin
5d522ffeaa fix(zk): generate m mod t in padding test 2024-09-27 16:57:19 +02:00
Arthur Meyre
3956f96318 feat(tfhe): plug padding bit API from ZKs 2024-09-27 16:57:19 +02:00
Arthur Meyre
7192ecb695 feat(zk): add possibility to specify a number of MSB padding bits set to 0
- pke v1 and v2
2024-09-27 16:57:19 +02:00
Mayeul@Zama
40b097d819 feat(all): add server key conformance 2024-09-27 16:55:23 +02:00
tmontaigu
45effa41d5 refactor!: gate wops behind "experimental" feature
This puts the WOPBS features of the shortint and integer
modules behind the "experimental" feature.

Due to the versioning feature, the struct definitions
are not gated behind the "experimental" feature; however,
they are only pub(crate) in that case.
2024-09-27 15:00:18 +02:00
Agnes Leroy
d2efa82daf chore(gpu): add leading zeros/ones benchmarks 2024-09-27 13:38:08 +02:00
tmontaigu
bd66a6fd2b feat(integer): improve scalar lt/le/gt/ge/min/max 2024-09-27 12:27:50 +02:00
tmontaigu
16feb46afc refactor(integer): use same logic for signed cmps
This makes the logic for signed cmps more similar whether the parameters
are 1_1 or higher.

This will make it possible to reuse this part of the code for
scalar comparisons
2024-09-27 12:27:50 +02:00
Arthur Meyre
81d82bc45c chore(bench): bench 64 bits for ZKs 2024-09-26 20:16:22 +02:00
David Testé
7afe9b71d2 chore(shortint): update multi-bit gpu parameters set
Update with the latest improvements from the optimizer.
2024-09-26 18:15:33 +02:00
David Testé
41fae73e63 chore: bump tfhe to 0.8.0-alpha.10 2024-09-26 15:40:31 +02:00
David Testé
de7c7f209f chore(ci): include snippets folder into tfhe npm package 2024-09-26 15:40:31 +02:00
Mayeul@Zama
84de0a7b23 feat(hlapi): add generate_oblivious_pseudo_random on FheBool 2024-09-26 14:27:18 +02:00
Pedro Alves
4bb115e1e7 chore(gpu): improve and fix compression tests
- the logic was wrong when the integer is split across multiple GLWEs
- now the tests pseudo-randomly mix unsigned, signed, and boolean values
2024-09-26 07:50:17 -03:00
Agnes Leroy
b365585c74 chore(gpu): add 2xH100 bench workflow 2024-09-26 12:42:05 +02:00
David Testé
ea3ec8cbdd chore(ci): write gpu parameters to file
This is done so that the lattice estimator can check the security of
these parameters.
2024-09-26 09:02:06 +02:00
Bourgerie Quentin
8c51e22aa5 fix(gpu): fix cuda memcpy in plaintext add 2024-09-25 13:38:06 +02:00
tmontaigu
283a3c911b feat(shortint): add try_from_lwe_encryption_key 2024-09-25 10:44:25 +02:00
Agnes Leroy
2bf483c596 chore(gpu): add bench workflow on L40 2024-09-25 09:13:22 +02:00
Beka Barbakadze
2e0736afc6 feat(cuda): implements fft with reduced shared memory read/write. 2024-09-25 09:13:09 +02:00
David Testé
400ce27beb chore(tfhe): update boolean and shortint parameters 2024-09-25 09:12:28 +02:00
Arthur Meyre
43d91f512f chore(ci): use python as webdriver for wasm test and benchmarks
Switch from Jest and Puppeteer to Python with Selenium. It relies
on browser and webdriver binaries from browser vendors.
For now the Python script only supports the Chrome browser.
2024-09-25 09:11:13 +02:00
Nicolas Sarlin
5db5aba24a chore: bump tfhe to 0.8.0-alpha.9 2024-09-24 17:42:25 +02:00
Nicolas Sarlin
361c9618a0 chore(versionable): run clippy on tfhe-versionable 2024-09-24 15:20:05 +02:00
Nicolas Sarlin
35dac0d85c fix(versionable): use examples as tests 2024-09-24 15:20:05 +02:00
Agnes Leroy
1c0b6fbbd4 fix(gpu): remove all resettings of shared memory size 2024-09-24 08:56:06 +02:00
Agnes Leroy
8c6e916076 chore(gpu): rework async logic for ilog2 2024-09-24 08:53:34 +02:00
Agnes Leroy
49ab72bcec chore(gpu): print info about inputs in determinism test asserts 2024-09-24 08:53:22 +02:00
Agnes Leroy
937b72c538 fix(gpu): add back cuda device synchronize in Drop
It was too unsafe to remove device_synchronize. A good
move would probably be to remove all asynchronous entry
points on the Rust side if we want to be safe;
otherwise we need to keep this.
2024-09-24 08:53:14 +02:00
tmontaigu
0259886375 feat(integer): add count_ones/zeros
The non-naive version, made for 2_2 parameters,
only brings a slight improvement (10-15%) for some small sizes (64, 128, 256 bits)
but reduces the number of PBS. It brings the best
improvements for very large numbers (e.g. 6400 blocks: 1.8 s for naive,
1.1 s for non-naive)
2024-09-23 22:21:34 +02:00
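As a usage sketch (the exact method name on the integer server key is an assumption; only the operation itself comes from the commit):

```rust
use tfhe::integer::gen_keys_radix;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

fn main() {
    // 32 blocks of 2 message bits = 64-bit integers.
    let (client_key, server_key) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 32);

    let ct = client_key.encrypt(0b1011_0110u64);
    // Hypothetical entry point for the feature described above.
    let ones = server_key.count_ones_parallelized(&ct);

    let dec: u64 = client_key.decrypt(&ones);
    assert_eq!(dec, 5);
}
```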
Arthur Meyre
97822db5fc test(core): add noise formulas and variance tests for KS and PBS 2024-09-23 16:48:22 +02:00
Agnes Leroy
934b5f40a1 chore(gpu): add some scalar ops to dedup benchmarks 2024-09-23 14:53:13 +02:00
Nicolas Sarlin
3ff81c3c4b test(versionable): test bounds visibility in the generated code 2024-09-23 13:28:54 +02:00
Nicolas Sarlin
bce5cd3552 chore(versionable): prepare release 0.3.0 2024-09-23 13:28:54 +02:00
Nicolas Sarlin
ec83165acc chore(versionable): run tfhe-versionable tests in ci 2024-09-23 13:28:54 +02:00
Nicolas Sarlin
d63c2f7705 chore(versionable): update examples
Mostly, test in each example's main that the derived code actually works
2024-09-23 13:28:54 +02:00
Nicolas Sarlin
5bcc34728a doc(versionable): adds in the README that this crate uses serde 2024-09-23 13:28:54 +02:00
Nicolas Sarlin
b62228b429 feat(versionable): Versionize Vec of tuples 2024-09-23 13:28:54 +02:00
Nicolas Sarlin
b63347336b fix(versionable)!: wrong derived bounds in the Versionize macro
Over-restrictive derived bounds were in some cases unsatisfiable, making the
`versionize` method uncallable.

BREAKING CHANGE:
- The `#[versionize(bound = ...)]` attribute is not needed anymore, so it has
been removed.
2024-09-23 13:28:54 +02:00
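For context, the derive this fix targets follows the crate's usual pattern, roughly as in this sketch based on the tfhe-versionable examples (the type and fields are illustrative):

```rust
use tfhe_versionable::{Versionize, VersionsDispatch};

// The dispatch enum lists one variant per historical version of the type.
#[derive(VersionsDispatch)]
#[allow(unused)]
enum MyStructVersions<T> {
    V0(MyStruct<T>),
}

#[derive(Versionize)]
#[versionize(MyStructVersions)]
struct MyStruct<T> {
    attr: T,
    builtin: u32,
}

fn main() {
    let ms = MyStruct { attr: 37u64, builtin: 1234 };
    // After the fix, this is callable without the previously required
    // `#[versionize(bound = ...)]` attribute.
    let _versioned = ms.versionize();
}
```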
Nicolas Sarlin
a631904bd1 feat(zk): add metadata to v2 2024-09-23 13:27:24 +02:00
Agnes Leroy
da850865ec chore(gpu): add file to run full tests on H100 from workflow only 2024-09-23 13:02:17 +02:00
dependabot[bot]
8be769e282 chore(deps): bump tj-actions/changed-files from 45.0.1 to 45.0.2
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.1 to 45.0.2.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](e9772d1404...48d8f15b2a)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-23 11:00:51 +02:00
David Testé
47ea8bf45c chore(deps): update slab-github-runner requirement to last version 2024-09-23 09:46:52 +02:00
Agnes Leroy
4823b8a1a0 chore(gpu): initialize some arrays to 0 2024-09-20 22:51:30 +02:00
Agnes Leroy
01f3a6d133 chore(gpu): disable slack notification for fast h100 test success 2024-09-20 17:39:52 +02:00
Nicolas Sarlin
bf613f36b3 feat(hl): impl Named for key types 2024-09-20 17:28:43 +02:00
Pedro Alves
faf200218b chore(gpu): add checks to ensure limits for compression 2024-09-19 15:57:16 -03:00
Agnes Leroy
24088fd494 chore(gpu): add scalar div and signed scalar div to hl api
Also add overflowing sub to hl
2024-09-19 19:11:45 +02:00
Agnes Leroy
48315dca80 feat(gpu): signed scalar div 2024-09-19 19:11:45 +02:00
Agnes Leroy
52b148a728 chore(gpu): temporarily set test threads to 1 for the GPU 2024-09-19 19:11:28 +02:00
Agnes Leroy
d0624d6184 chore(gpu): fix multi-gpu div performance 2024-09-19 16:56:47 +02:00
Agnes Leroy
00fc2818a9 chore(gpu): remove useless syncs 2024-09-19 16:56:47 +02:00
Titouan Tanguy
b93c23e5f8 feat(integer): add raw parts API to integer CompressionPrivateKeys 2024-09-19 14:40:42 +02:00
Nicolas Sarlin
1c59c1c260 fix(gpu): use build profile for cuda release 2024-09-19 14:40:15 +02:00
David Testé
ca7b29163e chore(ci): add token to checkout private repo tfhe-rs-internal 2024-09-19 14:00:34 +02:00
Agnes Leroy
f7a18ddb23 chore(gpu): remove unchecked benchmarks and add ilog2 to dedup ops 2024-09-19 13:16:17 +02:00
Arthur Meyre
7b9085d0e2 feat(integer): add raw parts API to integer (De)CompressionKey 2024-09-19 11:57:50 +02:00
Arthur Meyre
d52fa249a5 feat(shortint): derive PartialEq on Compression and Decompression keys 2024-09-19 11:57:50 +02:00
Arthur Meyre
35e7031751 feat: add raw parts API for CompressedCiphertextList in HL API 2024-09-19 11:57:50 +02:00
Arthur Meyre
d9662daea5 doc(shortint): add some information about expand and the casting_mode used 2024-09-19 10:29:05 +02:00
Arthur Meyre
32cdb0b5a0 fix: expand_with_key was not providing the safest set of modes
- it meant that lists needing unpacking could crash during expand
2024-09-19 10:29:05 +02:00
Agnes Leroy
a6aa95ce2d fix(gpu): fix comparisons 2024-09-18 21:18:53 +02:00
Arthur Meyre
97d7ed9ec2 chore(ci): only notify for most things on failure 2024-09-18 17:41:24 +02:00
Nicolas Sarlin
07045f1137 chore: update tfhe to 0.8.0-alpha.8 / cuda-backend to 0.4.0-alpha.1 2024-09-18 15:50:00 +02:00
David Testé
3ab7f49436 chore(ci): remove support for slab calls with issue comments
Now all workflows use the Slab GitHub Action and can thus be launched
directly with a workflow_dispatch event.
2024-09-18 13:42:17 +02:00
Pedro Alves
040e28d822 chore(gpu): downgrade compression conversion tests to become doc tests 2024-09-18 08:35:06 -03:00
Pedro Alves
a113674c82 feat(gpu): implement conversion from CompressedCiphertextList to CudaCompressedCiphertextList 2024-09-18 08:35:06 -03:00
Pedro Alves
1d06691dda feat(gpu): implement conversion from CudaCompressedCiphertextList to CompressedCiphertextList 2024-09-18 08:35:06 -03:00
Guillermo Oyarzun
fc21804f3e feat(gpu): generate and apply many luts 2024-09-18 11:58:22 +02:00
Arthur Meyre
c0878f1600 chore: bump version to 0.8.0-alpha.7 2024-09-17 13:59:32 +02:00
Arthur Meyre
97f1277e06 feat: allow to verify a proof without expanding it 2024-09-17 13:59:32 +02:00
aquint-zama
e1dd4ba4bf chore: ensure actions are pinned by commit hash 2024-09-16 18:08:26 +02:00
David Testé
d96a368b37 chore(bench): fix display name for unchecked bitwise operations 2024-09-16 15:14:54 +02:00
Agnes Leroy
47c8d4cf64 chore(gpu): set test threads to 1 when BIG_INSTANCE is false to get a better view of failures in the ci 2024-09-16 13:19:48 +02:00
Agnes Leroy
9633b61298 fix(gpu): add missing synchronize in scalar add, refactor scalar add on cuda side 2024-09-16 09:05:16 +02:00
Agnes Leroy
8299e1cb9a chore(gpu): change multi-gpu tests to run on rtx so it's cheaper 2024-09-16 09:04:56 +02:00
tmontaigu
72ad76b5e7 fix(integer): do sum by safe chunk sizes
Parameters are made with assumptions on the number of leveled
add/sub/scalar_mul operations performed, so that the
noise level before doing a PBS stays correct and everything is
safe, secure and correct.

So the lib implementation has to uphold these assumptions in order to
keep the failure probability correct.

In the comparisons, at some point we had a vector of ciphertexts with a
degree == 1, so we greedily summed them (e.g. with 2_2 params we summed
them by chunks of 15); while this is correct with regards to the carry and
message space, it is less correct with regards to the noise
level.

Noise-wise, doing this huge sum is correct as long as the noise of each ciphertext
is independent from the others in the same chunk.

While that may generally be the case, it's not guaranteed, and
since we do not track that information we have to take the safer
approach of assuming the worst case: all noises are dependent.

So to fix the issue we compute the correct sum chunk size by also
taking into account the max noise level.
2024-09-13 15:55:17 +02:00
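In clear terms, the fix bounds the chunk size by the noise budget as well as the carry space; a self-contained sketch of that bound (the parameter values and names are illustrative, not the crate's actual types):

```rust
/// How many ciphertexts of a given degree can be summed into one chunk,
/// taking both the plaintext space and worst-case noise growth into account.
fn safe_chunk_size(
    total_plaintext_space: u64, // message * carry space, e.g. 16 for 2_2
    degree: u64,                // max value currently held by each ciphertext
    max_noise_level: u64,       // noise budget allowed before a PBS
    noise_level: u64,           // noise level of each input ciphertext
) -> u64 {
    // Carry/message bound: the summed values must fit the plaintext space.
    let carry_bound = (total_plaintext_space - 1) / degree;
    // Noise bound: assume the worst case where all input noises are
    // dependent, so levels add up linearly.
    let noise_bound = max_noise_level / noise_level;
    carry_bound.min(noise_bound)
}

fn main() {
    // With 2_2 params, fresh degree-1 blocks allow chunks of 15 by the carry
    // bound alone; an (illustrative) noise budget of 5 shrinks that to 5.
    assert_eq!(safe_chunk_size(16, 1, 5, 1), 5);
    assert_eq!(safe_chunk_size(16, 1, 100, 1), 15);
}
```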
Arthur Meyre
0e6423820f feat(tfhe): add possibility to expand a ciphertext without verifying it 2024-09-13 14:59:21 +02:00
Arthur Meyre
c45ee6a236 chore(wasm): add missing (?) wasm_bindgen annotation 2024-09-13 14:59:21 +02:00
Arthur Meyre
cf7b21f1af chore(integer): fix an error message string referring to shortint 2024-09-13 14:59:21 +02:00
Arthur Meyre
f9026f1563 feat(zk): recompute big d in zk v1 to be more efficient when k < k_max 2024-09-13 14:21:00 +02:00
Nicolas Sarlin
95ab73cbaa chore(zk): add some comments to the zk pke v2 proof 2024-09-13 13:01:30 +02:00
Arthur Meyre
35faaef431 chore: bump version to 0.8.0-alpha.6 2024-09-13 10:25:03 +02:00
Arthur Meyre
a2ae1a4440 feat(zk): manage D as an upper bound as in the report
- allows proving fewer slots than the CRS can handle
2024-09-13 10:24:32 +02:00
David Testé
077d5727da chore(bench): make compression benchmarks available for database 2024-09-13 10:04:51 +02:00
Agnes Leroy
8314e7d47c chore(gpu): return if chunk_size is 0 2024-09-12 17:26:13 +02:00
Agnes Leroy
9dca245946 fix(gpu): return early in sum_ct if num radix is 2, pass different pointers to smart copy 2024-09-12 17:26:13 +02:00
Agnes Leroy
345f25c5c3 chore(gpu): fix partial sum ct with 0 or 1 inputs in the vec
Also refactor the interface for Hillis & Steele prefix sum
2024-09-12 17:26:13 +02:00
tmontaigu
c6756748f7 feat(integer): improve comparison algorithm
Use subtraction to do comparisons lt/le/gt/ge
2024-09-12 15:48:02 +02:00
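The idea on clear integers, as a toy analogue (this is not the homomorphic implementation, only the arithmetic identity it relies on):

```rust
// x < y exactly when x - y is negative; widening to i128 keeps the toy
// subtraction overflow-free. Homomorphically, the same identity lets one
// subtraction plus a sign extraction serve lt/le/gt/ge.
fn lt_via_sub(x: i64, y: i64) -> bool {
    (x as i128 - y as i128) < 0
}

fn main() {
    assert!(lt_via_sub(-3, 7));
    assert!(!lt_via_sub(7, -3));
    assert!(!lt_via_sub(5, 5)); // le would additionally accept equality
}
```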
Mayeul@Zama
bd21971c84 chore(all): fix new warnings in doctests 2024-09-12 14:20:38 +02:00
Mayeul@Zama
e96ad74006 chore(all): enable all warnings in doctests 2024-09-12 14:20:38 +02:00
Mayeul@Zama
abd87a0f0c chore(integer): remove #![allow(dead_code)] 2024-09-12 14:20:38 +02:00
Arthur Meyre
3875c97574 chore(ci): remove the usage of allow attributes with "reason"
- this is a band-aid fix to be able to publish
2024-09-12 11:34:08 +02:00
Agnes Leroy
6fabe6bab0 chore(gpu): fix templates and refactor radix negation 2024-09-12 09:21:54 +02:00
Arthur Meyre
91171c738d chore: bump version of tfhe to 0.8.0-alpha.5 2024-09-11 18:06:25 +02:00
Arthur Meyre
7bf0dc157d chore: bump tfhe-zk-pok version to 0.3.0-alpha.1 2024-09-11 18:06:25 +02:00
Arthur Meyre
0612ef5be5 feat(integer): plug metadata into lower level ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
aee4c1ed18 feat(shortint): plug metadata API in the lower level ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
e2a3ef151a feat(core): plug metadata into ZK APIs 2024-09-11 18:06:25 +02:00
Arthur Meyre
6f77bea5e0 feat(zk): add metadata management to v1
- the proof function takes an additional u8 slice which is hashed into the proof;
verification cannot happen without the same metadata being provided
again
2024-09-11 18:06:25 +02:00
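A toy model of the binding contract (the real entry points live in the zk/shortint/integer APIs; this only illustrates "same metadata bytes on both sides or verification fails"):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// The proof transcript absorbs the metadata, so a verifier supplying
// different bytes computes a different transcript and rejects.
fn transcript_digest(statement: &[u8], metadata: &[u8]) -> u64 {
    let mut hasher = DefaultHasher::new();
    statement.hash(&mut hasher);
    metadata.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    let at_proving = transcript_digest(b"compact list", b"session-42");
    assert_eq!(at_proving, transcript_digest(b"compact list", b"session-42"));
    assert_ne!(at_proving, transcript_digest(b"compact list", b"other"));
}
```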
Arthur Meyre
e4f72dab30 chore(ci): make a check for wasm bindings with and without zk-pok 2024-09-11 18:06:25 +02:00
Arthur Meyre
7ed3fded4a chore(ci): the detect handles option from jest is freezing the runner
- trying to find the cause is making the problem worse, reverting
2024-09-11 17:25:40 +02:00
David Testé
488c942a3a refactor(shortint): move parameters set to their own directory
This is done to ease automatic parameters updates.
2024-09-11 13:54:23 +02:00
Mayeul@Zama
c0d98394fa refactor(integer): add compression key types 2024-09-11 13:53:04 +02:00
Mayeul@Zama
93ff6992e2 refactor(all): refactor oprf integer and hl APIs 2024-09-11 10:49:39 +02:00
Pedro Alves
2a4026c761 fix(gpu): fix some edge-cases (and booleans) on compression 2024-09-10 23:11:20 +02:00
Pedro Alves
39c424b14d chore(gpu): add debug/release modes 2024-09-09 14:02:10 +02:00
Guillermo Oyarzun
46a7a3b43b refactor(gpu): avoid synchronizations in the keybundle 2024-09-09 14:01:15 +02:00
Mayeul@Zama
38b5759e88 chore(all): fix new lints 2024-09-09 11:57:45 +02:00
Mayeul@Zama
d6f8e59394 chore(all): update toolchain 2024-09-09 11:57:45 +02:00
dependabot[bot]
a95db07003 chore(deps): bump tj-actions/changed-files from 45.0.0 to 45.0.1
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 45.0.0 to 45.0.1.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](40853de9f8...e9772d1404)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-09 11:05:21 +02:00
David Testé
6544e6f6a3 chore(ci): use python script to send benchmark results
Using this script simplifies writing the corresponding workflow step.
Moreover, when an upload fails it now translates into a workflow
failure.
2024-09-09 11:04:06 +02:00
Agnes Leroy
1d549dfd8a chore(gpu): pass over all cuda bind 2024-09-06 17:47:59 +02:00
Arthur Meyre
019548daa5 chore(ci): add a flag to jest to indicate what might be stuck when running 2024-09-06 17:41:22 +02:00
Arthur Meyre
26b666955a chore(ci): timeout wasm bench and test at the GitHub runner level
- avoids a stuck runner for 6 hours
- actions timeouts are slightly larger than the test runner timeout to
have a chance to get a log out
2024-09-06 17:41:22 +02:00
Arthur Meyre
ce9da12e65 feat(zk): implement faster pke proof
- original work by Sarah El kazdadi

co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>
2024-09-06 14:25:57 +02:00
Arthur Meyre
32b45ac4bc chore(js): increase timeout for ZK test as it can be surpassed
- this seemed to cause the test runner to hang forever
- also add a timeout in the GitHub workflow, to avoid having the test
runner wait forever (or in this case 6 hours because of default timeout)
2024-09-06 14:19:07 +02:00
Arthur Meyre
26055b236e feat(tfhe): allow unpacking packed compact ciphertext lists in js/wasm 2024-09-06 14:19:07 +02:00
Agnes Leroy
ce9e355c15 chore(gpu): reduce the amount of weekly multi-gpu bench 2024-09-06 11:55:34 +02:00
tmontaigu
85cc638c62 chore(gpu): fix bad merge 2024-09-06 10:21:00 +02:00
Agnes Leroy
d454b5386b chore(gpu): remove device synchronization in drop for CudaVec 2024-09-05 14:13:06 +02:00
tmontaigu
426f3bd192 feat(hlapi): add tag system
The `Tag` allows storing bytes alongside entities (keys and ciphertexts);
the main purpose of this system is to tag / identify ciphertexts with their keys.

* When encrypted, a ciphertext gets the tag of the key used to encrypt it.
* Ciphertexts resulting from operations (add, sub, etc.) get the tag from the ServerKey used.
* A PublicKey gets its tag from the ClientKey that was used to create it.
* A ServerKey gets its tag from the ClientKey that was used to create it.

Users can change the tag of any entity at any point.

BREAKING CHANGE: Many of the into_raw_parts and from_raw_parts functions changed
to accommodate the addition of the `Tag`.
2024-09-05 10:32:35 +02:00
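A short sketch of the propagation rules above (the `tag`/`tag_mut` accessors and the byte-oriented `Tag` methods are assumptions):

```rust
use tfhe::prelude::*;
use tfhe::{generate_keys, ConfigBuilder, FheUint8};

fn main() {
    let (mut client_key, _server_key) = generate_keys(ConfigBuilder::default().build());

    // Tag the client key; ciphertexts encrypted under it inherit the tag.
    client_key.tag_mut().set_data(b"tenant-7");

    let ct = FheUint8::encrypt(3u8, &client_key);
    assert_eq!(ct.tag().data(), b"tenant-7");
}
```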
tmontaigu
4c707e79d8 feat(hlapi): bind cuda's trailing/leading_ones/zeros, ilog2 2024-09-04 19:38:14 +02:00
Arthur Meyre
e1afb8126d chore: bump version to 0.8.0-alpha.4 2024-09-04 17:30:43 +02:00
Agnes Leroy
0d1ef0af7e chore(gpu): add ilog2 bench 2024-09-04 17:03:20 +02:00
Arthur Meyre
15e3474cda feat(pbs): slightly improve f64 pbs perf
co-authored-by: sarah el kazdadi <sarah.elkazdadi@zama.ai>
2024-09-03 19:31:14 +02:00
Arthur Meyre
10be6f9423 chore(ci): update node project packages 2024-09-03 17:14:36 +02:00
489 changed files with 34018 additions and 18480 deletions

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,9 +44,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -75,7 +76,7 @@ jobs:
echo "branch=${BRANCH}" >> "${GITHUB_OUTPUT}"
- name: Clone test data
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
repository: zama-ai/tfhe-backward-compat-data
@@ -103,7 +104,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,6 +26,7 @@ jobs:
outputs:
csprng_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.csprng_any_changed }}
zk_pok_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.zk_pok_any_changed }}
+ versionable_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.versionable_any_changed }}
core_crypto_test: ${{ env.IS_PULL_REQUEST == 'false' ||
steps.changed-files.outputs.core_crypto_any_changed ||
steps.changed-files.outputs.dependencies_any_changed }}
@@ -50,13 +51,13 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
- uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+ uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -64,10 +65,15 @@ jobs:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
+ - utils/tfhe-versionable/**
+ - utils/tfhe-versionable-derive/**
csprng:
- concrete-csprng/**
zk_pok:
- tfhe-zk-pok/**
+ versionable:
+ - utils/tfhe-versionable/**
+ - utils/tfhe-versionable-derive/**
core_crypto:
- tfhe/src/core_crypto/**
boolean:
@@ -103,6 +109,7 @@ jobs:
if: ( steps.changed-files.outputs.dependencies_any_changed == 'true' ||
steps.changed-files.outputs.csprng_any_changed == 'true' ||
steps.changed-files.outputs.zk_pok_any_changed == 'true' ||
+ steps.changed-files.outputs.versionable_any_changed == 'true' ||
steps.changed-files.outputs.core_crypto_any_changed == 'true' ||
steps.changed-files.outputs.boolean_any_changed == 'true' ||
steps.changed-files.outputs.shortint_any_changed == 'true' ||
@@ -124,7 +131,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -144,9 +151,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -167,6 +175,11 @@ jobs:
run: |
make test_zk_pok
+ - name: Run tfhe-versionable tests
+ if: needs.should-run.outputs.versionable_test == 'true'
+ run: |
+ make test_versionable
- name: Run core tests
if: needs.should-run.outputs.core_crypto_test == 'true'
run: |
@@ -213,7 +226,7 @@ jobs:
make test_safe_deserialization
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -228,7 +241,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,14 +39,14 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
- uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+ uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -72,7 +72,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,9 +90,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: "false"
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -125,7 +126,7 @@ jobs:
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_unsigned_integer_ci
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -140,7 +141,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,14 +39,14 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
- uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+ uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -72,7 +72,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,9 +90,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: "false"
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -129,7 +130,7 @@ jobs:
AVX512_SUPPORT=ON NO_BIG_PARAMS=${{ env.NO_BIG_PARAMS }} BIG_TESTS_INSTANCE=TRUE make test_signed_integer_ci
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -144,7 +145,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -57,13 +57,13 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
- uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
+ uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -131,7 +131,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -151,9 +151,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -222,7 +223,7 @@ jobs:
make test_kreyvium
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -237,7 +238,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,9 +45,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -58,9 +59,11 @@ jobs:
with:
toolchain: stable
- - name: Install Node
+ - name: Install web resources
run: |
make install_node
+ make install_chrome_browser
+ make install_chrome_web_driver
- name: Run fmt checks
run: |
@@ -72,10 +75,10 @@ jobs:
- name: Run parallel wasm tests
run: |
- make test_web_js_api_parallel_ci
+ make test_web_js_api_parallel_chrome_ci
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -90,7 +93,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -48,9 +48,10 @@ jobs:
continue-on-error: true
steps:
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -104,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -113,16 +114,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -140,7 +133,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,9 +44,10 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -92,7 +93,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -101,16 +102,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -128,7 +121,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,9 +39,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -58,7 +59,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -90,19 +91,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
- if: ${{ always() }}
+ if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -121,7 +114,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
@@ -139,7 +132,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -184,7 +177,7 @@ jobs:
${{ secrets.SLAB_URL }}
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,9 +65,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -134,7 +135,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -143,16 +144,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on downloaded artifact"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
@@ -175,7 +168,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,7 +30,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,9 +68,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -150,7 +151,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -159,16 +160,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
@@ -191,7 +184,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,194 @@
# Run integer benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Integer 2xH100 benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1a.m.
- cron: '0 1 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-integer-full-2-gpu-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: 2-h100
cuda-integer-full-2-gpu-benchmarks:
name: Execute 2xH100 integer benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer_multi_bit]
op_flavor: [default]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-H100x2" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-integer-full-2-gpu-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}
SLACK_MESSAGE: "Integer GPU 2xH100 benchmarks finished with status: ${{ needs.cuda-integer-full-2-gpu-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-integer-full-2-gpu-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-integer-full-2-gpu-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-integer-full-2-gpu-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
- op_flavor: [default, unchecked]
+ op_flavor: [default]
# explicit include-based build matrix, of known valid options
include:
- os: ubuntu-22.04
@@ -72,9 +72,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -115,7 +116,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -129,6 +130,12 @@ jobs:
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
+ # Run these benchmarks only once
+ - name: Run compression benchmarks with AVX512
+ if: matrix.op_flavor == 'default' && matrix.command == 'integer'
+ run: |
+ make bench_integer_compression_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -152,22 +159,14 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-full-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
if: ${{ always() && needs.cuda-integer-full-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -184,7 +183,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -81,9 +81,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
+ token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -173,7 +174,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
+ uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -182,23 +183,14 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-multi-bit-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
if: ${{ always() && needs.cuda-integer-multi-bit-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -215,7 +207,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
- uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
+ uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -82,9 +82,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -125,7 +126,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -172,22 +173,14 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-multi-bit-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
if: ${{ always() && needs.cuda-integer-multi-bit-multi-gpu-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -204,7 +197,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -39,7 +39,7 @@ jobs:
profile: multi-h100
cuda-integer-full-multi-gpu-benchmarks:
name: Execute multi GPU integer benchmarks for all operations flavor
name: Execute multi GPU integer benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
@@ -48,8 +48,8 @@ jobs:
fail-fast: false
max-parallel: 1
matrix:
command: [integer, integer_multi_bit]
op_flavor: [default, unchecked]
command: [integer_multi_bit]
op_flavor: [default]
# Explicit include-based build matrix of known valid options
include:
- os: ubuntu-22.04
@@ -72,9 +72,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -115,7 +116,7 @@ jobs:
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -152,22 +153,14 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-integer-full-multi-gpu-benchmarks ]
runs-on: ubuntu-latest
if: ${{ !success() && !cancelled() }}
if: ${{ always() && needs.cuda-integer-full-multi-gpu-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -184,7 +177,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

206
.github/workflows/benchmark_gpu_l40.yml vendored Normal file
View File

@@ -0,0 +1,206 @@
# Run benchmarks on an L40 VM and return parsed results to Slab CI bot.
name: Cuda benchmarks (L40)
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 1 a.m. (UTC).
- cron: '0 1 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-l40-benchmarks)
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: l40
cuda-l40-benchmarks:
name: Cuda benchmarks (L40)
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 1440 # 24 hours
continue-on-error: true
strategy:
fail-fast: false
max-parallel: 1
matrix:
command: [integer_multi_bit]
op_flavor: [default]
# Explicit include-based build matrix of known valid options
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Set up home
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: nightly
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
{
echo "CUDA_PATH=$CUDA_PATH";
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH";
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc";
} >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
} >> "${GITHUB_ENV}"
- name: Checkout Slab repo
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run benchmarks with AVX512
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}_gpu
- name: Run compression benchmarks with AVX512
run: |
make bench_integer_compression_gpu
- name: Run PBS benchmarks
run: |
make bench_pbs_gpu
- name: Run KS benchmarks
run: |
make bench_ks_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "n3-L40x1" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512 \
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-l40-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-l40-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-l40-benchmarks.result }}
SLACK_MESSAGE: "Cuda benchmarks (L40) finished with status: ${{ needs.cuda-l40-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-l40-benchmarks)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, cuda-l40-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-l40-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -62,7 +62,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,9 +87,10 @@ jobs:
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -110,7 +111,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -125,6 +126,12 @@ jobs:
run: |
make BENCH_OP_FLAVOR=${{ matrix.op_flavor }} bench_${{ matrix.command }}
# Run these benchmarks only once
- name: Run compression benchmarks with AVX512
if: matrix.op_flavor == 'default' && matrix.command == 'integer'
run: |
make bench_integer_compression
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
@@ -147,16 +154,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -174,7 +173,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -56,7 +56,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,9 +79,10 @@ jobs:
op_flavor: ${{ fromJson(needs.prepare-matrix.outputs.op_flavor) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -102,7 +103,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -149,16 +150,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -176,7 +169,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,9 +87,10 @@ jobs:
op_flavor: [ default, unchecked ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -110,7 +111,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -147,16 +148,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -174,7 +167,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -33,13 +33,13 @@ jobs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -64,7 +64,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -80,9 +80,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -102,10 +103,15 @@ jobs:
with:
toolchain: nightly
- name: Run benchmarks
- name: Install web resources
run: |
make install_node
make bench_web_js_api_parallel_ci
make install_chrome_browser
make install_chrome_web_driver
- name: Run benchmarks
run: |
make bench_web_js_api_parallel_chrome_ci
- name: Parse results
run: |
@@ -136,7 +142,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -145,16 +151,8 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() }}
@@ -172,7 +170,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -30,13 +30,13 @@ jobs:
zk_pok_changed: ${{ steps.changed-files.outputs.zk_pok_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -65,7 +65,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -84,9 +84,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Get benchmark details
run: |
@@ -107,7 +108,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -144,7 +145,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: zama-ai/slab
path: slab
@@ -153,19 +154,11 @@ jobs:
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh ${{ env.RESULTS_FILENAME }} '${{ secrets.JOB_SECRET }}')"
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @${{ env.RESULTS_FILENAME }} \
${{ secrets.SLAB_URL }}
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ !success() && !cancelled() }}
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -180,7 +173,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,7 +25,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Get actionlint
run: |
@@ -25,3 +25,9 @@ jobs:
- name: Lint workflows
run: |
make lint_workflow
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@0901cf7b71c7ea6261ec69a3dc2bd3f9264f893e # v3.0.12
with:
allowlist: |
slsa-framework/slsa-github-generator

View File

@@ -25,7 +25,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -44,7 +44,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Set up home
run: |
@@ -57,7 +57,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
files_yaml: |
tfhe:
@@ -125,7 +125,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -45,9 +45,10 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -78,7 +79,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -34,9 +34,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
@@ -74,7 +75,7 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
- name: Slack Notification
if: ${{ always() }}
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:

View File

@@ -28,13 +28,13 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -44,7 +44,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -108,9 +108,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -147,6 +148,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
@@ -165,7 +167,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -182,7 +184,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -27,13 +27,13 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -43,7 +43,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -63,7 +63,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -106,9 +106,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -145,6 +146,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend
- name: Run user docs tests
@@ -163,7 +165,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -180,7 +182,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,156 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: TFHE Cuda Backend - Full tests on H100
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
on:
workflow_dispatch:
jobs:
setup-instance:
name: Setup instance (cuda-h100-tests)
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
cuda-tests-linux:
name: CUDA H100 tests
needs: [ setup-instance ]
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# Explicit include-based build matrix of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
env:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
CMAKE_VERSION: 3.29.6
steps:
# Mandatory on hyperstack since a bootable volume is not re-usable yet.
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y checkinstall zlib1g-dev libssl-dev
wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz
tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz
cd cmake-${{ env.CMAKE_VERSION }}
./bootstrap
make -j"$(nproc)"
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Export CUDA variables
if: ${{ !cancelled() }}
run: |
echo "CUDA_PATH=$CUDA_PATH" >> "${GITHUB_ENV}"
echo "$CUDA_PATH/bin" >> "${GITHUB_PATH}"
echo "LD_LIBRARY_PATH=$CUDA_PATH/lib:$LD_LIBRARY_PATH" >> "${GITHUB_ENV}"
echo "CUDACXX=/usr/local/cuda-${{ matrix.cuda }}/bin/nvcc" >> "${GITHUB_ENV}"
# Specify the correct host compilers
- name: Export gcc and g++ variables
if: ${{ !cancelled() }}
run: |
{
echo "CC=/usr/bin/gcc-${{ matrix.gcc }}";
echo "CXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "CUDAHOSTCXX=/usr/bin/g++-${{ matrix.gcc }}";
echo "HOME=/home/ubuntu";
} >> "${GITHUB_ENV}"
- name: Check device is detected
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run core crypto, integer and internal CUDA backend tests
run: |
make test_gpu
- name: Run user docs tests
run: |
make test_user_doc_gpu
- name: Test C API
run: |
make test_c_api_gpu
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Full H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-h100-tests)
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -28,13 +28,13 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -44,7 +44,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -65,7 +65,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -108,9 +108,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -144,6 +145,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi
- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
# No need to test core_crypto and classic PBS in integer since they are already tested on a single GPU.
- name: Run multi-bit CUDA integer tests
run: |
@@ -165,7 +170,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -182,7 +187,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,9 +53,10 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -94,7 +95,7 @@ jobs:
make pcc_gpu
- name: Slack Notification
if: ${{ always() }}
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
@@ -109,7 +110,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -28,13 +28,14 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -44,7 +45,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -65,7 +66,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -108,7 +109,7 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Set up home
run: |
@@ -154,7 +155,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -171,7 +172,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -36,13 +36,13 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -52,7 +52,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -73,7 +73,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,9 +116,10 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Set up home
run: |
@@ -168,7 +169,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-signed-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' }}
if: ${{ always() && needs.cuda-signed-integer-tests.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -185,7 +186,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -28,13 +28,14 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -44,7 +45,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -65,7 +66,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -108,7 +109,7 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Set up home
run: |
@@ -154,7 +155,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' }}
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -171,7 +172,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,13 +35,14 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
token: ${{ secrets.FHE_ACTIONS_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
uses: tj-actions/changed-files@48d8f15b2aaa3d255ca5af3eba4870f807ce6b3c
with:
since_last_remote_commit: true
files_yaml: |
@@ -51,7 +52,7 @@ jobs:
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/shortint/parameters/**
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**.md'
@@ -72,7 +73,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -115,7 +116,7 @@ jobs:
sudo make install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Set up home
run: |
@@ -165,7 +166,7 @@ jobs:
name: Slack Notification
needs: [ setup-instance, cuda-unsigned-integer-tests ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' }}
if: ${{ always() && needs.cuda-unsigned-integer-tests.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
@@ -182,7 +183,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -34,7 +34,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
persist-credentials: 'false'

View File

@@ -36,7 +36,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Prepare package
@@ -74,7 +74,7 @@ jobs:
id-token: write
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: Create NPM version tag

View File

@@ -17,7 +17,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0

View File

@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0

View File

@@ -29,14 +29,14 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: gpu-test
profile: gpu-build
publish-cuda-release:
name: Publish CUDA Release
@@ -54,7 +54,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
uses: zama-ai/slab-github-runner@c0e7168795bd78f61f61146951ed9d0c73c9b701
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -18,7 +18,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0

View File

@@ -17,10 +17,10 @@ jobs:
runs-on: large_ubuntu_16
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
- name: Checkout lattice-estimator
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938
with:
fetch-depth: 0
- name: git-sync

2
.gitignore vendored
View File

@@ -26,6 +26,8 @@ backends/tfhe-cuda-backend/cuda/cmake-build-debug/
# WASM tests
tfhe/web_wasm_parallel_tests/server.PID
venv/
web-test-runner/
# Dir used for backward compatibility test data
tfhe/tfhe-backward-compat-data/

159
Makefile
View File

@@ -21,10 +21,12 @@ BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=22.6
FORWARD_COMPAT?=OFF
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
BACKWARD_COMPAT_DATA_BRANCH?=v0.2
BACKWARD_COMPAT_DATA_PROJECT=tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_DIR=$(BACKWARD_COMPAT_DATA_PROJECT)
TFHE_SPEC:=tfhe
WEB_RUNNER_DIR=web-test-runner
WEB_SERVER_DIR=tfhe/web_wasm_parallel_tests
# This is done to avoid forgetting it; we still spell out RUSTFLAGS in the commands so that
# a command can be copy-pasted into a terminal and changed if required without forgetting the flags
export RUSTFLAGS?=-C target-cpu=native
@@ -146,6 +148,43 @@ install_tfhe_lints:
(cd utils/cargo-tfhe-lints-inner && cargo install --path .) && \
cd utils/cargo-tfhe-lints && cargo install --path .
.PHONY: install_typos_checker # Install typos checker
install_typos_checker: install_rs_build_toolchain
@typos --version > /dev/null 2>&1 || \
cargo $(CARGO_RS_BUILD_TOOLCHAIN) install typos-cli || \
( echo "Unable to install typos-cli, unknown error." && exit 1 )
.PHONY: setup_venv # Setup Python virtualenv for wasm tests
setup_venv:
python3 -m venv venv
@source venv/bin/activate && \
pip3 install -r ci/webdriver_requirements.txt
# This is an internal target, not meant to be called on its own.
install_web_resource:
wget -P $(dest) $(url)
@cd $(dest) && \
echo "$(checksum) $(filename)" > checksum && \
sha256sum -c checksum && \
rm checksum && \
unzip $(filename)
install_chrome_browser: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chrome-linux64.zip"
install_chrome_browser: checksum = "c5d7da679f3a353ae4e4420ab113de06d4bd459152f5b17558390c02d9520566"
install_chrome_browser: dest = "$(WEB_RUNNER_DIR)/chrome"
install_chrome_browser: filename = "chrome-linux64.zip"
.PHONY: install_chrome_browser # Install Chrome browser for Linux
install_chrome_browser: install_web_resource
install_chrome_web_driver: url = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chromedriver-linux64.zip"
install_chrome_web_driver: checksum = "f041092f403fb7455a6da2871070b6587c32814a3e3c2b0a794d3d4aa4739151"
install_chrome_web_driver: dest = "$(WEB_RUNNER_DIR)/chrome"
install_chrome_web_driver: filename = "chromedriver-linux64.zip"
.PHONY: install_chrome_web_driver # Install Chrome web driver for Linux
install_chrome_web_driver: install_web_resource
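
The two targets above instantiate the internal install_web_resource recipe with a pinned URL and SHA-256 checksum. For illustration, here is the same fetch-verify-extract flow written out in Python, reusing the chromedriver values pinned in the Makefile (a sketch; the wget/sha256sum/unzip recipe above is the authoritative one):

```python
# Fetch, checksum-verify and unpack chromedriver, mirroring install_web_resource.
# URL, checksum and destination are the values pinned in the Makefile above.
import hashlib
import urllib.request
import zipfile
from pathlib import Path

URL = "https://storage.googleapis.com/chrome-for-testing-public/128.0.6613.137/linux64/chromedriver-linux64.zip"
SHA256 = "f041092f403fb7455a6da2871070b6587c32814a3e3c2b0a794d3d4aa4739151"
DEST = Path("web-test-runner/chrome")

DEST.mkdir(parents=True, exist_ok=True)
archive = DEST / "chromedriver-linux64.zip"
urllib.request.urlretrieve(URL, archive)

digest = hashlib.sha256(archive.read_bytes()).hexdigest()
if digest != SHA256:
    raise SystemExit(f"checksum mismatch: {digest}")

with zipfile.ZipFile(archive) as zf:
    zf.extractall(DEST)
```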
.PHONY: check_linelint_installed # Check if linelint newline linter is installed
check_linelint_installed:
@printf "\n" | linelint - > /dev/null 2>&1 || \
@@ -207,6 +246,10 @@ check_fmt_js: check_nvm_installed
nvm use $(NODE_VERSION) && \
$(MAKE) -C tfhe/web_wasm_parallel_tests check_fmt
.PHONY: check_typos # Check for typos in codebase
check_typos: install_typos_checker
@typos && echo "No typos found"
.PHONY: clippy_gpu # Run clippy lints on tfhe with "gpu" enabled
clippy_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
@@ -214,6 +257,13 @@ clippy_gpu: install_rs_check_toolchain
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: check_gpu # Run check on tfhe with "gpu" enabled
check_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" check \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,gpu \
--all-targets \
-p $(TFHE_SPEC)
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
linelint -a .
@@ -252,12 +302,18 @@ clippy_shortint: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),shortint \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),shortint,experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_integer # Run clippy lints enabling the integer features
clippy_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=$(TARGET_ARCH_FEATURE),integer,experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy # Run clippy lints enabling the boolean, shortint, integer
clippy: install_rs_check_toolchain
@@ -284,6 +340,9 @@ clippy_c_api: install_rs_check_toolchain
.PHONY: clippy_js_wasm_api # Run clippy lints enabling the boolean, shortint, integer and the js wasm API
clippy_js_wasm_api: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api,zk-pok \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,high-level-client-js-wasm-api \
-p $(TFHE_SPEC) -- --no-deps -D warnings
@@ -303,6 +362,9 @@ clippy_all_targets: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok \
-p $(TFHE_SPEC) -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,internal-keycache,zk-pok,experimental \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: clippy_concrete_csprng # Run clippy lints on concrete-csprng
clippy_concrete_csprng: install_rs_check_toolchain
@@ -315,9 +377,17 @@ clippy_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-zk-pok -- --no-deps -D warnings
.PHONY: clippy_versionable # Run clippy lints on tfhe-versionable
clippy_versionable: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-versionable-derive -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-versionable -- --no-deps -D warnings
.PHONY: clippy_all # Run all clippy targets
clippy_all: clippy_rustdoc clippy clippy_boolean clippy_shortint clippy_integer clippy_all_targets \
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium
clippy_c_api clippy_js_wasm_api clippy_tasks clippy_core clippy_concrete_csprng clippy_zk_pok clippy_trivium \
clippy_versionable
.PHONY: clippy_fast # Run main clippy targets
clippy_fast: clippy_rustdoc clippy clippy_all_targets clippy_c_api clippy_js_wasm_api clippy_tasks \
@@ -331,7 +401,7 @@ clippy_cuda_backend: install_rs_check_toolchain
.PHONY: tfhe_lints # Run custom tfhe-rs lints
tfhe_lints: install_tfhe_lints
cd tfhe && RUSTFLAGS="$(RUSTFLAGS)" cargo tfhe-lints \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer -- -D warnings
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,integer,zk-pok -- -D warnings
.PHONY: build_core # Build core_crypto without experimental features
build_core: install_rs_build_toolchain install_rs_check_toolchain
@@ -419,6 +489,7 @@ build_web_js_api_parallel: install_rs_check_toolchain install_wasm_pack
-- --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api,integer-client-js-wasm-api,parallel-wasm-api,zk-pok \
-Z build-std=panic_abort,std && \
find pkg/snippets -type f -iname workerHelpers.worker.js -exec sed -i "s|from '..\/..\/..\/';|from '..\/..\/..\/tfhe.js';|" {} \;
jq '.files += ["snippets"]' tfhe/pkg/package.json > tmp_pkg.json && mv -f tmp_pkg.json tfhe/pkg/package.json
.PHONY: build_node_js_api # Build the js API targeting nodejs
build_node_js_api: install_rs_build_toolchain install_wasm_pack
@@ -481,6 +552,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::
.PHONY: test_integer_compression_gpu
test_integer_compression_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compressed_ciphertext_list::tests::
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress
.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
@@ -734,7 +812,7 @@ test_zk_pok: install_rs_build_toolchain
.PHONY: test_versionable # Run tests for tfhe-versionable subcrate
test_versionable: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
-p tfhe-versionable
--all-targets -p tfhe-versionable
# The backward compat data repo holds historical binary data but also rust code to generate and load them.
# Here we use the "patch" functionality of Cargo to make sure the repo used for the data is the same as the one used for the code.
@@ -742,7 +820,7 @@ test_versionable: install_rs_build_toolchain
test_backward_compatibility_ci: install_rs_build_toolchain
TFHE_BACKWARD_COMPAT_DATA_DIR="$(BACKWARD_COMPAT_DATA_DIR)" RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
--config "patch.'$(BACKWARD_COMPAT_DATA_URL)'.$(BACKWARD_COMPAT_DATA_PROJECT).path=\"tfhe/$(BACKWARD_COMPAT_DATA_DIR)\"" \
--features=$(TARGET_ARCH_FEATURE),shortint,integer -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
--features=$(TARGET_ARCH_FEATURE),shortint,integer,zk-pok -p $(TFHE_SPEC) test_backward_compatibility -- --nocapture
.PHONY: test_backward_compatibility # Same as test_backward_compatibility_ci but tries to clone the data repo first if needed
test_backward_compatibility: tfhe/$(BACKWARD_COMPAT_DATA_DIR) test_backward_compatibility_ci
@@ -833,18 +911,35 @@ test_nodejs_wasm_api_in_docker: build_nodejs_test_docker
.PHONY: test_nodejs_wasm_api # Run tests for the nodejs on wasm API
test_nodejs_wasm_api: build_node_js_api
cd tfhe/js_on_wasm_tests && npm run test
cd tfhe/js_on_wasm_tests && npm install && npm run test
.PHONY: test_web_js_api_parallel # Run tests for the web wasm api
test_web_js_api_parallel: build_web_js_api_parallel
$(MAKE) -C tfhe/web_wasm_parallel_tests test
.PHONY: test_web_js_api_parallel_ci # Run tests for the web wasm api
test_web_js_api_parallel_ci: build_web_js_api_parallel
# This is an internal target, not meant to be called on its own.
run_web_js_api_parallel: build_web_js_api_parallel setup_venv
cd $(WEB_SERVER_DIR) && npm install && npm run build
source venv/bin/activate && \
python ci/webdriver.py \
--browser-path $(browser_path) \
--driver-path $(driver_path) \
--browser-kind $(browser_kind) \
--server-cmd "npm run server" \
--server-workdir "$(WEB_SERVER_DIR)" \
--id-pattern $(filter)
test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
test_web_js_api_parallel_chrome: browser_kind = chrome
test_web_js_api_parallel_chrome: filter = Test
.PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api
test_web_js_api_parallel_chrome: run_web_js_api_parallel
.PHONY: test_web_js_api_parallel_chrome_ci # Run tests for the web wasm api
test_web_js_api_parallel_chrome_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) -C tfhe/web_wasm_parallel_tests test-ci
$(MAKE) test_web_js_api_parallel_chrome
.PHONY: no_tfhe_typo # Check we did not invert the h and f in tfhe
no_tfhe_typo:
@@ -862,6 +957,11 @@ dieharder_csprng: install_dieharder build_concrete_csprng
# Benchmarks
#
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
print_doc_bench_parameters:
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache -p tfhe
.PHONY: bench_integer # Run benchmarks for unsigned integer
bench_integer: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) \
@@ -883,6 +983,18 @@ bench_integer_gpu: install_rs_check_toolchain
--bench integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,nightly-avx512 -p $(TFHE_SPEC) --
.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench glwe_packing_compression-integer-bench \
--features=$(TARGET_ARCH_FEATURE),integer,internal-keycache,gpu -p $(TFHE_SPEC) --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=MULTI_BIT \
@@ -981,15 +1093,20 @@ bench_ks_gpu: install_rs_check_toolchain
--bench ks-bench \
--features=$(TARGET_ARCH_FEATURE),boolean,shortint,gpu,internal-keycache,nightly-avx512 -p $(TFHE_SPEC)
.PHONY: bench_web_js_api_parallel # Run benchmarks for the web wasm api
bench_web_js_api_parallel: build_web_js_api_parallel
$(MAKE) -C tfhe/web_wasm_parallel_tests bench
bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
bench_web_js_api_parallel_chrome: browser_kind = chrome
bench_web_js_api_parallel_chrome: filter = Bench
.PHONY: bench_web_js_api_parallel_ci # Run benchmarks for the web wasm api
bench_web_js_api_parallel_ci: build_web_js_api_parallel
.PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
bench_web_js_api_parallel_chrome: run_web_js_api_parallel
.PHONY: bench_web_js_api_parallel_chrome_ci # Run benchmarks for the web wasm api
bench_web_js_api_parallel_chrome_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) -C tfhe/web_wasm_parallel_tests bench-ci
$(MAKE) bench_web_js_api_parallel_chrome
#
# Utility tools
@@ -1037,7 +1154,7 @@ parse_wasm_benchmarks: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
--example wasm_benchmarks_parser \
--features=$(TARGET_ARCH_FEATURE),shortint,internal-keycache \
-- web_wasm_parallel_tests/test/benchmark_results
-- wasm_benchmark_results.json
.PHONY: write_params_to_file # Gather all crypto parameters into a file with a Sage readable format.
write_params_to_file: install_rs_check_toolchain
@@ -1076,14 +1193,14 @@ sha256_bool: install_rs_check_toolchain
--features=$(TARGET_ARCH_FEATURE),boolean
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
pcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested check_intra_md_links \
pcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested check_intra_md_links \
clippy_all tfhe_lints check_compile_tests
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
pcc_gpu: clippy_gpu clippy_cuda_backend check_compile_tests_benches_gpu
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_fmt lint_doc check_md_docs_are_tested clippy_fast \
fpcc: no_tfhe_typo no_dbg_log check_fmt check_typos lint_doc check_md_docs_are_tested clippy_fast \
check_compile_tests
.PHONY: conformance # Automatically fix problems that can be fixed

_typos.toml Normal file
View File

@@ -0,0 +1,15 @@
[default]
extend-ignore-identifiers-re = [
# Related to serialized object
"ser",
"unser",
# Used when dumping tfhe-rs parameters set into Sage format
"ND.*",
# Related to FHE strings example handling "banana"
"ba",
"enc_ba",
# Example with string replacing "hello" with "herlo"
"herlo",
# Example in trivium
"C9217BA0D762ACA1"
]

View File

@@ -1,3 +1,5 @@
#![allow(clippy::too_long_first_doc_paragraph)]
mod static_deque;
mod kreyvium;

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.4.0-alpha.0"
version = "0.4.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -67,9 +67,21 @@ endif()
add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})
# Check whether the build type is Debug
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# Debug mode
message("Compiling in Debug mode")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
else()
# Release mode
message("Compiling in Release mode")
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O3")
endif()
# in production, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

View File

@@ -18,7 +18,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension,
uint32_t polynomial_size);
};

View File

@@ -8,7 +8,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
@@ -17,8 +17,9 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -27,7 +28,7 @@ void cuda_integer_compress_radix_ciphertext_64(
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
@@ -71,7 +72,8 @@ template <typename Torus> struct int_compression {
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
@@ -94,41 +96,45 @@ template <typename Torus> struct int_decompression {
uint32_t storage_log_modulus;
uint32_t num_radix_blocks;
uint32_t body_count;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
uint32_t *tmp_indexes_array;
int_radix_lut<Torus> *carry_extract_lut;
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_radix_blocks = num_radix_blocks;
this->body_count = body_count;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
Torus lwe_accumulator_size = (compression_params.glwe_dimension *
compression_params.polynomial_size +
1);
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
tmp_indexes_array = (uint32_t *)cuda_malloc_async(
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Decompression
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0]);
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
return x / encryption_params.message_modulus;
@@ -148,9 +154,10 @@ template <typename Torus> struct int_decompression {
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
delete (carry_extract_lut);
delete carry_extract_lut;
}
};
#endif
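For intuition on the carry-extract LUT defined above: each radix block stores its message and carry stacked as value = carry * message_modulus + message, so x / message_modulus recovers the carry. A minimal host-side sketch — the modulus value 4 is illustrative (2 message bits, 2 carry bits), not taken from a specific parameter set:

// Hedged illustration of the carry-extract function used in int_decompression.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t message_modulus = 4; // illustrative value
  auto carry_extract_f = [=](uint64_t x) -> uint64_t {
    return x / message_modulus; // same shape as the LUT above
  };
  // A block holding 13 = 3 * 4 + 1 has carry 3 and message 1.
  assert(carry_extract_f(13) == 3);
  assert(13 % message_modulus == 1);
  return 0;
}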

View File

@@ -39,16 +39,15 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index);
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
@@ -62,9 +61,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);
void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
}
int cuda_get_max_shared_memory(uint32_t gpu_index);
}
bool cuda_check_support_cooperative_groups();
bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,

View File

@@ -8,7 +8,7 @@ extern std::mutex m;
extern bool p2p_enabled;
extern "C" {
int cuda_setup_multi_gpu();
int32_t cuda_setup_multi_gpu();
}
// Define a variant type that can be either a vector or a single pointer

View File

@@ -80,6 +80,11 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cuda_apply_many_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t num_luts, uint32_t lut_stride);
void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -112,10 +117,11 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int8_t **mem_ptr_void);
void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus);
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus);
void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
@@ -385,8 +391,8 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift);
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift);
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
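For context, the Hillis-Steele scheme named by these entry points is an inclusive prefix scan that finishes in ceil(log2(n)) steps, each step combining every element with the one `shift` positions earlier. A minimal single-block CUDA sketch, with plain integer addition standing in for the homomorphic combine (that substitution is an assumption made purely for illustration):

// Single-block Hillis-Steele inclusive scan; '+' stands in for the
// homomorphic operator applied to ciphertext blocks in the real kernel.
__global__ void hillis_steele_scan(int *data, int n) {
  extern __shared__ int buf[];
  int tid = threadIdx.x;
  if (tid < n) buf[tid] = data[tid];
  __syncthreads();
  for (int shift = 1; shift < n; shift <<= 1) {
    int v = 0;
    if (tid >= shift && tid < n) v = buf[tid - shift]; // read old value
    __syncthreads();
    if (tid >= shift && tid < n) buf[tid] += v;        // then update
    __syncthreads();
  }
  if (tid < n) data[tid] = buf[tid];
}
// Launch with n <= blockDim.x:
// hillis_steele_scan<<<1, n, n * sizeof(int)>>>(d_data, n);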
@@ -970,28 +976,52 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
(params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_bits, 0,
bits_per_block * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_shift_bits = (Torus *)cuda_malloc_async(
max_num_bits_that_tell_shift * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_shift_bits, 0,
max_num_bits_that_tell_shift * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_rotated = (Torus *)cuda_malloc_async(
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_rotated, 0,
bits_per_block * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_input_bits_a = (Torus *)cuda_malloc_async(
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_input_bits_a, 0,
bits_per_block * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_input_bits_b = (Torus *)cuda_malloc_async(
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_input_bits_b, 0,
bits_per_block * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_mux_inputs = (Torus *)cuda_malloc_async(
bits_per_block * num_radix_blocks * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(tmp_mux_inputs, 0,
bits_per_block * num_radix_blocks *
(params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto mux_lut_f = [](Torus x) -> Torus {
// x is expected to be x = 0bcba
@@ -1151,6 +1181,11 @@ template <typename Torus> struct int_sc_prop_memory {
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
step_output = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(generates_or_propagates, 0,
num_radix_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
// declare functions for lut generation
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1267,6 +1302,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
step_output = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, streams[0], gpu_indexes[0]);
cuda_memset_async(generates_or_propagates, 0,
num_radix_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
cuda_memset_async(step_output, 0, num_radix_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
// declare functions for lut generation
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
@@ -1356,6 +1396,7 @@ template <typename Torus> struct int_overflowing_sub_memory {
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
Torus *new_blocks;
Torus *new_blocks_copy;
Torus *old_blocks;
Torus *small_lwe_vector;
int_radix_params params;
@@ -1383,17 +1424,40 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
new_blocks = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
new_blocks_copy = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
old_blocks = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
small_lwe_vector = (Torus *)cuda_malloc_async(
max_pbs_count * (params.small_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(new_blocks, 0,
max_pbs_count * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(new_blocks_copy, 0,
max_pbs_count * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(old_blocks, 0,
max_pbs_count * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(small_lwe_vector, 0,
max_pbs_count * (params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
d_smart_copy_in = (int32_t *)cuda_malloc_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
d_smart_copy_out = (int32_t *)cuda_malloc_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
streams[0], gpu_indexes[0]);
cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
streams[0], gpu_indexes[0]);
}
int_sum_ciphertexts_vec_memory(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1414,11 +1478,22 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = (Torus *)cuda_malloc_async(
max_pbs_count * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0]);
cuda_memset_async(new_blocks_copy, 0,
max_pbs_count * (params.big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
d_smart_copy_in = (int32_t *)cuda_malloc_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
d_smart_copy_out = (int32_t *)cuda_malloc_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0]);
cuda_memset_async(d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t),
streams[0], gpu_indexes[0]);
cuda_memset_async(d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t),
streams[0], gpu_indexes[0]);
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -1432,8 +1507,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
cuda_drop_async(small_lwe_vector, streams[0], gpu_indexes[0]);
}
cuda_drop_async(new_blocks_copy, streams[0], gpu_indexes[0]);
scp_mem->release(streams, gpu_indexes, gpu_count);
delete scp_mem;
}
};
@@ -2087,7 +2162,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
if (allocate_gpu_memory) {
Torus total_modulus = params.message_modulus * params.carry_modulus;
uint32_t max_value = total_modulus - 1;
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_block_accumulated = (Torus *)cuda_malloc_async(
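The corrected bound above reflects that each accumulated block can contribute up to message_modulus - 1 rather than 1, so the headroom total_modulus - 1 is divided by that contribution to get the chunk capacity. A hedged numeric check (the 4/4 parameter values are illustrative):

// Illustrative check of the chunk capacity computed above.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t message_modulus = 4, carry_modulus = 4;       // examples
  const uint32_t total_modulus = message_modulus * carry_modulus;         // 16
  const uint32_t max_value = (total_modulus - 1) / (message_modulus - 1); // 15 / 3 = 5
  const uint32_t num_radix_blocks = 8;
  const int max_chunks = (num_radix_blocks + max_value - 1) / max_value;  // ceil(8 / 5) = 2
  printf("max_value=%u max_chunks=%d\n", max_value, max_chunks);
  return 0;
}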

View File

@@ -69,7 +69,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
bool do_modulus_switch);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -77,18 +78,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
bool do_modulus_switch);
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
int8_t **pbs_buffer);
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count);
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count);
}
template <typename Torus>
@@ -339,7 +333,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template <typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -348,7 +343,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
#if (CUDA_ARCH >= 900)
template <typename Torus>
@@ -358,7 +354,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template <typename Torus>
void scratch_cuda_programmable_bootstrap_tbc(

View File

@@ -17,8 +17,7 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
@@ -27,7 +26,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
@@ -47,8 +47,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
template <typename Torus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
@@ -58,7 +57,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
#endif
template <typename Torus>
@@ -74,13 +74,13 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
@@ -90,7 +90,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride);
template <typename Torus>
uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle(
@@ -126,7 +127,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
int8_t *d_mem_acc_step_two = NULL;
int8_t *d_mem_acc_cg = NULL;
int8_t *d_mem_acc_tbc = NULL;
uint32_t lwe_chunk_size;
double2 *keybundle_fft;
Torus *global_accumulator;
double2 *global_accumulator_fft;
@@ -138,6 +139,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
this->pbs_variant = pbs_variant;
this->lwe_chunk_size = lwe_chunk_size;
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
// default

View File

@@ -23,7 +23,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
void *lwe_array_out, void *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension,
uint32_t polynomial_size) {
@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
case 256:
host_sample_extract<uint64_t, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 512:
host_sample_extract<uint64_t, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 1024:
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 2048:
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 4096:
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 8192:
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
case 16384:
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
glwe_dimension);
break;
default:

View File

@@ -41,7 +41,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
auto nth = nth_array[input_id];
// nth is ensured to be in [0, lwe_per_glwe)
auto nth = nth_array[input_id] % lwe_per_glwe;
sample_extract_mask<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
sample_extract_body<Torus, params>(lwe_out, glwe_in, glwe_dimension, nth);
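A hedged host-side walk-through of the indexing above: input_id / lwe_per_glwe picks the source GLWE, and the new modulo clamps the requested coefficient into [0, lwe_per_glwe); the degree value 1024 below is illustrative.

// Illustration of the index clamping added above; lwe_per_glwe mirrors
// params::degree in the kernel, values are examples.
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t lwe_per_glwe = 1024;
  const uint32_t nth_array[] = {0, 1023, 1024, 2049};
  for (uint32_t input_id = 0; input_id < 4; ++input_id) {
    uint32_t glwe_id = input_id / lwe_per_glwe;        // which GLWE is read
    uint32_t nth = nth_array[input_id] % lwe_per_glwe; // clamped coefficient
    printf("input %u -> glwe %u, nth %u\n", input_id, glwe_id, nth);
  }
  return 0;
}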

View File

@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
host_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
host_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -66,7 +66,7 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_64(
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
host_packing_keyswitch_lwe_list_to_glwe(
host_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),

View File

@@ -280,6 +280,12 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {
if (num_lwes > polynomial_size)
PANIC("Cuda error: too many LWEs to pack. The number of LWEs should be "
"smaller than "
"polynomial_size.")
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
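The new PANIC encodes a structural limit: a packing keyswitch lands each LWE in a distinct coefficient slot of one GLWE, and a GLWE of polynomial size N only has N slots. A hedged host-side analogue of the same pre-flight check:

// Mirrors the PANIC above: at most polynomial_size LWEs fit in one GLWE.
#include <cstdint>
#include <stdexcept>

void check_packing_fits(uint32_t num_lwes, uint32_t polynomial_size) {
  if (num_lwes > polynomial_size)
    throw std::invalid_argument(
        "too many LWEs to pack: need num_lwes <= polynomial_size");
}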

View File

@@ -1,10 +1,16 @@
#ifndef CNCRT_TORUS_CUH
#define CNCRT_TORUS_CUH
#include "polynomial/parameters.cuh"
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
template <typename T>
__host__ __device__ __forceinline__ constexpr double get_two_pow_torus_bits() {
return (sizeof(T) == 4) ? 4294967296.0 : 18446744073709551616.0;
}
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
r = T(x);
@@ -27,6 +33,16 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
r = lll;
}
template <typename T>
__device__ inline void typecast_double_round_to_torus(double x, T &r) {
constexpr double mx = get_two_pow_torus_bits<T>();
// floor must be used here because round has an issue with rounding .5,
// as it rounds away from zero.
double frac = x - floor(x);
frac *= mx;
typecast_double_to_torus(round(frac), r);
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
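A hedged host analogue of typecast_double_round_to_torus for the 32-bit case, to make the floor-then-round order concrete (the helper name below is illustrative):

#include <cmath>
#include <cstdint>
#include <cstdio>

uint32_t double_round_to_torus_u32(double x) {
  const double mx = 4294967296.0;   // 2^32, as in get_two_pow_torus_bits
  double frac = x - std::floor(x);  // fractional part, always in [0, 1)
  // round only after isolating the fraction; the final cast wraps 2^32 to 0
  return (uint32_t)(uint64_t)std::llround(frac * mx);
}

int main() {
  printf("%u\n", double_round_to_torus_u32(0.25));  // 1073741824 = 2^30
  printf("%u\n", double_round_to_torus_u32(-0.75)); // same torus point: frac = 0.25
  return 0;
}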

View File

@@ -137,6 +137,30 @@ void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
}
}
/// Copy memory between GPU buffers, on the same device or across devices
void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index) {
if (size == 0)
return;
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
if (attr_dest.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
}
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
if (attr_src.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
}
check_cuda_error(cudaSetDevice(gpu_index));
if (attr_src.device == attr_dest.device) {
check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
} else {
check_cuda_error(
cudaMemcpyPeer(dest, attr_dest.device, src, attr_src.device, size));
}
}
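A hedged usage sketch of the new synchronous helper above: the caller passes two device pointers and a GPU index, and the helper decides between a same-device cudaMemcpy and a cross-device cudaMemcpyPeer (the two-GPU setup below is an assumption):

// Illustrative caller; assumes at least two GPUs are visible.
#include <cuda_runtime.h>
#include <cstdint>

void cuda_memcpy_gpu_to_gpu(void *dest, void *src, uint64_t size,
                            uint32_t gpu_index); // defined above

void copy_across_gpus(size_t n) {
  float *d_src = nullptr, *d_dst = nullptr;
  cudaSetDevice(0);
  cudaMalloc(&d_src, n * sizeof(float));
  cudaSetDevice(1);
  cudaMalloc(&d_dst, n * sizeof(float));
  // Both arguments are device pointers, so the attribute checks pass;
  // the devices differ, so the helper takes the cudaMemcpyPeer path.
  cuda_memcpy_gpu_to_gpu(d_dst, d_src, n * sizeof(float), 1);
  cudaSetDevice(0);
  cudaFree(d_src);
  cudaSetDevice(1);
  cudaFree(d_dst);
}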
/// Synchronizes device
void cuda_synchronize_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
@@ -177,8 +201,8 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<<<num_blocks, block_size, 0, stream>>>(d_array, value,
n);
cuda_set_value_kernel<Torus>
<<<num_blocks, block_size, 0, stream>>>(d_array, value, n);
check_cuda_error(cudaGetLastError());
}
}
@@ -247,14 +271,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
gpu_index);
check_cuda_error(cudaGetLastError());
#if CUDA_ARCH == 900
max_shared_memory = 226000;
#elif CUDA_ARCH == 890
max_shared_memory = 127000;
#elif CUDA_ARCH == 800
max_shared_memory = 163000;
#elif CUDA_ARCH == 700
max_shared_memory = 95000;
#endif
return max_shared_memory;
}
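With the per-arch overrides above removed, the function now returns the driver-reported default per-block limit. A kernel that needs the larger carve-out on supporting devices can raise its own ceiling through the standard opt-in attribute; a hedged sketch (my_kernel is a placeholder):

#include <cuda_runtime.h>

__global__ void my_kernel() {} // placeholder kernel

void raise_shared_memory_limit(int gpu_index) {
  int optin_bytes = 0;
  // Device-wide opt-in ceiling for dynamic shared memory per block.
  cudaDeviceGetAttribute(&optin_bytes, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  // Raise this kernel's dynamic shared memory limit up to that ceiling.
  cudaFuncSetAttribute(my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       optin_bytes);
}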

View File

@@ -6,6 +6,7 @@
#include "twiddles.cuh"
#include "types/complex/operations.cuh"
using Index = unsigned;
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
@@ -31,290 +32,81 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
* full loop, which should increase performance
*/
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
Index tid = threadIdx.x;
double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
// load into registers
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
u[i] = A[tid];
v[i] = A[tid + HALF_DEGREE];
tid += STRIDE;
}
// level 1
// we don't do an actual complex multiplication on level 1 since we have only
// one twiddle; its real and imaginary parts are equal, so we can multiply
// with simpler operations
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
u = A[i1];
v = A[i2] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = v[i] * (double2){0.707106781186547461715008466854,
0.707106781186547461715008466854};
v[i] = u[i] - w;
u[i] = u[i] + w;
}
__syncthreads();
// level 2
// from this level on there is more than one twiddle and none of them has
// equal real and imaginary parts, so a full complex multiplication is needed;
// at each level, params::degree / 2^level is the number of coefficients
// inside each chunk at that level
//
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
Index twiddle_shift = 1;
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
w = negtwiddles[twid_id + 2];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
// from level 8 on, we need to check the size of params::degree: the minimum
// supported actual polynomial size is 256, whose compressed size is halved
// to the minimum supported compressed size of 128, so the first 7 levels of
// the butterfly are always needed; since butterfly levels are hardcoded,
// we must check whether the polynomial size is big enough to require a
// specific level of the butterfly.
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
A[tid] = (u_stays_in_register) ? v[i] : u[i];
tid = tid + STRIDE;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
w = A[tid ^ lane_mask];
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
w = negtwiddles[tid / lane_mask + twiddle_shift];
w = negtwiddles[twid_id + 256];
u = A[i1];
v = A[i2] * w;
w *= v[i];
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
v[i] = u[i] - w;
u[i] = u[i] + w;
tid = tid + STRIDE;
}
__syncthreads();
}
__syncthreads();
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
u = A[i1];
v = A[i2] * w;
A[i1] += v;
A[i2] = u - v;
tid += params::degree / params::opt;
}
__syncthreads();
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
A[tid * 2] = u[i];
A[tid * 2 + 1] = v[i];
tid = tid + STRIDE;
}
__syncthreads();
}
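The rewritten loop above replaces thirteen hardcoded levels with one generic level parameterized by lane_mask and thread_mask: butterfly partners at level l differ in bit l-1 of the thread rank, each thread keeps one operand in registers, and the other operand crosses through shared memory at A[tid ^ lane_mask]. A distilled, hedged sketch of just that exchange (plain doubles stand in for the double2 data; it must be called by every thread of the block, with A in shared memory):

__device__ void butterfly_exchange(double *A, double &u, double &v,
                                   unsigned tid, unsigned l) {
  unsigned lane_mask = 1u << (l - 1);
  unsigned thread_mask = (1u << l) - 1;
  unsigned rank = tid & thread_mask;
  bool u_stays_in_register = rank < lane_mask;
  A[tid] = u_stays_in_register ? v : u; // publish the operand we hand over
  __syncthreads();
  double w = A[tid ^ lane_mask];        // partner's published operand
  u = u_stays_in_register ? u : w;
  v = u_stays_in_register ? w : v;
  __syncthreads();                      // A may now be overwritten again
}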
/*
@@ -329,284 +121,82 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
* full loop, which should increase performance
*/
__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index DEGREE = params::degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
double2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid] /= params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
// none of the twiddles has equal real and imaginary parts, so
// a full complex multiplication has to be done,
// and here we have more than one twiddle;
// the mapping in the backward FFT is reversed:
// the butterfly operation starts from the last level
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
// load into registers and divide by compressed polynomial size
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
u[i] = A[2 * tid];
v[i] = A[2 * tid + 1];
w = negtwiddles[twid_id + 4096];
u = A[i1] - A[i2];
u[i] /= DEGREE;
v[i] /= DEGREE;
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += STRIDE;
}
tid += params::degree / params::opt;
Index twiddle_shift = DEGREE;
for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
twiddle_shift >>= 1;
// at this point registers are ready for the butterfly
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
u[i] += v[i];
v[i] = w * conjugate(negtwiddles[tid / lane_mask + twiddle_shift]);
// keep one of the registers for the next iteration and store the other in shared memory
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
A[tid] = (u_stays_in_register) ? v[i] : u[i];
tid = tid + STRIDE;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
// prepare registers for next butterfly iteration
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
w = A[tid ^ lane_mask];
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
w = negtwiddles[twid_id + 2048];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
tid = tid + STRIDE;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
}
// below level 8, we don't need to check the size of params::degree: the
// minimum supported actual polynomial size is 256, whose compressed size is
// halved to the minimum supported compressed size of 128, so the last 7
// levels of the butterfly are always needed; since butterfly levels are
// hardcoded, no size check is required for these levels.
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
// last iteration
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
u[i] = u[i] + v[i];
v[i] = w * (double2){0.707106781186547461715008466854,
-0.707106781186547461715008466854};
}
__syncthreads();
// level 6
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
}
__syncthreads();
// level 1
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2);
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
i2 = i1 + params::degree / 2;
w = negtwiddles[twid_id + 1];
u = A[i1] - A[i2];
A[i1] += A[i2];
A[i2] = u * conjugate(w);
tid += params::degree / params::opt;
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
A[tid] = u[i];
A[tid + HALF_DEGREE] = v[i];
tid = tid + STRIDE;
}
__syncthreads();
}
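On the inverse path above, the same twiddle table is reused conjugated and the 1/N scaling is folded into the register load, so a direct-then-inverse pass is the identity. A hedged scalar sketch of one inverse butterfly, matching the sum/difference-times-conjugate pattern in both the deleted and the rewritten code:

// Illustrative inverse butterfly: new_u = u + v, new_v = (u - v) * conj(w),
// mirroring "A[i1] += A[i2]; A[i2] = u * conjugate(w)" above.
__device__ double2 conjugate_example(double2 w) {
  return make_double2(w.x, -w.y);
}

__device__ void inverse_butterfly(double2 &u, double2 &v, double2 w) {
  double2 diff = make_double2(u.x - v.x, u.y - v.y);
  u = make_double2(u.x + v.x, u.y + v.y);
  double2 cw = conjugate_example(w);
  v = make_double2(diff.x * cw.x - diff.y * cw.y,
                   diff.x * cw.y + diff.y * cw.x);
}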

View File

@@ -37,12 +37,12 @@ void host_resolve_signed_overflow(
streams[0], gpu_indexes[0], x, last_block_output_carry, d_clears,
mem->params.big_lwe_dimension, 1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x, mem->params.big_lwe_dimension,
1);
host_addition(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, x,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], last_block_inner_propagation,
last_block_inner_propagation, last_block_input_carry,
mem->params.big_lwe_dimension, 1);
host_apply_univariate_lut_kb<Torus>(streams, gpu_indexes, gpu_count, result,
last_block_inner_propagation,
@@ -94,14 +94,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
// phase 1
if (op == SIGNED_OPERATION::ADDITION) {
host_addition(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, rhs,
big_lwe_dimension, num_blocks);
} else {
host_integer_radix_negation(
host_integer_radix_negation<Torus>(
streams, gpu_indexes, gpu_count, neg_rhs, rhs, big_lwe_dimension,
num_blocks, radix_params.message_modulus, radix_params.carry_modulus);
host_addition(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], result, lhs, neg_rhs,
big_lwe_dimension, num_blocks);
}
// phase 2
@@ -109,10 +109,10 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
result, output_carry, input_carries,
mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation(
host_propagate_single_carry<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
host_generate_last_block_inner_propagation<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
&rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
@@ -126,7 +126,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
// phase 3
auto input_carry = &input_carries[(num_blocks - 1) * big_lwe_size];
host_resolve_signed_overflow(
host_resolve_signed_overflow<Torus>(
streams, gpu_indexes, gpu_count, overflowed, last_block_inner_propagation,
input_carry, output_carry, mem_ptr->resolve_overflow_mem, bsks, ksks);

View File

@@ -17,7 +17,7 @@ void scratch_cuda_integer_radix_cmux_kb_64(
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
scratch_cuda_integer_radix_cmux_kb(
scratch_cuda_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_cmux_buffer<uint64_t> **)mem_ptr, predicate_lut_f,
lwe_ciphertext_count, params, allocate_gpu_memory);

View File

@@ -27,10 +27,11 @@ __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
auto lwe_array_out_block = tmp_lwe_array_input + i * big_lwe_size;
auto lwe_array_input_block = lwe_array_input + i * big_lwe_size;
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out_block, predicate->lwe_indexes_in, lwe_array_input_block,
lwe_condition, predicate->lwe_indexes_in, params.big_lwe_dimension,
params.message_modulus, 1);
device_pack_bivariate_blocks<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out_block, predicate->lwe_indexes_in,
lwe_array_input_block, lwe_condition, predicate->lwe_indexes_in,
params.big_lwe_dimension, params.message_modulus, 1);
check_cuda_error(cudaGetLastError());
}
@@ -57,13 +58,15 @@ __host__ void host_integer_radix_cmux_kb(
}
auto mem_true = mem_ptr->zero_if_true_buffer;
zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
zero_out_if<Torus>(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
lwe_array_true, lwe_condition, mem_true,
mem_ptr->inverted_predicate_lut, bsks, ksks,
num_radix_blocks);
auto mem_false = mem_ptr->zero_if_false_buffer;
zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
bsks, ksks, num_radix_blocks);
zero_out_if<Torus>(false_streams, gpu_indexes, gpu_count,
mem_ptr->tmp_false_ct, lwe_array_false, lwe_condition,
mem_false, mem_ptr->predicate_lut, bsks, ksks,
num_radix_blocks);
for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
}
@@ -75,9 +78,9 @@ __host__ void host_integer_radix_cmux_kb(
// will be 0. If the condition was false, true_ct will be 0 and false_ct will
// have kept its value
auto added_cts = mem_ptr->tmp_true_ct;
host_addition(streams[0], gpu_indexes[0], added_cts, mem_ptr->tmp_true_ct,
mem_ptr->tmp_false_ct, params.big_lwe_dimension,
num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], added_cts,
mem_ptr->tmp_true_ct, mem_ptr->tmp_false_ct,
params.big_lwe_dimension, num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,

View File

@@ -43,7 +43,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
// Add all blocks and store in sum
device_accumulate_all_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_accumulate_all_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
output, input, lwe_dimension, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -62,7 +62,6 @@ __host__ void are_all_comparisons_block_true(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
@@ -75,7 +74,7 @@ __host__ void are_all_comparisons_block_true(
auto tmp_out = are_all_block_true_buffer->tmp_out;
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
cuda_memcpy_async_gpu_to_gpu(tmp_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
@@ -96,8 +95,9 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_map =
&are_all_block_true_buffer->is_equal_to_lut_map;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
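The tightened max_value caps how many blocks one accumulated chunk may contain: the running sum must stay below total_modulus even if every summed block holds message_modulus - 1. A quick check with illustrative moduli (assumed here, not taken from the diff):

#include <cstdint>

// Illustrative arithmetic, assuming message_modulus = carry_modulus = 4.
uint32_t chunk_capacity() {
  uint32_t message_modulus = 4, carry_modulus = 4;
  uint32_t total_modulus = message_modulus * carry_modulus; // 16
  // 5 blocks, each at most 3, sum to at most 15 < 16: no overflow
  return (total_modulus - 1) / (message_modulus - 1); // 15 / 3 = 5
}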
@@ -121,9 +121,8 @@ __host__ void are_all_comparisons_block_true(
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
max_value, num_radix_blocks, true);
auto is_equal_to_num_blocks_lut_f = [max_value,
chunk_length](Torus x) -> Torus {
return (x & max_value) == chunk_length;
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], new_lut->get_lut(gpu_indexes[0], 0),
@@ -165,7 +164,6 @@ __host__ void is_at_least_one_comparisons_block_true(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
@@ -174,7 +172,7 @@ __host__ void is_at_least_one_comparisons_block_true(
auto buffer = mem_ptr->eq_buffer->are_all_block_true_buffer;
uint32_t total_modulus = message_modulus * carry_modulus;
uint32_t max_value = total_modulus - 1;
uint32_t max_value = (total_modulus - 1) / (message_modulus - 1);
cuda_memcpy_async_gpu_to_gpu(mem_ptr->tmp_lwe_array_out, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
@@ -192,8 +190,9 @@ __host__ void is_at_least_one_comparisons_block_true(
auto input_blocks = mem_ptr->tmp_lwe_array_out;
auto accumulator = buffer->tmp_block_accumulated;
for (int i = 0; i < num_chunks; i++) {
accumulate_all_blocks(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension, chunk_length);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], accumulator,
input_blocks, big_lwe_dimension,
chunk_length);
accumulator += (big_lwe_dimension + 1);
remaining_blocks -= (chunk_length - 1);
@@ -280,8 +279,8 @@ __host__ void host_compare_with_zero_equality(
uint32_t chunk_size =
std::min(remainder_blocks, num_elements_to_fill_carry);
accumulate_all_blocks(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
accumulate_all_blocks<Torus>(streams[0], gpu_indexes[0], sum_i, chunk,
big_lwe_dimension, chunk_size);
num_sum_blocks++;
remainder_blocks -= (chunk_size - 1);
@@ -295,8 +294,9 @@ __host__ void host_compare_with_zero_equality(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
sum, mem_ptr, bsks, ksks, num_sum_blocks);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sum, mem_ptr, bsks, ksks,
num_sum_blocks);
}
template <typename Torus>
@@ -310,7 +310,7 @@ __host__ void host_integer_radix_equality_check_kb(
// Applies the LUT for the comparison operation
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_1, lwe_array_2,
bsks, ksks, num_radix_blocks, eq_buffer->operator_lut,
eq_buffer->operator_lut->params.message_modulus);
@@ -319,9 +319,9 @@ __host__ void host_integer_radix_equality_check_kb(
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr, bsks, ksks,
num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, comparisons, mem_ptr,
bsks, ksks, num_radix_blocks);
}
template <typename Torus>
@@ -352,19 +352,20 @@ compare_radix_blocks_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_subtraction(streams[0], gpu_indexes[0], lwe_array_out, lwe_array_left,
lwe_array_right, big_lwe_dimension, num_radix_blocks);
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array_out,
lwe_array_left, lwe_array_right, big_lwe_dimension,
num_radix_blocks);
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
num_radix_blocks, is_non_zero_lut);
// Add one
// Here lhs can have the following values:
// (-1) % (message_modulus * carry_modulus), 0, 1. So the output values
// after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
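In the clear, the three steps above (true subtraction, non-zero detection, add one) amount to computing an order indicator per block. A sketch of the intended encoding, assuming 0 = inferior, 1 = equal, 2 = superior:

// Cleartext analogue of compare_radix_blocks_kb (a sketch, not the
// homomorphic circuit): subtract, reduce to a sign, shift into {0, 1, 2}.
int compare_block(int left, int right) {
  int diff = left - right;
  int sign = (diff > 0) - (diff < 0); // -1, 0 or 1
  return sign + 1;                    // 0 = inferior, 1 = equal, 2 = superior
}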
@@ -406,8 +407,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
auto inner_tree_leaf = tree_buffer->tree_inner_leaf_lut;
while (partial_block_count > 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
@@ -433,8 +434,8 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
std::function<Torus(Torus)> f;
if (partial_block_count == 2) {
pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);
f = [block_selector_f, sign_handler_f](Torus x) -> Torus {
int msb = (x >> 2) & 3;
@@ -454,9 +455,9 @@ tree_sign_reduction(cudaStream_t *streams, uint32_t *gpu_indexes,
last_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, lwe_array_out, y,
bsks, ksks, 1, last_lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
last_lut);
}
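Each reduction step packs two sign blocks into one (factor 4, i.e. two bits per sign) and evaluates a LUT on the packed value, which is the shape of the lambda above. A cleartext sketch of one step, assuming the 0/1/2 sign encoding and that "equal" defers to the less-significant sign:

// One tree-reduction step (sketch): the more-significant sign wins unless
// it is "equal", in which case the less-significant sign decides.
int reduce_sign_pair(int hi, int lo) {
  int packed = (hi << 2) | lo; // pack_blocks with factor 4
  int msb = (packed >> 2) & 3;
  int lsb = packed & 3;
  return (msb == 1) ? lsb : msb; // 1 encodes "equal"
}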
template <typename Torus>
@@ -488,19 +489,21 @@ __host__ void host_integer_radix_difference_check_kb(
if (mem_ptr->is_signed) {
packed_num_radix_blocks -= 2;
}
pack_blocks(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], packed_right, lwe_array_right,
big_lwe_dimension, packed_num_radix_blocks, message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_left, lwe_array_left,
big_lwe_dimension, packed_num_radix_blocks,
message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], packed_right,
lwe_array_right, big_lwe_dimension,
packed_num_radix_blocks, message_modulus);
// From this point we have half the number of blocks
packed_num_radix_blocks /= 2;
// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
packed_num_radix_blocks, identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_right, packed_right, bsks, ksks,
packed_num_radix_blocks, identity_lut);
@@ -517,16 +520,17 @@ __host__ void host_integer_radix_difference_check_kb(
if (!mem_ptr->is_signed) {
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks, packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons, lhs,
rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
packed_num_radix_blocks);
// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -535,21 +539,21 @@ __host__ void host_integer_radix_difference_check_kb(
Torus *last_right_block_before_sign_block =
diff_buffer->tmp_packed_right +
packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
1, identity_lut);
compare_radix_blocks_kb(
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + packed_num_radix_blocks * big_lwe_size,
last_left_block_before_sign_block, last_right_block_before_sign_block,
mem_ptr, bsks, ksks, 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + (packed_num_radix_blocks + 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -558,11 +562,11 @@ __host__ void host_integer_radix_difference_check_kb(
num_comparisons = packed_num_radix_blocks + 2;
} else {
compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, num_radix_blocks - 1);
compare_radix_blocks_kb<Torus>(
streams, gpu_indexes, gpu_count, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks, num_radix_blocks - 1);
// Compare the sign block separately
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
comparisons + (num_radix_blocks - 1) * big_lwe_size,
lwe_array_left + (num_radix_blocks - 1) * big_lwe_size,
@@ -575,9 +579,9 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to a single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, num_comparisons);
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, num_comparisons);
}
template <typename Torus>
@@ -601,16 +605,16 @@ host_integer_radix_maxmin_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
Torus **ksks, uint32_t total_num_radix_blocks) {
// Compute the sign
host_integer_radix_difference_check_kb(
host_integer_radix_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_left, lwe_array_right, mem_ptr, mem_ptr->identity_lut_f, bsks,
ksks, total_num_radix_blocks);
// Selector
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
}
#endif

View File

@@ -4,7 +4,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
@@ -14,35 +14,37 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus);
scratch_cuda_compress_integer_radix_ciphertext_64(
scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, storage_log_modulus,
allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
bool allocate_gpu_memory) {
// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);
scratch_cuda_integer_decompress_radix_ciphertext_64(
scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -57,13 +59,13 @@ void cuda_integer_compress_radix_ciphertext_64(
}
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
void *lwe_array_out, void *glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
indexes_array, indexes_array_size, bsks,
(int_decompression<uint64_t> *)mem_ptr);
}

View File

@@ -12,58 +12,83 @@
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t in_len, uint32_t len) {
uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
auto nbits = sizeof(Torus) * 8;
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
auto glwe_index = tid / out_len;
auto i = tid % out_len;
auto chunk_array_in = array_in + glwe_index * in_len;
auto chunk_array_out = array_out + glwe_index * out_len;
if (tid < num_coeffs) {
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < len) {
auto k = nbits * i / log_modulus;
auto j = k;
auto start_shift = i * nbits - j * log_modulus;
auto value = array_in[j] >> start_shift;
auto value = chunk_array_in[j] >> start_shift;
j++;
while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
auto shift = j * log_modulus - i * nbits;
value |= array_in[j] << shift;
value |= chunk_array_in[j] << shift;
j++;
}
array_out[i] = value;
chunk_array_out[i] = value;
}
}
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_inputs,
uint32_t body_count, int_compression<Torus> *mem_ptr) {
Torus *array_out, Torus *array_in, uint32_t num_glwes,
uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
if (array_in == array_out)
PANIC("Cuda error: Input and output must be different");
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
// All GLWEs except the last: indices in [0, num_glwes - 1)
auto in_len = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
// Last GLWE
auto last_body_count = num_lwes % compression_params.polynomial_size;
in_len =
compression_params.glwe_dimension * compression_params.polynomial_size +
last_body_count;
number_bits_to_pack = in_len * log_modulus;
auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus, in_len,
len);
cuda_memset_async(array_out, 0,
num_glwes * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size * sizeof(Torus),
stream, gpu_index);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
num_coeffs, in_len, out_len);
check_cuda_error(cudaGetLastError());
}
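For reference, the bit-packing performed by the pack kernel can be written as a straightforward CPU loop; this sketch mirrors the index arithmetic above (each input word contributes its low log_modulus bits to a dense bit stream):

#include <cstdint>
#include <vector>

// CPU reference for the `pack` kernel (a sketch): concatenate the low
// `log_modulus` bits of every input word into ceil(n * log_modulus / 64)
// output words.
std::vector<uint64_t> pack_bits(const std::vector<uint64_t> &in,
                                uint32_t log_modulus) {
  const uint32_t nbits = 64;
  size_t out_len = (in.size() * log_modulus + nbits - 1) / nbits;
  std::vector<uint64_t> out(out_len, 0);
  for (size_t i = 0; i < out_len; ++i) {
    size_t j = i * nbits / log_modulus; // first input feeding output word i
    uint32_t start_shift = i * nbits - j * log_modulus;
    uint64_t value = in[j] >> start_shift;
    for (++j; j * log_modulus < (i + 1) * nbits && j < in.size(); ++j)
      value |= in[j] << (j * log_modulus - i * nbits);
    out[i] = value;
  }
  return out;
}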
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in,
Torus **fp_ksk, uint32_t num_lwes,
Torus **fp_ksk, uint32_t num_radix_blocks,
int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
@@ -71,52 +96,63 @@ __host__ void host_integer_compress(cudaStream_t *streams,
// Shift
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
lwe_array_in,
(uint64_t)compression_params.message_modulus,
input_lwe_dimension, num_lwes);
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
(uint64_t)compression_params.message_modulus, input_lwe_dimension,
num_radix_blocks);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
uint32_t num_glwes_for_compression =
num_radix_blocks / mem_ptr->lwe_per_glwe + 1;
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
cuda_memset_async(tmp_glwe_array_out, 0,
num_glwes_for_compression *
(compression_params.glwe_dimension + 1) *
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
auto rem_lwes = num_radix_blocks;
host_packing_keyswitch_lwe_list_to_glwe(
auto lwe_subset = lwe_shifted;
auto glwe_out = tmp_glwe_array_out;
while (rem_lwes > 0) {
auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, chunk_size);
rem_lwes -= chunk_size;
lwe_subset += chunk_size * lwe_in_size;
glwe_out += glwe_out_size;
}
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
body_count),
mem_ptr->storage_log_modulus);
check_cuda_error(cudaGetLastError());
host_modulus_switch_inplace<Torus>(
streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes_for_compression * (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size,
mem_ptr->storage_log_modulus);
host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out,
num_glwes, body_count, mem_ptr);
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
tmp_glwe_array_out, num_glwes_for_compression,
num_radix_blocks, mem_ptr);
}
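As a back-of-the-envelope check of what this buys, the packed form stores each coefficient on storage_log_modulus bits instead of 64. With illustrative parameters (assumed here, not taken from the diff):

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed, illustrative parameters.
  uint64_t glwe_dimension = 1, polynomial_size = 1024, storage_log_modulus = 19;
  uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size; // 2048 coeffs
  uint64_t packed_words = (glwe_size * storage_log_modulus + 63) / 64;
  std::printf("packed: %llu words, unpacked: %llu words\n",
              (unsigned long long)packed_words, (unsigned long long)glwe_size);
  return 0; // packed: 608 words, unpacked: 2048 words
}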
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
uint32_t log_modulus, uint32_t initial_out_len) {
uint32_t log_modulus, uint32_t input_len,
uint32_t initial_out_len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
auto chunk_array_in = array_in + index * input_len;
if (i < initial_out_len) {
// Unpack
Torus mask = ((Torus)1 << log_modulus) - 1;
@@ -130,11 +166,12 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
Torus unpacked_i;
if (start_block == end_block_inclusive) {
auto single_part = array_in[start_block] >> start_remainder;
auto single_part = chunk_array_in[start_block] >> start_remainder;
unpacked_i = single_part & mask;
} else {
auto first_part = array_in[start_block] >> start_remainder;
auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
auto first_part = chunk_array_in[start_block] >> start_remainder;
auto second_part = chunk_array_in[start_block + 1]
<< (nbits - start_remainder);
unpacked_i = (first_part | second_part) & mask;
}
@@ -144,95 +181,210 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
}
}
/// Extracts the glwe_index-th GLWE ciphertext
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
if (array_in == glwe_array_out)
PANIC("Cuda error: Input and output must be different");
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
uint32_t body_count = mem_ptr->body_count;
uint32_t body_count =
std::min(mem_ptr->body_count, compression_params.polynomial_size);
auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
compression_params.glwe_dimension * compression_params.polynomial_size +
body_count;
auto compressed_glwe_accumulator_size =
(compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto number_bits_to_unpack = compressed_glwe_accumulator_size * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_unpack.div_ceil(Scalar::BITS)
auto input_len = (number_bits_to_unpack + nbits - 1) / nbits;
// We ensure the tail of the GLWE is zeroed
auto zeroed_slice =
glwe_array_out + params.glwe_dimension * params.polynomial_size;
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
auto zeroed_slice = glwe_array_out + initial_out_len;
cuda_memset_async(zeroed_slice, 0,
(compression_params.polynomial_size - body_count) *
sizeof(Torus),
stream, gpu_index);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
extract<<<grid, threads, 0, stream>>>(glwe_array_out, array_in, glwe_index,
log_modulus, initial_out_len);
extract<Torus><<<grid, threads, 0, stream>>>(glwe_array_out, array_in,
glwe_index, log_modulus,
input_len, initial_out_len);
check_cuda_error(cudaGetLastError());
}
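The unpacking inside extract is the inverse of the packing above: read the i-th log_modulus-bit field from the word stream, possibly straddling a word boundary. A CPU sketch of that field read:

#include <cstdint>
#include <vector>

// Read the i-th `log_modulus`-bit field from a packed word stream (sketch).
// When the field ends in the word it starts in, one shift suffices;
// otherwise the tail comes from the next word. start_rem == 0 implies the
// single-word case, so the (64 - start_rem) shift below is always < 64.
uint64_t unpack_coeff(const std::vector<uint64_t> &packed, size_t i,
                      uint32_t log_modulus) {
  const uint32_t nbits = 64;
  uint64_t mask = ((uint64_t)1 << log_modulus) - 1;
  size_t start = i * log_modulus;
  size_t end_inclusive = start + log_modulus - 1;
  size_t start_block = start / nbits, end_block = end_inclusive / nbits;
  uint32_t start_rem = start % nbits;
  uint64_t value = packed[start_block] >> start_rem;
  if (end_block != start_block)
    value |= packed[start_block + 1] << (nbits - start_rem);
  return value & mask;
}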
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out,
Torus *packed_glwe_in, uint32_t *indexes_array,
uint32_t indexes_array_size, void **bsks,
int_decompression<Torus> *mem_ptr) {
uint32_t gpu_count, Torus *d_lwe_array_out,
Torus *d_packed_glwe_in, uint32_t *h_indexes_array,
uint32_t indexes_array_size, void **d_bsks,
int_decompression<Torus> *h_mem_ptr) {
auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
auto compression_params = mem_ptr->compression_params;
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, h_indexes_array,
indexes_array_size * sizeof(uint32_t), streams[0],
gpu_indexes[0]);
auto num_lwes = mem_ptr->body_count;
auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
if (indexes_array_size > lwe_per_glwe)
PANIC("Cuda error: too many LWEs to decompress. The number of LWEs should "
"be smaller than "
"polynomial_size.")
// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, indexes_array, indexes_array_size,
compression_params.glwe_dimension,
compression_params.polynomial_size);
auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
if (num_radix_blocks != indexes_array_size)
PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
"should be the same as indexes_array_size.")
// for each pair, the first element is the last position in h_indexes_array
// that falls in the associated GLWE (the second element)
std::vector<std::pair<int, Torus *>> glwe_vec;
// Extract all GLWEs
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, current_glwe_index, h_mem_ptr);
glwe_vec.push_back(std::make_pair(0, extracted_glwe));
for (int i = 1; i < indexes_array_size; i++) {
auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
if (glwe_index != current_glwe_index) {
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index, h_mem_ptr);
glwe_vec.push_back(std::make_pair(i, extracted_glwe));
} else {
// Updates the index
glwe_vec.back().first++;
}
}
// Sample extract all LWEs
Torus lwe_accumulator_size = compression_params.small_lwe_dimension + 1;
auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
uint32_t current_idx = 0;
auto d_indexes_array_chunk = d_indexes_array;
for (const auto &max_idx_and_glwe : glwe_vec) {
uint32_t last_idx = max_idx_and_glwe.first;
extracted_glwe = max_idx_and_glwe.second;
auto num_lwes = last_idx + 1 - current_idx;
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
extracted_glwe, d_indexes_array_chunk, num_lwes,
compression_params.glwe_dimension,
compression_params.polynomial_size);
d_indexes_array_chunk += num_lwes;
extracted_lwe += lwe_accumulator_size;
current_idx = last_idx;
}
// Reset
extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
false);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU; then, when we
/// gather data to GPU 0, we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
false);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
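The glwe_vec bookkeeping above groups consecutive requested indexes by the GLWE they live in, so each GLWE is extracted once and then sample-extracted for its whole run of indexes. A cleartext sketch of that grouping, with the GLWE index standing in for the device pointer:

#include <cstdint>
#include <utility>
#include <vector>

// Group consecutive positions of `indexes` by indexes[i] / lwe_per_glwe
// (sketch). Each pair holds the last covered position and the GLWE index;
// the device code stores a pointer to the extracted GLWE instead.
std::vector<std::pair<int, uint32_t>>
group_by_glwe(const std::vector<uint32_t> &indexes, uint32_t lwe_per_glwe) {
  std::vector<std::pair<int, uint32_t>> groups;
  for (int i = 0; i < (int)indexes.size(); ++i) {
    uint32_t glwe_index = indexes[i] / lwe_per_glwe;
    if (groups.empty() || groups.back().second != glwe_index)
      groups.push_back({i, glwe_index});
    else
      groups.back().first = i; // extend the current run
  }
  return groups;
}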
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
int_radix_params compression_params, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_lwes,
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
uint32_t body_count, int_radix_params encryption_params,
int_radix_params compression_params, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_lwes, storage_log_modulus, allocate_gpu_memory);
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory);
}
#endif

View File

@@ -282,7 +282,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Shift the mask so that we only keep the bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
@@ -310,7 +310,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
@@ -334,7 +334,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
interesting_remainder1.insert(0, numerator_block_1.first_block(),
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data, 1,
mem_ptr->shift_mem_1, bsks, ksks, interesting_remainder1.len);
@@ -342,7 +342,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
interesting_remainder1.len - 1, streams[0],
gpu_indexes[0]);
host_radix_blocks_rotate_left(
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder1.data,
tmp_radix.data, 1, interesting_remainder1.len, big_lwe_size);
@@ -363,7 +363,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
auto left_shift_interesting_remainder2 =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, interesting_remainder2.data, 1,
mem_ptr->shift_mem_2, bsks, ksks, interesting_remainder2.len);
}; // left_shift_interesting_remainder2
@@ -396,10 +396,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// but in that position, interesting_remainder2 always has a 0
auto &merged_interesting_remainder = interesting_remainder1;
host_addition(streams[0], gpu_indexes[0], merged_interesting_remainder.data,
merged_interesting_remainder.data,
interesting_remainder2.data, radix_params.big_lwe_dimension,
merged_interesting_remainder.len);
host_addition<Torus>(
streams[0], gpu_indexes[0], merged_interesting_remainder.data,
merged_interesting_remainder.data, interesting_remainder2.data,
radix_params.big_lwe_dimension, merged_interesting_remainder.len);
// after create_clean_version_of_merged_remainder
// `merged_interesting_remainder` will be reused as
@@ -439,7 +439,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// We could call unchecked_scalar_ne
// But we are in the special case where scalar == 0
// So we can skip some stuff
host_compare_with_zero_equality(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, tmp_1.data, trivial_blocks.data,
mem_ptr->comparison_buffer, bsks, ksks, trivial_blocks.len,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);
@@ -447,7 +447,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
tmp_1.len =
ceil_div(trivial_blocks.len, message_modulus * carry_modulus - 1);
is_at_least_one_comparisons_block_true(
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count,
at_least_one_upper_block_is_non_zero.data, tmp_1.data,
mem_ptr->comparison_buffer, bsks, ksks, tmp_1.len);
@@ -460,7 +460,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// `cleaned_merged_interesting_remainder` - radix ciphertext
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -486,10 +486,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
host_addition(streams[0], gpu_indexes[0], overflow_sum.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], overflow_sum.data,
subtraction_overflowed.data,
at_least_one_upper_block_is_non_zero.data,
radix_params.big_lwe_dimension, 1);
int factor = (i) ? 3 : 2;
int factor_lut_id = factor - 2;
@@ -528,10 +528,10 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
host_addition(streams[0], gpu_indexes[0],
&quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size],
did_not_overflow.data, radix_params.big_lwe_dimension, 1);
host_addition<Torus>(
streams[0], gpu_indexes[0], &quotient[block_of_bit * big_lwe_size],
&quotient[block_of_bit * big_lwe_size], did_not_overflow.data,
radix_params.big_lwe_dimension, 1);
};
for (uint j = 0; j < gpu_count; j++) {
@@ -564,17 +564,17 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
// Clean the quotient and remainder
// as even though they have no carries, they are not at nominal noise level
host_addition(streams[0], gpu_indexes[0], remainder, remainder1.data,
remainder2.data, radix_params.big_lwe_dimension,
remainder1.len);
host_addition<Torus>(streams[0], gpu_indexes[0], remainder, remainder1.data,
remainder2.data, radix_params.big_lwe_dimension,
remainder1.len);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {

View File

@@ -53,7 +53,7 @@ void scratch_cuda_propagate_single_carry_kb_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);
scratch_cuda_propagate_single_carry_kb_inplace(
scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sc_prop_memory<uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
@@ -131,6 +131,19 @@ void cleanup_cuda_apply_univariate_lut_kb_64(void **streams,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cuda_apply_many_univariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t lut_count, uint32_t lut_stride) {
host_apply_many_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks, num_blocks,
lut_count, lut_stride);
}
void scratch_cuda_apply_bivariate_lut_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -195,15 +208,15 @@ void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift) {
void *output_radix_lwe, void *generates_or_propagates, int8_t *mem_ptr,
void **ksks, void **bsks, uint32_t num_blocks, uint32_t shift) {
int_radix_params params = ((int_radix_lut<uint64_t> *)mem_ptr)->params;
host_compute_prefix_sum_hillis_steele<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<uint64_t *>(input_radix_lwe), params,
static_cast<uint64_t *>(generates_or_propagates), params,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
num_blocks);
}

View File

@@ -78,7 +78,7 @@ host_radix_blocks_rotate_right(cudaStream_t *streams, uint32_t *gpu_indexes,
"pointers should be different");
}
cudaSetDevice(gpu_indexes[0]);
radix_blocks_rotate_right<<<blocks_count, 1024, 0, streams[0]>>>(
radix_blocks_rotate_right<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
dst, src, value, blocks_count, lwe_size);
}
@@ -95,7 +95,7 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
"pointers should be different");
}
cudaSetDevice(gpu_indexes[0]);
radix_blocks_rotate_left<<<blocks_count, 1024, 0, streams[0]>>>(
radix_blocks_rotate_left<Torus><<<blocks_count, 1024, 0, streams[0]>>>(
dst, src, value, blocks_count, lwe_size);
}
@@ -124,8 +124,8 @@ host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t lwe_size) {
cudaSetDevice(gpu_indexes[0]);
int num_blocks = blocks_count / 2, num_threads = 1024;
radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
src, blocks_count, lwe_size);
radix_blocks_reverse_lwe_inplace<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(src, blocks_count, lwe_size);
}
// polynomial_size threads
@@ -164,9 +164,10 @@ __host__ void pack_bivariate_blocks(cudaStream_t *streams,
int num_blocks = 0, num_threads = 0;
int num_entries = num_radix_blocks * (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
device_pack_bivariate_blocks<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2, lwe_indexes_in,
lwe_dimension, shift, num_radix_blocks);
device_pack_bivariate_blocks<Torus>
<<<num_blocks, num_threads, 0, streams[0]>>>(
lwe_array_out, lwe_indexes_out, lwe_array_1, lwe_array_2,
lwe_indexes_in, lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
}
@@ -188,6 +189,94 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
if (active_gpu_count == 1) {
execute_keyswitch_async<Torus>(streams, gpu_indexes, 1, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], lwe_array_in,
lut->lwe_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, ks_base_log, ks_level,
num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
true);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
/// With multiple GPUs we push to the vectors on each GPU; then, when we
/// gather data to GPU 0, we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
big_lwe_dimension + 1);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
ks_base_log, ks_level, num_radix_blocks);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride, true);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}
template <typename Torus>
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, void **bsks, Torus **ksks,
uint32_t num_radix_blocks, int_radix_lut<Torus> *lut, uint32_t lut_count,
uint32_t lut_stride) {
// apply_lookup_table
auto params = lut->params;
auto pbs_type = params.pbs_type;
auto big_lwe_dimension = params.big_lwe_dimension;
auto small_lwe_dimension = params.small_lwe_dimension;
auto ks_level = params.ks_level;
auto ks_base_log = params.ks_base_log;
auto pbs_level = params.pbs_level;
auto pbs_base_log = params.pbs_base_log;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
@@ -210,7 +299,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
true);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -236,7 +326,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride, true);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -271,12 +362,16 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
auto polynomial_size = params.polynomial_size;
auto grouping_factor = params.grouping_factor;
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
// Left message is shifted
auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, lwe_array_pbs_in,
lut->lwe_trivial_indexes, lwe_array_1, lwe_array_2,
lut->lwe_indexes_in, big_lwe_dimension, shift,
num_radix_blocks);
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_pbs_in, lut->lwe_trivial_indexes,
lwe_array_1, lwe_array_2, lut->lwe_indexes_in,
big_lwe_dimension, shift, num_radix_blocks);
check_cuda_error(cudaGetLastError());
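The packing step feeding the bivariate LUT is just a scaled addition in the clear: the left block is moved up by shift so both messages fit in one LUT input. A cleartext sketch:

// Cleartext sketch of pack_bivariate_blocks' per-block operation:
// with shift = message_modulus, the pair (left, right) becomes a single
// value that a univariate LUT can decode as (x / shift, x % shift).
uint64_t pack_bivariate(uint64_t left, uint64_t right, uint64_t shift) {
  return left * shift + right;
}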
/// For multi GPU execution we create vectors of pointers for inputs and
@@ -301,7 +396,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type);
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
true);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
@@ -323,7 +419,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, grouping_factor, num_radix_blocks, pbs_type);
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
lut_stride, true);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -380,7 +477,7 @@ void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
body[i] = -body[i];
}
rotate_left(body, half_box_size, polynomial_size);
rotate_left<Torus>(body, half_box_size, polynomial_size);
}
template <typename Torus>
@@ -442,7 +539,6 @@ void generate_device_accumulator_bivariate(
message_modulus, carry_modulus, f);
// copy host lut and lut_indexes_vec to device
cuda_synchronize_stream(stream, gpu_index);
cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
@@ -508,7 +604,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -590,13 +685,13 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
ksks, num_blocks, luts_array);
// compute prefix sum with hillis&steele
host_compute_prefix_sum_hillis_steele(
host_compute_prefix_sum_hillis_steele<Torus>(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
step_output, generates_or_propagates, 1,
num_blocks, big_lwe_size);
if (carry_out != nullptr) {
cuda_memcpy_async_gpu_to_gpu(carry_out, step_output, big_lwe_size_bytes,
streams[0], gpu_indexes[0]);
@@ -610,8 +705,9 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
gpu_indexes[0]);
}
host_addition(streams[0], gpu_indexes[0], lwe_array, lwe_array, step_output,
glwe_dimension * polynomial_size, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size,
num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -664,14 +760,15 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
step_output, generates_or_propagates, 1,
num_blocks, big_lwe_size);
cuda_memset_async(step_output, 0, big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
host_subtraction(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size, num_blocks);
host_subtraction<Torus>(streams[0], gpu_indexes[0], lwe_array, lwe_array,
step_output, glwe_dimension * polynomial_size,
num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
@@ -697,6 +794,9 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
int small_lwe_size = (params.small_lwe_dimension + 1);
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];
@@ -719,7 +819,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_base_log, params.pbs_level,
params.grouping_factor, 2, params.pbs_type);
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride,
true);
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
big_lwe_size * sizeof(Torus), streams[0],
@@ -727,10 +828,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
if (i < num_blocks - 1) {
auto next_input_block = &input_blocks[(i + 1) * big_lwe_size];
host_addition(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], next_input_block,
next_input_block,
&mem_ptr->tmp_big_lwe_vector[big_lwe_size],
params.big_lwe_dimension, 1);
}
}
}
@@ -794,7 +895,7 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
device_pack_blocks<<<num_blocks, num_threads, 0, stream>>>(
device_pack_blocks<Torus><<<num_blocks, num_threads, 0, stream>>>(
lwe_array_out, lwe_array_in, lwe_dimension, num_radix_blocks, factor);
}
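
The packing performed by device_pack_blocks can be checked on plaintexts: two radix blocks whose values stay below the factor are merged into one value, which halves the number of blocks the subsequent LUTs must process. A plaintext sketch, where the low/high ordering is an assumption for illustration (the kernel defines the authoritative layout):

#include <cstdint>
#include <cstdio>

// Pack pairs of radix digits: out[i] = in[2*i] + factor * in[2*i + 1].
void pack_blocks_plain(uint64_t *out, const uint64_t *in, int num_blocks,
                       uint64_t factor) {
  for (int i = 0; i < num_blocks / 2; i++)
    out[i] = in[2 * i] + factor * in[2 * i + 1];
}

int main() {
  uint64_t digits[4] = {1, 3, 2, 0}; // radix-4 digits, LSB first
  uint64_t packed[2];
  pack_blocks_plain(packed, digits, 4, 4);
  printf("%lu %lu\n", packed[0], packed[1]); // 13 and 2
  return 0;
}
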
@@ -840,7 +941,7 @@ create_trivial_radix(cudaStream_t stream, uint32_t gpu_index,
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_create_trivial_radix<<<grid, thds, 0, stream>>>(
device_create_trivial_radix<Torus><<<grid, thds, 0, stream>>>(
lwe_array_out, scalar_array, num_scalar_blocks, lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
@@ -857,7 +958,7 @@ __host__ void extract_n_bits(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t num_radix_blocks, uint32_t bits_per_block,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
num_radix_blocks * bits_per_block, bit_extract->lut);
}
@@ -870,7 +971,6 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
std::function<Torus(Torus)> sign_handler_f, void **bsks,
Torus **ksks, uint32_t num_sign_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto diff_buffer = mem_ptr->diff_buffer;
auto params = mem_ptr->params;
@@ -904,9 +1004,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
while (num_sign_blocks > 2) {
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
integer_radix_apply_univariate_lookup_table_kb(
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, num_sign_blocks, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
num_sign_blocks / 2, lut);
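
The loop halves the number of sign blocks per pass: pack_blocks merges two signs in {0, 1, 2} as 4 * msb_sign + lsb_sign and the LUT folds the pair into one sign. On plaintexts the folding rule is simply that the more significant sign wins unless it reads equal; a sketch, with the IS_* constants assumed from the surrounding naming:

#include <cstdint>
#include <cstdio>

enum Sign : uint64_t { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 };

// Fold a packed pair of signs (4 * msb_sign + lsb_sign) into one sign.
uint64_t fold_signs(uint64_t packed) {
  uint64_t lsb = packed % 4, msb = packed / 4;
  return msb == IS_EQUAL ? lsb : msb;
}

int main() {
  // msb says equal -> the lsb comparison decides: overall inferior
  printf("%lu\n", fold_signs(4 * IS_EQUAL + IS_INFERIOR)); // 0
  // msb says superior -> overall superior, lsb irrelevant
  printf("%lu\n", fold_signs(4 * IS_SUPERIOR + IS_INFERIOR)); // 2
  return 0;
}
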
@@ -937,11 +1037,11 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
final_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a, big_lwe_dimension,
2, 4);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
signs_b, bsks, ksks, 1, lut);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
big_lwe_dimension, 2, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
1, lut);
} else {
@@ -957,9 +1057,9 @@ reduce_signs(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
final_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_univariate_lookup_table_kb(streams, gpu_indexes,
gpu_count, signs_array_out,
signs_a, bsks, ksks, 1, lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
1, lut);
}
}
@@ -992,6 +1092,18 @@ void host_apply_univariate_lut_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
num_blocks, mem);
}
template <typename Torus>
void host_apply_many_univariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *radix_lwe_out, Torus *radix_lwe_in, int_radix_lut<Torus> *mem,
Torus **ksks, void **bsks, uint32_t num_blocks, uint32_t lut_count,
uint32_t lut_stride) {
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
num_blocks, mem, lut_count, lut_stride);
}
template <typename Torus>
void scratch_cuda_apply_bivariate_lut_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,

View File

@@ -241,7 +241,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 1024:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -249,7 +250,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 2048:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -257,7 +259,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 4096:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -265,7 +268,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 8192:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -273,7 +277,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
case 16384:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
@@ -281,7 +286,8 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec,
nullptr);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "

View File

@@ -186,9 +186,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
Torus *radix_lwe_out, Torus *terms, int *terms_degree, void **bsks,
uint64_t **ksks, int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
uint32_t num_blocks_in_radix, uint32_t num_radix_in_vec,
int_radix_lut<Torus> *reused_lut = nullptr) {
int_radix_lut<Torus> *reused_lut) {
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
@@ -205,12 +206,31 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
// In the case of extracting a single LWE these parameters are dummy
uint32_t lut_count = 1;
uint32_t lut_stride = 0;
if (num_radix_in_vec == 0)
return;
if (num_radix_in_vec == 1) {
cuda_memcpy_async_gpu_to_gpu(radix_lwe_out, terms,
num_blocks_in_radix * big_lwe_size *
sizeof(Torus),
streams[0], gpu_indexes[0]);
return;
}
if (old_blocks != terms) {
cuda_memcpy_async_gpu_to_gpu(old_blocks, terms,
num_blocks_in_radix * num_radix_in_vec *
big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
}
if (num_radix_in_vec == 2) {
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size],
big_lwe_dimension, num_blocks);
return;
}
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
@@ -271,7 +291,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
@@ -288,7 +307,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms_degree, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_blocks, chunk_size, message_max,
total_count, message_count, carry_count, sm_copy_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
@@ -303,8 +321,11 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
smart_copy<<<sm_copy_count, 1024, 0, streams[0]>>>(
new_blocks, new_blocks, d_smart_copy_out, d_smart_copy_in,
cuda_memcpy_async_gpu_to_gpu(new_blocks_copy, new_blocks,
r * num_blocks * big_lwe_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
new_blocks, new_blocks_copy, d_smart_copy_out, d_smart_copy_in,
big_lwe_size);
check_cuda_error(cudaGetLastError());
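
The change above is an aliasing fix: smart_copy applies an arbitrary index mapping, and reading and writing the same buffer in one kernel launch races whenever an output slot is also some other thread's input. Staging the source into new_blocks_copy first makes every read hit an immutable snapshot. A CPU-side sketch of the hazard and the fix (the indices are made up for illustration):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Permuted copy: dst[out_idx[i]] = src[in_idx[i]]. If dst == src and the
// index sets overlap, the result depends on iteration order (a race on GPU).
void smart_copy_plain(uint64_t *dst, const uint64_t *src, const int *out_idx,
                      const int *in_idx, int n) {
  for (int i = 0; i < n; i++)
    if (in_idx[i] >= 0) // mirrors the "-1 means skip" convention
      dst[out_idx[i]] = src[in_idx[i]];
}

int main() {
  uint64_t data[3] = {10, 20, 30}, snapshot[3];
  int out_idx[2] = {0, 1}, in_idx[2] = {1, 0}; // swap slots 0 and 1
  memcpy(snapshot, data, sizeof(data));        // stage, like new_blocks_copy
  smart_copy_plain(data, snapshot, out_idx, in_idx, 2);
  // Without the snapshot, slot 1 would read the already-overwritten slot 0.
  printf("%lu %lu %lu\n", data[0], data[1], data[2]); // 20 10 30
  return 0;
}
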
@@ -347,7 +368,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -395,7 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
glwe_dimension, small_lwe_dimension, polynomial_size,
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
mem_ptr->params.grouping_factor, total_count,
mem_ptr->params.pbs_type);
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
@@ -422,9 +443,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
host_addition(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size], big_lwe_dimension,
num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks[num_blocks * big_lwe_size],
big_lwe_dimension, num_blocks);
}
template <typename Torus, class params>

View File

@@ -1,14 +1,16 @@
#include "integer/negation.cuh"
void cuda_negate_integer_radix_ciphertext_64_inplace(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, void *lwe_array,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
void cuda_negate_integer_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
uint32_t lwe_ciphertext_count, uint32_t message_modulus,
uint32_t carry_modulus) {
host_integer_radix_negation(
host_integer_radix_negation<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(lwe_array),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), lwe_dimension,
lwe_ciphertext_count, message_modulus, carry_modulus);
}
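
Since the entry point is no longer in-place, callers now pass distinct output and input device pointers. A hedged call-site sketch against the signature shown above (negate_radix_example and the buffer assumptions are illustrative; the extern "C" linkage is assumed from the backend's usual C API):

#include <cstdint>

// Prototype copied from the diff above; d_out / d_in are assumed to be
// device buffers holding num_blocks radix blocks of size lwe_dimension + 1.
extern "C" void cuda_negate_integer_radix_ciphertext_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *lwe_array_out, void *lwe_array_in, uint32_t lwe_dimension,
    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
    uint32_t carry_modulus);

void negate_radix_example(void **streams, uint32_t *gpu_indexes,
                          uint32_t gpu_count, void *d_out, void *d_in,
                          uint32_t lwe_dimension, uint32_t num_blocks) {
  cuda_negate_integer_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, d_out, d_in, lwe_dimension,
      /*lwe_ciphertext_count=*/num_blocks,
      /*message_modulus=*/4, /*carry_modulus=*/4);
}
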
void scratch_cuda_integer_radix_overflowing_sub_kb_64(

View File

@@ -25,14 +25,13 @@ template <typename Torus>
__global__ void
device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t lwe_dimension, uint64_t message_modulus,
uint64_t carry_modulus, uint64_t delta) {
uint64_t delta) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < lwe_dimension + 1) {
bool is_body = (tid == lwe_dimension);
// z = ceil( degree / 2^p ) * 2^p
uint64_t z = (2 * message_modulus - 1) / message_modulus;
__syncthreads();
z *= message_modulus;
// (0,Delta*z) - ct
@@ -47,12 +46,9 @@ device_integer_radix_negation(Torus *output, Torus *input, int32_t num_blocks,
uint64_t encoded_zb = zb * delta;
__syncthreads();
// (0,Delta*z) - ct
output[tid] =
(is_body ? z * delta - (input[tid] + encoded_zb) : -input[tid]);
__syncthreads();
}
}
}
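
The correcting term z computed above keeps the negated body positive: z is a multiple of message_modulus that dominates the block's degree, so z - m is non-negative while still congruent to -m modulo the message space. A plaintext check of that identity:

#include <cstdint>
#include <cstdio>

// Plaintext analogue of the kernel's trick. With degree = msg_mod - 1 this
// yields z = msg_mod, matching z = ((2 * msg_mod - 1) / msg_mod) * msg_mod
// in the kernel above.
uint64_t negate_block_plain(uint64_t m, uint64_t msg_mod) {
  uint64_t z = ((2 * msg_mod - 1) / msg_mod) * msg_mod; // == msg_mod here
  return z - m; // non-negative, and (z - m) % msg_mod == (-m) % msg_mod
}

int main() {
  for (uint64_t m = 0; m < 4; m++)
    printf("-%lu mod 4 -> %lu\n", m, negate_block_plain(m, 4) % 4);
  return 0;
}
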
@@ -75,16 +71,15 @@ host_integer_radix_negation(cudaStream_t *streams, uint32_t *gpu_indexes,
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
uint64_t shared_mem = input_lwe_ciphertext_count * sizeof(uint32_t);
// Value of the shift we multiply our messages by
// If message_modulus and carry_modulus are always powers of 2 we can simplify
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_negation<<<grid, thds, shared_mem, streams[0]>>>(
device_integer_radix_negation<<<grid, thds, 0, streams[0]>>>(
output, input, input_lwe_ciphertext_count, lwe_dimension, message_modulus,
carry_modulus, delta);
delta);
check_cuda_error(cudaGetLastError());
}
@@ -107,7 +102,7 @@ __host__ void host_integer_overflowing_sub_kb(
auto radix_params = mem_ptr->params;
host_unchecked_sub_with_correcting_term(
host_unchecked_sub_with_correcting_term<Torus>(
streams[0], gpu_indexes[0], radix_lwe_out, radix_lwe_left,
radix_lwe_right, radix_params.big_lwe_dimension, num_blocks,
radix_params.message_modulus, radix_params.carry_modulus,

View File

@@ -5,7 +5,7 @@ void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
void *scalar_input, uint32_t lwe_dimension, uint32_t lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus) {
host_integer_radix_scalar_addition_inplace(
host_integer_radix_scalar_addition_inplace<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array), static_cast<uint64_t *>(scalar_input),
lwe_dimension, lwe_ciphertext_count, message_modulus, carry_modulus);

View File

@@ -18,10 +18,8 @@ __global__ void device_integer_radix_scalar_addition_inplace(
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < num_blocks) {
Torus scalar = scalar_input[tid];
Torus *body = lwe_array + tid * (lwe_dimension + 1) + lwe_dimension;
*body += scalar * delta;
lwe_array[tid * (lwe_dimension + 1) + lwe_dimension] +=
scalar_input[tid] * delta;
}
}
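
As the kernel shows, adding a cleartext digit never touches the mask: the update is a single in-place addition of scalar * delta to the body coefficient at position lwe_dimension. A plaintext-level sketch on a trivial ciphertext:

#include <cstdint>
#include <cstdio>

// One LWE block is (a_0 .. a_{n-1}, b); adding a cleartext digit only shifts
// the body b by digit * delta, leaving the mask untouched.
void add_scalar_plain(uint64_t *lwe, uint32_t lwe_dimension, uint64_t digit,
                      uint64_t delta) {
  lwe[lwe_dimension] += digit * delta;
}

int main() {
  uint64_t delta = ((uint64_t)1 << 63) / (4 * 4); // msg and carry modulus 4
  uint64_t trivial[3] = {0, 0, 2 * delta};        // trivial encryption of 2
  add_scalar_plain(trivial, 2, 1, delta);
  printf("decoded: %lu\n", trivial[2] / delta); // 3
  return 0;
}
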
@@ -45,9 +43,10 @@ __host__ void host_integer_radix_scalar_addition_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_addition_inplace<<<grid, thds, 0, streams[0]>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
device_integer_radix_scalar_addition_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
@@ -83,8 +82,9 @@ __host__ void host_integer_radix_add_scalar_one_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_add_scalar_one_inplace<<<grid, thds, 0, streams[0]>>>(
lwe_array, input_lwe_ciphertext_count, lwe_dimension, delta);
device_integer_radix_add_scalar_one_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, input_lwe_ciphertext_count,
lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
}
@@ -122,10 +122,10 @@ __host__ void host_integer_radix_scalar_subtraction_inplace(
// this
uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
device_integer_radix_scalar_subtraction_inplace<<<grid, thds, 0,
streams[0]>>>(
lwe_array, scalar_input, input_lwe_ciphertext_count, lwe_dimension,
delta);
device_integer_radix_scalar_subtraction_inplace<Torus>
<<<grid, thds, 0, streams[0]>>>(lwe_array, scalar_input,
input_lwe_ciphertext_count, lwe_dimension,
delta);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -3,6 +3,58 @@
#include "integer/comparison.cuh"
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
// be set to 1
// meaning that the output of the pbs will be the negative (modulo message
// space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace<Torus>(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, num_radix_blocks, sign_lut);
// Add one
// Here lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
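
The padding-bit behaviour described in the comment can be reproduced with plain 64-bit wrap-around arithmetic, since the LWE body lives modulo 2^64. A sketch of the comment's worked example (lhs = 1, rhs = 3, message and carry modulus 4):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t msg_mod = 4, carry_mod = 4;
  uint64_t delta = ((uint64_t)1 << 63) / (msg_mod * carry_mod); // 2^59
  uint64_t diff = 1 * delta - 3 * delta; // lhs - rhs, wraps modulo 2^64
  uint64_t top5 = diff >> 59;            // padding_bit|b4b3b2b1
  printf("padding bit: %lu, message: %lu\n", top5 >> 4, top5 & 15); // 1, 14
  // With the padding bit set, the is-non-zero LUT outputs the negative of
  // the expected 1: (-1) % 16 = 15. Adding one gives 0, i.e. "lhs < rhs".
  uint64_t lut_out = (msg_mod * carry_mod) - 1; // -1 modulo the message space
  printf("after add one: %lu\n", (lut_out + 1) % (msg_mod * carry_mod)); // 0
  return 0;
}
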
template <typename Torus>
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -45,10 +97,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
if (total_num_scalar_blocks == 0) {
// We only have to compare blocks with zero,
// which means the scalar is zero
host_compare_with_zero_equality(streams, gpu_indexes, gpu_count,
mem_ptr->tmp_lwe_array_out, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks,
mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
lwe_array_in, mem_ptr, bsks, ksks, total_num_radix_blocks,
mem_ptr->is_zero_lut);
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
@@ -91,10 +143,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -106,22 +159,22 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -145,7 +198,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
scalar_bivariate_last_leaf_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
lwe_array_msb_out, bsks, ksks, 1, lut, lut->params.message_modulus);
@@ -159,10 +212,11 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -173,16 +227,17 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_lwe_array_out;
scalar_compare_radix_blocks_kb(streams, gpu_indexes, gpu_count, comparisons,
lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks, num_lsb_radix_blocks);
tree_sign_reduction<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
sign_handler_f, bsks, ksks,
num_lsb_radix_blocks);
}
}
@@ -229,7 +284,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// We only have to compare blocks with zero,
// which means the scalar is zero
Torus *are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
host_compare_with_zero_equality(
host_compare_with_zero_equality<Torus>(
streams, gpu_indexes, gpu_count, are_all_msb_zeros, lwe_array_in,
mem_ptr, bsks, ksks, total_num_radix_blocks, mem_ptr->is_zero_lut);
Torus *sign_block =
@@ -277,7 +332,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
scalar_bivariate_last_leaf_lut_f);
lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
sign_block, bsks, ksks, 1, lut, lut->params.message_modulus);
@@ -304,10 +359,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
total_num_scalar_blocks, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -319,24 +375,24 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 2 if lhs > rhs
auto comparisons = mem_ptr->tmp_block_comparisons;
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks, ksks,
num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
comparisons, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
comparisons, mem_ptr->diff_buffer->tree_buffer,
mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
tree_sign_reduction<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks, ksks,
num_lsb_radix_blocks);
//////////////
// msb
// We remove the last block (which is the sign)
Torus *are_all_msb_zeros = lwe_array_msb_out;
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, mem_ptr->is_zero_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
auto sign_bit_pos = (int)log2(message_modulus) - 1;
@@ -371,7 +427,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
signed_msb_lut->params.message_modulus);
@@ -382,8 +438,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
//////////////
// Reduce the two blocks into one final
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks, 2);
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_lsb_out, mem_ptr, sign_handler_f, bsks, ksks,
2);
} else {
// We only have to do the regular comparison
@@ -403,10 +460,11 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
Torus *lhs = diff_buffer->tmp_packed_left;
Torus *rhs = diff_buffer->tmp_packed_right;
pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
big_lwe_dimension, num_lsb_radix_blocks - 1,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
num_lsb_radix_blocks - 1, message_modulus);
// From this point we have half number of blocks
num_lsb_radix_blocks /= 2;
@@ -415,19 +473,19 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
ksks, num_lsb_radix_blocks);
scalar_compare_radix_blocks_kb<Torus>(lsb_streams, gpu_indexes, gpu_count,
lwe_array_ct_out, lhs, rhs, mem_ptr,
bsks, ksks, num_lsb_radix_blocks);
Torus *encrypted_sign_block =
lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
scalar_sign_block, big_lwe_dimension, 1, 1,
message_modulus, carry_modulus);
create_trivial_radix<Torus>(
msb_streams[0], gpu_indexes[0], trivial_sign_block, scalar_sign_block,
big_lwe_dimension, 1, 1, message_modulus, carry_modulus);
integer_radix_apply_bivariate_lookup_table_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
@@ -439,9 +497,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
// Reduces a vec containing radix blocks that encrypt a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
reduce_signs(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
num_lsb_radix_blocks + 1);
reduce_signs<Torus>(streams, gpu_indexes, gpu_count, lwe_array_out,
lwe_array_ct_out, mem_ptr, sign_handler_f, bsks, ksks,
num_lsb_radix_blocks + 1);
}
}
@@ -452,14 +510,13 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) {
cudaSetDevice(gpu_indexes[0]);
auto params = mem_ptr->params;
// Calculates the difference sign between the ciphertext and the scalar
// - 0 if lhs < rhs
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
integer_radix_signed_scalar_difference_check_kb(
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
@@ -469,17 +526,17 @@ __host__ void integer_radix_signed_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
sign, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, sign, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, total_num_radix_blocks);
}
template <typename Torus>
@@ -492,12 +549,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_difference_check_kb(
integer_radix_signed_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
total_num_radix_blocks, total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_difference_check_kb(
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, sign_handler_f, bsks, ksks,
total_num_radix_blocks, total_num_scalar_blocks);
@@ -513,70 +570,16 @@ __host__ void host_integer_radix_signed_scalar_maxmin_kb(
if (mem_ptr->is_signed) {
// is signed and scalar is positive
integer_radix_signed_scalar_maxmin_kb(
integer_radix_signed_scalar_maxmin_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
} else {
integer_radix_unsigned_scalar_maxmin_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in,
scalar_blocks, mem_ptr, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
PANIC("Cuda error: only signed scalar maxmin can be called in signed "
"scalar comparison")
}
}
template <typename Torus>
__host__ void scalar_compare_radix_blocks_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *lwe_array_out, Torus *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// When rhs > lhs, the subtraction will overflow, and the bit of padding will
// be set to 1
// meaning that the output of the pbs will be the negative (modulo message
// space)
//
// Example:
// lhs: 1, rhs: 3, message modulus: 4, carry modulus 4
// lhs - rhs = -2 % (4 * 4) = 14 = 1|1110 (padding_bit|b4b3b2b1)
// Since there was an overflow the bit of padding is 1 and not 0.
// When applying the LUT for an input value of 14 we would expect 1,
// but since the bit of padding is 1, we will get -1 modulo our message
// space, so (-1) % (4 * 4) = 15 = 1|1111. We then add one and get 0 = 0|0000
auto subtracted_blocks = mem_ptr->tmp_block_comparisons;
cuda_memcpy_async_gpu_to_gpu(subtracted_blocks, lwe_array_in,
num_radix_blocks * (big_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Subtract
// Here we need the true lwe sub, not the one that comes from shortint.
host_integer_radix_scalar_subtraction_inplace(
streams, gpu_indexes, gpu_count, subtracted_blocks, scalar_blocks,
big_lwe_dimension, num_radix_blocks, message_modulus, carry_modulus);
// Apply LUT to compare to 0
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb(
streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
ksks, num_radix_blocks, sign_lut);
// Add one
// Here lhs can have the following values: (-1) % (message modulus * carry
// modulus), 0, 1. So the output values after the addition will be: 0, 1, 2
host_integer_radix_add_scalar_one_inplace(
streams, gpu_indexes, gpu_count, lwe_array_out, big_lwe_dimension,
num_radix_blocks, message_modulus, carry_modulus);
}
template <typename Torus>
__host__ void host_integer_radix_scalar_maxmin_kb(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
@@ -591,7 +594,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
// - 1 if lhs == rhs
// - 2 if lhs > rhs
auto sign = mem_ptr->tmp_lwe_array_out;
host_integer_radix_scalar_difference_check_kb(
host_integer_radix_scalar_difference_check_kb<Torus>(
streams, gpu_indexes, gpu_count, sign, lwe_array_in, scalar_blocks,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, total_num_radix_blocks,
total_num_scalar_blocks);
@@ -601,17 +604,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
auto lwe_array_left = lwe_array_in;
auto lwe_array_right = mem_ptr->tmp_block_comparisons;
create_trivial_radix(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
create_trivial_radix<Torus>(streams[0], gpu_indexes[0], lwe_array_right,
scalar_blocks, params.big_lwe_dimension,
total_num_radix_blocks, total_num_scalar_blocks,
params.message_modulus, params.carry_modulus);
// Selector
// CMUX for Max or Min
host_integer_radix_cmux_kb(streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks,
total_num_radix_blocks);
host_integer_radix_cmux_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
}
template <typename Torus>
@@ -659,10 +662,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
auto packed_scalar =
packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
num_scalar_blocks, message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
big_lwe_dimension, num_lsb_radix_blocks,
message_modulus);
pack_blocks<Torus>(lsb_streams[0], gpu_indexes[0], packed_scalar,
scalar_blocks, 0, num_scalar_blocks, message_modulus);
cuda_memcpy_async_gpu_to_gpu(
scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
@@ -670,7 +674,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
gpu_indexes[0]);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
}
@@ -689,9 +693,9 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
PANIC("Cuda error: integer operation not supported")
}
host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
num_msb_radix_blocks, msb_lut);
host_compare_with_zero_equality<Torus>(
msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb, mem_ptr,
bsks, ksks, num_msb_radix_blocks, msb_lut);
}
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
@@ -701,13 +705,13 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
are_all_comparisons_block_true(
are_all_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsks, ksks,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
break;
case COMPARISON_TYPE::NE:
is_at_least_one_comparisons_block_true(
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
mem_ptr, bsks, ksks,
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));

View File

@@ -65,7 +65,7 @@ __host__ void host_integer_scalar_mul_radix(
cuda_memcpy_async_gpu_to_gpu(ptr, lwe_array,
lwe_size_bytes * num_radix_blocks,
streams[0], gpu_indexes[0]);
host_integer_radix_logical_scalar_shift_kb_inplace(
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, gpu_indexes, gpu_count, ptr, shift_amount,
mem->logical_scalar_shift_buffer, bsks, ksks, num_radix_blocks);
} else {
@@ -82,15 +82,16 @@ __host__ void host_integer_scalar_mul_radix(
preshifted_buffer + (i % msg_bits) * num_radix_blocks * lwe_size;
T *block_shift_buffer =
all_shifted_buffer + j * num_radix_blocks * lwe_size;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
block_shift_buffer, preshifted_radix_ct,
i / msg_bits, num_radix_blocks, lwe_size);
host_radix_blocks_rotate_right<T>(
streams, gpu_indexes, gpu_count, block_shift_buffer,
preshifted_radix_ct, i / msg_bits, num_radix_blocks, lwe_size);
// create trivial assign for value = 0
cuda_memset_async(block_shift_buffer, 0, (i / msg_bits) * lwe_size_bytes,
streams[0], gpu_indexes[0]);
j++;
}
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_drop_async(preshifted_buffer, streams[0], gpu_indexes[0]);
mem->logical_scalar_shift_buffer->release(streams, gpu_indexes, gpu_count);
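
The preshifted and block-shift buffers built above implement a shift-and-add decomposition: one shifted copy of the ciphertext per set bit of the scalar (bit shifts within a block via the logical-shift path, whole-block rotations for multiples of msg_bits), after which the partial-sum routine adds all copies. The plaintext analogue:

#include <cstdint>
#include <cstdio>

// Plaintext analogue of the scalar multiplication: one shifted copy of the
// value per set bit of the scalar, then a sum over all copies (the role of
// host_integer_partial_sum_ciphertexts_vec_kb below).
uint64_t scalar_mul_plain(uint64_t value, uint64_t scalar) {
  uint64_t acc = 0;
  for (int i = 0; i < 64; i++)
    if ((scalar >> i) & 1)
      acc += value << i;
  return acc;
}

int main() {
  printf("%lu\n", scalar_mul_plain(7, 10)); // 70
  return 0;
}
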
@@ -108,7 +109,7 @@ __host__ void host_integer_scalar_mul_radix(
host_integer_partial_sum_ciphertexts_vec_kb<T, params>(
streams, gpu_indexes, gpu_count, lwe_array, all_shifted_buffer,
terms_degree, bsks, ksks, mem->sum_ciphertexts_vec_mem,
num_radix_blocks, j);
num_radix_blocks, j, nullptr);
auto scp_mem_ptr = mem->sum_ciphertexts_vec_mem->scp_mem;
host_propagate_single_carry<T>(streams, gpu_indexes, gpu_count, lwe_array,

View File

@@ -56,9 +56,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
// one block is responsible to process single lwe ciphertext
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -70,9 +70,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1,
num_blocks, big_lwe_size);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
} else {
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
@@ -97,8 +97,9 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
auto receiver_blocks = lwe_array;
auto giver_blocks = rotated_buffer;
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
lwe_array, 1, num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
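
The structure above mirrors how a bit rotation decomposes over radix blocks: whole blocks move by rotations positions, and the remaining shift_within_block is fixed up by the bivariate LUT combining each receiver block with its giver neighbour. A digit-level plaintext sketch of the whole-block part, assuming radix-4 digits stored LSB first:

#include <cstdint>
#include <cstdio>

// Rotate a radix-B number, stored as digits LSB first, left by k digit
// positions (the whole-block part of the rotation; within-digit bits would
// then be exchanged between neighbouring digits, the bivariate LUT's job).
void rotate_digits_left(uint64_t *out, const uint64_t *in, int n, int k) {
  for (int i = 0; i < n; i++)
    out[(i + k) % n] = in[i];
}

int main() {
  uint64_t digits[4] = {1, 2, 3, 0}; // value 0*64 + 3*16 + 2*4 + 1 = 57
  uint64_t rotated[4];
  rotate_digits_left(rotated, digits, 4, 1);
  for (int i = 0; i < 4; i++)
    printf("%lu ", rotated[i]); // 0 1 2 3
  printf("\n");
  return 0;
}
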

View File

@@ -53,9 +53,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
if (mem->shift_type == LEFT_SHIFT) {
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
// create trivial assign for value = 0
cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes,
@@ -83,9 +83,9 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
} else {
// right shift
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
// rotate left as the blocks are from LSB to MSB
// create trivial assign for value = 0
@@ -156,9 +156,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
Torus *last_block_copy = &padding_block[big_lwe_size];
if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
num_blocks, big_lwe_size);
cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer,
num_blocks * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
}
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
}
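
For the arithmetic variant, the blocks pulled in from the left must replicate the sign instead of zeros, which is why a padding block is derived from a copy of the last block and the last partial block gets its own LUT. The plaintext rule being reproduced:

#include <cstdint>
#include <cstdio>

// An arithmetic right shift fills the vacated high bits with copies of the
// sign bit instead of zeros. (>> on a negative signed value is arithmetic on
// mainstream compilers, and guaranteed to be since C++20.)
int64_t arithmetic_shift_right(int64_t v, int s) { return v >> s; }

int main() {
  printf("%ld\n", arithmetic_shift_right(-8, 2));     // -2, sign preserved
  printf("%ld\n", (int64_t)((uint64_t)-8 >> 2));      // logical: huge positive
  return 0;
}
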

View File

@@ -88,9 +88,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
switch (mem->shift_type) {
case LEFT_SHIFT:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed && mem->shift_type == RIGHT_SHIFT)
for (int i = 0; i < rotations; i++)
@@ -103,9 +103,9 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case RIGHT_SHIFT:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
if (mem->is_signed)
for (int i = 0; i < rotations; i++)
@@ -119,15 +119,15 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
break;
case LEFT_ROTATE:
// rotate right as the blocks are from LSB to MSB
host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_right<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
case RIGHT_ROTATE:
// rotate left as the blocks are from LSB to MSB
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_input, input_bits_b, rotations,
total_nb_bits, big_lwe_size);
host_radix_blocks_rotate_left<Torus>(
streams, gpu_indexes, gpu_count, rotated_input, input_bits_b,
rotations, total_nb_bits, big_lwe_size);
break;
default:
PANIC("Unknown operation")
@@ -135,22 +135,21 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// host_pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?
pack_bivariate_blocks(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input, input_bits_a,
mux_lut->lwe_indexes_in, big_lwe_dimension, 2,
total_nb_bits);
pack_bivariate_blocks<Torus>(streams, gpu_indexes, gpu_count, mux_inputs,
mux_lut->lwe_indexes_out, rotated_input,
input_bits_a, mux_lut->lwe_indexes_in,
big_lwe_dimension, 2, total_nb_bits);
// The shift bit is already properly aligned/positioned
for (int i = 0; i < total_nb_bits; i++)
host_addition(streams[0], gpu_indexes[0], mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0],
mux_inputs + i * big_lwe_size,
mux_inputs + i * big_lwe_size, shift_bit,
mem->params.big_lwe_dimension, 1);
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks, ksks,
total_nb_bits, mux_lut);
}
@@ -179,8 +178,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto bit_to_add = input_bits_a + i * big_lwe_size;
for (int j = 0; j < num_radix_blocks; j++) {
host_addition(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], block, block, bit_to_add,
big_lwe_dimension, 1);
block += big_lwe_size;
bit_to_add += bits_per_block * big_lwe_size;
@@ -188,7 +187,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
num_radix_blocks, cleaning_lut);
}
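
The memset, pack_bivariate_blocks and host_addition calls above assemble, for each bit, a 3-bit LUT input control_bit|b|a so that one univariate LUT acts as a CMUX. Its truth table is an ordinary select; a sketch with the packing order taken from the comment:

#include <cstdint>
#include <cstdio>

// LUT input is packed as control_bit|b|a (control in the high bit).
uint64_t mux_lut(uint64_t packed) {
  uint64_t a = packed & 1, b = (packed >> 1) & 1, c = (packed >> 2) & 1;
  return c ? b : a; // select b when the control bit is set, else a
}

int main() {
  for (uint64_t packed = 0; packed < 8; packed++)
    printf("c|b|a = %lu%lu%lu -> %lu\n", (packed >> 2) & 1, (packed >> 1) & 1,
           packed & 1, mux_lut(packed));
  return 0;
}
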

View File

@@ -11,11 +11,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
@@ -51,11 +51,11 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
@@ -66,11 +66,12 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
@@ -105,9 +106,10 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -40,10 +40,10 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cuda_memcpy_async_gpu_to_gpu(output, lwe_input,
(lwe_dimension + 1) * lwe_ciphertext_count,
stream, gpu_index);
plaintext_addition<<<grid, thds, 0, stream>>>(
cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
stream, gpu_index);
plaintext_addition<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
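
The fix above is easy to miss: cuda_memcpy_async_gpu_to_gpu takes a size in bytes, and the old call passed an element count, so only 1/sizeof(T) of the data was copied. A small guard pattern against this class of bug (byte_size is an illustrative helper, not from the codebase):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Compute device-copy sizes from element counts in one place so a missing
// sizeof(T) cannot slip into individual call sites.
template <typename T> size_t byte_size(size_t num_elements) {
  return num_elements * sizeof(T);
}

int main() {
  // e.g. (lwe_dimension + 1) * lwe_ciphertext_count elements of uint64_t
  printf("%zu\n", byte_size<uint64_t>(513 * 4)); // 16416 bytes
  return 0;
}
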
@@ -78,7 +78,7 @@ __host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
addition<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
addition<T><<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -112,7 +112,8 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
subtraction<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
subtraction<T>
<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -150,7 +151,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
(input_lwe_dimension + 1) * sizeof(T),
stream, gpu_index);
radix_body_subtraction_inplace<<<grid, thds, 0, stream>>>(
radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -176,7 +177,6 @@ __global__ void unchecked_sub_with_correcting_term(
}
}
template <typename T>
__host__ void host_unchecked_sub_with_correcting_term(
cudaStream_t stream, uint32_t gpu_index, T *output, T *input_1, T *input_2,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -193,7 +193,7 @@ __host__ void host_unchecked_sub_with_correcting_term(
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
unchecked_sub_with_correcting_term<<<grid, thds, 0, stream>>>(
unchecked_sub_with_correcting_term<T><<<grid, thds, 0, stream>>>(
output, input_1, input_2, num_entries, lwe_size, message_modulus,
carry_modulus, degree);
check_cuda_error(cudaGetLastError());
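
For context on the correcting term threaded through the kernel above: in TFHE-style radix arithmetic a subtraction is kept non-negative by first adding a multiple of the message modulus, scaled into the torus. The sketch below reconstructs that idea from the parameter list (degree, message_modulus, carry_modulus); the exact formula lives in the kernel body, which this diff does not show, so treat it as an assumption:

#include <cstdint>

uint64_t corrected_sub(uint64_t in1, uint64_t in2, uint64_t degree,
                       uint64_t message_modulus, uint64_t carry_modulus) {
  // Delta places the message in the top bits of the 64-bit torus; the
  // remaining top bit is the padding bit.
  uint64_t delta = (1ULL << 63) / (message_modulus * carry_modulus);
  // Smallest multiple of the message modulus >= degree.
  uint64_t z = ((degree + message_modulus - 1) / message_modulus) *
               message_modulus;
  return in1 - in2 + z * delta; // wrapping arithmetic on uint64_t
}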

View File

@@ -9,7 +9,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication(
host_cleartext_vec_multiplication<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
@@ -49,7 +49,7 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication(
host_cleartext_vec_multiplication<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),

View File

@@ -46,7 +46,7 @@ host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_vec_multiplication<<<grid, thds, 0, stream>>>(
cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
@@ -82,7 +82,7 @@ host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<<<grid, thds, 0, stream>>>(
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
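
The multiplication kernels instantiated above scale every coefficient of a ciphertext — mask and body alike — by the corresponding cleartext, with wrapping torus arithmetic. A minimal stand-in (illustrative names, not the repository kernel):

#include <cstdint>

template <typename T>
__global__ void scale_lwe(T *out, const T *in, const T *cleartexts,
                          uint32_t lwe_size, uint32_t num_entries) {
  uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_entries)
    // One cleartext per ciphertext; idx / lwe_size selects it.
    out[idx] = in[idx] * cleartexts[idx / lwe_size];
}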

View File

@@ -10,10 +10,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
@@ -44,8 +44,8 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}

View File

@@ -37,7 +37,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
negation<<<grid, thds, 0, stream>>>(output, input, num_entries);
negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}

View File

@@ -95,6 +95,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
constexpr double two_pow_torus_bits = get_two_pow_torus_bits<T>();
// compress the real bsk to complex and divide it by 2^(torus bit width)
for (int i = 0; i < total_polynomials; i++) {
int complex_current_poly_idx = i * polynomial_size / 2;
@@ -103,10 +104,8 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
h_bsk[complex_current_poly_idx + j].y =
src[torus_current_poly_idx + j + polynomial_size / 2];
h_bsk[complex_current_poly_idx + j].x /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].y /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].x /= two_pow_torus_bits;
h_bsk[complex_current_poly_idx + j].y /= two_pow_torus_bits;
}
}
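
The replacement above normalizes by 2^(bit width of T) instead of std::numeric_limits<T>::max() = 2^bits - 1, which maps torus integers onto [0, 1) exactly rather than with a small multiplicative bias. The helper's body is not shown in the diff; a plausible reconstruction, flagged as an assumption:

#include <cstdint>
#include <limits>

template <typename T> constexpr double get_two_pow_torus_bits_sketch() {
  // max + 1 == 2^bits; computed in double to avoid integer overflow.
  return static_cast<double>(std::numeric_limits<T>::max()) + 1.0;
}

static_assert(get_two_pow_torus_bits_sketch<uint32_t>() == 4294967296.0,
              "2^32");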
@@ -116,12 +115,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -134,12 +127,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -152,12 +139,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -170,12 +151,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -188,12 +163,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -206,12 +175,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
@@ -224,12 +187,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
buffer);
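
The deletions above strip the per-call cudaFuncSetAttribute / cudaFuncSetCacheConfig boilerplate in front of each batch_NSMFFT launch. The attribute is still needed somewhere whenever a kernel requests more than the default 48 KB of dynamic shared memory, but it only has to be set once per kernel, not before every launch; presumably this commit centralizes that opt-in (an assumption — the new location is outside this diff). The mechanism, in isolation:

#include <cuda_runtime.h>

__global__ void fft_like_kernel(double2 *data) { /* placeholder body */ }

void enable_large_dynamic_shared_memory(int shared_memory_bytes) {
  // One-time, per-kernel opt-in; subsequent launches may pass up to this
  // many bytes as the third launch-configuration parameter.
  cudaFuncSetAttribute(fft_like_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_memory_bytes);
}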
@@ -270,14 +227,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -291,14 +240,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -312,14 +253,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -333,14 +266,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -354,14 +279,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -375,14 +292,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
output, buffer);
@@ -396,14 +305,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,

View File

@@ -127,7 +127,8 @@ void execute_pbs_async(
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
// 32 bits
@@ -159,7 +160,8 @@ void execute_pbs_async(
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu);
polynomial_size, base_log, level_count, num_inputs_on_gpu,
lut_count, lut_stride, do_modulus_switch);
}
break;
default:
@@ -198,7 +200,7 @@ void execute_pbs_async(
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu);
num_inputs_on_gpu, lut_count, lut_stride);
}
break;
case CLASSICAL:
@@ -226,7 +228,8 @@ void execute_pbs_async(
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu);
polynomial_size, base_log, level_count, num_inputs_on_gpu,
lut_count, lut_stride, do_modulus_switch);
}
break;
default:
@@ -268,9 +271,8 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
scratch_cuda_multi_bit_programmable_bootstrap_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, grouping_factor,
input_lwe_ciphertext_count, allocate_gpu_memory);
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
break;
case CLASSICAL:
scratch_cuda_programmable_bootstrap_64(

View File

@@ -1,15 +1,5 @@
#include "programmable_bootstrap_amortized.cuh"
/*
* Returns the buffer size for 64-bit executions
*/
uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count) {
return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32 bits inputs, into `buffer`. It also

View File

@@ -258,28 +258,6 @@ __host__ void scratch_programmable_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm);
cudaFuncSetCacheConfig(
device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
check_cuda_error(cudaFuncSetCacheConfig(
device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_programmable_bootstrap_amortized<Torus>(

View File

@@ -44,7 +44,8 @@ __global__ void device_programmable_bootstrap_cg(
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
uint64_t device_memory_size_per_block, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch) {
grid_group grid = this_grid();
@@ -93,8 +94,11 @@ __global__ void device_programmable_bootstrap_cg(
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
if (do_modulus_switch)
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
else
b_hat = block_lwe_array_in[lwe_dimension];
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
@@ -106,7 +110,10 @@ __global__ void device_programmable_bootstrap_cg(
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
if (do_modulus_switch)
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
else
a_hat = block_lwe_array_in[i];
// Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
@@ -151,8 +158,38 @@ __global__ void device_programmable_bootstrap_cg(
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, glwe_dimension,
i * lut_stride);
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
}
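
The new lut_count loop extracts several LUT results from the single rotated accumulator: the outputs for LUT i start i * num_samples ciphertexts into lwe_array_out (num_samples == gridDim.z in the kernel), and the extra rotation i * lut_stride selects the i-th function's coefficients. The output addressing, restated on the host side with illustrative names:

#include <cstddef>
#include <cstdint>

uint64_t *nth_lut_output(uint64_t *lwe_array_out, uint32_t i,
                         uint32_t num_samples, uint32_t output_index,
                         uint32_t glwe_dimension, uint32_t polynomial_size) {
  // An output LWE ciphertext has glwe_dimension * polynomial_size mask
  // words plus one body word.
  size_t lwe_size = (size_t)glwe_dimension * polynomial_size + 1;
  // Skip i full batches of num_samples ciphertexts, then index the sample.
  return lwe_array_out + i * num_samples * lwe_size +
         output_index * lwe_size;
}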
@@ -163,30 +200,6 @@ __host__ void scratch_programmable_bootstrap_cg(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm =
get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
@@ -202,7 +215,8 @@ __host__ void host_programmable_bootstrap_cg(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t lut_count, uint32_t lut_stride, bool do_modulus_switch) {
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
@@ -226,7 +240,7 @@ __host__ void host_programmable_bootstrap_cg(
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[14];
void *kernel_args[17];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
@@ -240,6 +254,9 @@ __host__ void host_programmable_bootstrap_cg(
kernel_args[10] = &base_log;
kernel_args[11] = &level_count;
kernel_args[12] = &d_mem;
kernel_args[14] = &lut_count;
kernel_args[15] = &lut_stride;
kernel_args[16] = &do_modulus_switch;
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
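
These kernels are launched through a void*[] argument table (cooperative launch), so every parameter added to the device signature needs both a larger table and a matching entry: the array above grows from 14 to 17 slots to carry lut_count, lut_stride and do_modulus_switch, since writing past the declared size is undefined behavior. The mechanism, reduced to a toy kernel:

#include <cuda_runtime.h>

__global__ void coop_kernel(int *data, int n, int extra) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] += extra;
}

void launch_coop(int *d_data, int n, int extra, dim3 grid, dim3 block,
                 size_t shared_bytes, cudaStream_t stream) {
  // One slot per kernel parameter, sized for the largest index used.
  void *kernel_args[3];
  kernel_args[0] = &d_data;
  kernel_args[1] = &n;
  kernel_args[2] = &extra;
  cudaLaunchCooperativeKernel((void *)coop_kernel, grid, block, kernel_args,
                              shared_bytes, stream);
}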

View File

@@ -30,7 +30,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t lut_count, uint32_t lut_stride) {
grid_group grid = this_grid();
@@ -129,9 +130,44 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
// Always extract the first LUT output by default
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, glwe_dimension,
i * lut_stride);
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Load the accumulator calculated in previous iterations
@@ -177,69 +213,6 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (max_shared_memory < partial_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
@@ -256,8 +229,9 @@ __host__ void execute_cg_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, int lwe_offset) {
uint32_t lwe_offset, uint32_t lut_count, uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint64_t full_dm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
@@ -275,13 +249,15 @@ __host__ void execute_cg_external_product_loop(
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto buffer_fft = buffer->global_accumulator_fft;
void *kernel_args[20];
void *kernel_args[22];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
@@ -301,6 +277,8 @@ __host__ void execute_cg_external_product_loop(
kernel_args[16] = &chunk_size;
kernel_args[17] = &keybundle_size_per_input;
kernel_args[18] = &d_mem;
kernel_args[20] = &lut_count;
kernel_args[21] = &lut_stride;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -333,10 +311,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lut_count, uint32_t lut_stride) {
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, num_samples, polynomial_size);
auto lwe_chunk_size = buffer->lwe_chunk_size;
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
@@ -345,14 +323,15 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
execute_compute_keybundle<Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
grouping_factor, base_log, level_count, lwe_offset, lut_count,
lut_stride);
}
}
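
The driver above now reads lwe_chunk_size from the buffer instead of recomputing it per call, but the loop shape is unchanged: the LWE mask is consumed grouping_factor elements at a time, in chunks of lwe_chunk_size groups, with a keybundle pass followed by an accumulation pass per chunk. A skeleton of that control flow, with stand-in bodies:

#include <algorithm>
#include <cstdint>

void multibit_pbs_outline(uint32_t lwe_dimension, uint32_t grouping_factor,
                          uint32_t lwe_chunk_size) {
  uint32_t num_groups = lwe_dimension / grouping_factor;
  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += lwe_chunk_size) {
    uint32_t chunk = std::min(lwe_chunk_size, num_groups - lwe_offset);
    // execute_compute_keybundle(... chunk, lwe_offset ...);       // stand-in
    // execute_cg_external_product_loop(... lwe_offset ...);       // stand-in
    (void)chunk;
  }
}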

View File

@@ -122,7 +122,8 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch) {
switch (polynomial_size) {
case 256:
@@ -130,49 +131,56 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -182,25 +190,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
}
#endif
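
Every wrapper in this file follows the same shape: a switch on the runtime polynomial_size selects a compile-time Degree<N> / AmortizedDegree<N> template parameter, so loop bounds inside the kernels are constants. Reduced to its skeleton (types defined locally for the sketch):

#include <cstdint>

template <int N> struct DegreeSketch { static constexpr int value = N; };

template <typename Torus, class params> void run_pbs_impl() {
  // A kernel launch specialized for params::value would go here.
}

template <typename Torus> void run_pbs(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    run_pbs_impl<Torus, DegreeSketch<256>>();
    break;
  case 512:
    run_pbs_impl<Torus, DegreeSketch<512>>();
    break;
  case 1024:
    run_pbs_impl<Torus, DegreeSketch<1024>>();
    break;
  default:
    break; // the real code PANICs on unsupported sizes
  }
}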
/*
* Returns the buffer size for 64-bit executions
*/
uint64_t get_buffer_size_programmable_bootstrap_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count))
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
else
return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count);
}
template <typename Torus>
void scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -389,7 +378,8 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch) {
switch (polynomial_size) {
case 256:
@@ -397,49 +387,56 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 512:
host_programmable_bootstrap_cg<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 1024:
host_programmable_bootstrap_cg<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 2048:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 4096:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 8192:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 16384:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -455,7 +452,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch) {
switch (polynomial_size) {
case 256:
@@ -463,49 +461,56 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 512:
host_programmable_bootstrap<Torus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 1024:
host_programmable_bootstrap<Torus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 2048:
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 4096:
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 8192:
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case 16384:
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
lwe_dimension, polynomial_size, base_log, level_count, num_samples);
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -522,7 +527,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
bool do_modulus_switch) {
if (base_log > 32)
PANIC("Cuda error (classical PBS): base log should be > number of bits "
@@ -542,7 +548,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -556,7 +563,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -567,7 +575,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -641,7 +650,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
bool do_modulus_switch) {
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be > number of bits "
"in the ciphertext representation (64)");
@@ -660,7 +670,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -674,7 +685,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -685,7 +697,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
lut_count, lut_stride, do_modulus_switch);
break;
default:
PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -713,7 +726,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -722,7 +736,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
void *stream, uint32_t gpu_index,
@@ -742,7 +757,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -751,7 +767,8 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
void *stream, uint32_t gpu_index,
@@ -779,7 +796,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -787,7 +805,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
uint32_t lut_stride, bool do_modulus_switch);
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,

View File

@@ -27,7 +27,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool do_modulus_switch) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -75,8 +76,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
if (do_modulus_switch)
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
else
b_hat = block_lwe_array_in[lwe_dimension];
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
@@ -94,8 +98,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Put "a" in [0, 2N[
Torus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
if (do_modulus_switch)
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
else
a_hat = block_lwe_array_in[lwe_iteration];
synchronize_threads_in_block();
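
For reference on what do_modulus_switch now gates: the switch rounds a full-width torus word down to log2(2N) bits so it can serve as a negacyclic rotation index in [0, 2N). When the flag is false, the caller is assumed to supply values already in that range. A self-contained sketch consistent with the usual rounding definition (the device helper's exact body is not shown here):

#include <cstdint>

template <typename Torus>
void modulus_switch_sketch(Torus input, Torus &output, uint32_t log_modulus) {
  // Keep the top log_modulus bits of the word, rounding to nearest.
  uint32_t shift = sizeof(Torus) * 8 - log_modulus;
  output = (input + (Torus(1) << (shift - 1))) >> shift;
}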
@@ -141,7 +148,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t lut_count, uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -216,8 +224,38 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, glwe_dimension,
i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Persist the updated accumulator
@@ -278,55 +316,6 @@ __host__ void scratch_programmable_bootstrap(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
int max_shared_memory = cuda_get_max_shared_memory(0);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
*buffer = new pbs_buffer<Torus, CLASSICAL>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
@@ -342,7 +331,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
uint64_t full_sm, uint64_t full_dm, bool do_modulus_switch) {
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -355,36 +344,35 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, full_dm);
+ level_count, d_mem, full_dm, do_modulus_switch);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, partial_dm);
+ level_count, d_mem, partial_dm, do_modulus_switch);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, 0);
+ level_count, d_mem, 0, do_modulus_switch);
}
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
- __host__ void
- execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
- Torus *lwe_output_indexes, Torus *lut_vector,
- Torus *lut_vector_indexes, double2 *bootstrapping_key,
- Torus *global_accumulator, double2 *global_accumulator_fft,
- uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
- uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t base_log, uint32_t level_count, int8_t *d_mem,
- int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
- uint64_t full_sm, uint64_t full_dm) {
+ __host__ void execute_step_two(
+ cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+ Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
+ double2 *bootstrapping_key, Torus *global_accumulator,
+ double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
+ uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+ uint32_t base_log, uint32_t level_count, int8_t *d_mem, int lwe_iteration,
+ uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm,
+ uint64_t full_dm, uint32_t lut_count, uint32_t lut_stride) {
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -397,21 +385,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, full_dm);
+ level_count, d_mem, full_dm, lut_count, lut_stride);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, partial_dm);
+ level_count, d_mem, partial_dm, lut_count, lut_stride);
} else {
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
- level_count, d_mem, 0);
+ level_count, d_mem, 0, lut_count, lut_stride);
}
check_cuda_error(cudaGetLastError());
}
@@ -425,7 +413,8 @@ __host__ void host_programmable_bootstrap(
Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
- uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
+ uint32_t level_count, uint32_t input_lwe_ciphertext_count,
+ uint32_t lut_count, uint32_t lut_stride, bool do_modulus_switch) {
cudaSetDevice(gpu_index);
// With SM each block corresponds to either the mask or body, no need to
@@ -455,13 +444,15 @@ __host__ void host_programmable_bootstrap(
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
- partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
+ partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
+ do_modulus_switch);
execute_step_two<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
- partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
+ partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
+ lut_count, lut_stride);
}
}

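The many-LUT path above writes one sample-extracted LWE per LUT, each into its own slice of lwe_array_out. As a reading aid, here is a minimal host-side C++ sketch of the same pointer arithmetic; it is not part of the diff, and the helper name many_lut_output_offset is hypothetical (gridDim.z corresponds to the number of samples in the kernel):

    #include <cstdint>
    #include <cstdio>

    // Offset (in Torus elements) of the block written for LUT `lut_index`,
    // output slot `output_index` (i.e. lwe_output_indexes[blockIdx.z]) and
    // mask/body block `block_y` (blockIdx.y), mirroring the kernel above.
    uint64_t many_lut_output_offset(uint32_t lut_index, uint32_t num_samples,
                                    uint32_t glwe_dimension,
                                    uint32_t polynomial_size,
                                    uint32_t output_index, uint32_t block_y) {
      uint64_t lwe_size = uint64_t(glwe_dimension) * polynomial_size + 1;
      // Each extra LUT's batch of results lives num_samples * lwe_size further on.
      return uint64_t(lut_index) * num_samples * lwe_size +
             uint64_t(output_index) * lwe_size +
             uint64_t(block_y) * polynomial_size;
    }

    int main() {
      // Second LUT (index 1), batch of 4 samples, k = 1, N = 2048, first output,
      // mask block 0: the write lands one full batch (4 * 2049 elements) in.
      printf("%llu\n",
             (unsigned long long)many_lut_output_offset(1, 4, 1, 2048, 0, 0));
      return 0;
    }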

@@ -65,7 +65,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -78,7 +79,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -86,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -94,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -102,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -110,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -118,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -126,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -142,7 +143,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -155,7 +157,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -163,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -171,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -179,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -187,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -195,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -203,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -218,7 +220,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
- uint32_t level_count, uint32_t num_samples) {
+ uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
+ uint32_t lut_stride) {
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -235,7 +238,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -250,7 +253,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -262,7 +265,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -328,52 +331,51 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
- uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t level_count, uint32_t grouping_factor,
+ uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 512:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 1024:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 2048:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 4096:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 8192:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 16384:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -383,10 +385,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
}
void scratch_cuda_multi_bit_programmable_bootstrap_64(
- void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
- uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
- uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
- bool allocate_gpu_memory) {
+ void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t level_count,
+ uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -394,8 +395,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
level_count))
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
- lwe_dimension, glwe_dimension, polynomial_size, level_count,
- grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
+ glwe_dimension, polynomial_size, level_count,
+ input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
@@ -408,8 +409,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
else
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
- lwe_dimension, glwe_dimension, polynomial_size, level_count,
- grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
+ glwe_dimension, polynomial_size, level_count,
+ input_lwe_ciphertext_count, allocate_gpu_memory);
}
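Note that scratch_cuda_multi_bit_programmable_bootstrap_64 above no longer takes lwe_dimension or grouping_factor. A minimal call-site sketch against the new signature, assuming the backend headers declaring the entry points are in scope; parameter values are illustrative only:

    #include <cstdint>
    #include <cuda_runtime.h>

    // Allocates the multi-bit PBS scratch buffer for a batch using the updated
    // argument list; cleanup_cuda_multi_bit_programmable_bootstrap releases it.
    void setup_multi_bit_pbs_scratch(uint32_t num_samples) {
      cudaStream_t stream;
      cudaStreamCreate(&stream);
      int8_t *pbs_mem = nullptr;
      scratch_cuda_multi_bit_programmable_bootstrap_64(
          stream, /*gpu_index=*/0, &pbs_mem,
          /*glwe_dimension=*/1, /*polynomial_size=*/2048, /*level_count=*/1,
          /*input_lwe_ciphertext_count=*/num_samples,
          /*allocate_gpu_memory=*/true);
      // ... launch the bootstrap, then free pbs_mem and the stream.
      cudaStreamDestroy(stream);
    }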
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
@@ -440,6 +441,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
int max_blocks_per_sm;
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
@@ -486,10 +488,9 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index,
- pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
- uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
- uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
- bool allocate_gpu_memory);
+ pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
+ uint32_t polynomial_size, uint32_t level_count,
+ uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -499,7 +500,8 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride);
template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index,
@@ -515,7 +517,8 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -526,52 +529,51 @@ has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
template <typename Torus>
void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
- uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t level_count, uint32_t grouping_factor,
+ uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 512:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 1024:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 2048:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 4096:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 8192:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
case 16384:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
- static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
- glwe_dimension, polynomial_size, level_count,
- input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
+ static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
+ polynomial_size, level_count, input_lwe_ciphertext_count,
+ allocate_gpu_memory);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -586,7 +588,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -599,7 +602,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -607,7 +610,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -615,23 +618,37 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
- case 2048:
- host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
- static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
- lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
- lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
- lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ case 2048: {
+ int num_sms = 0;
+ check_cuda_error(cudaDeviceGetAttribute(
+ &num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
+ if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
+ host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
+ static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
+ lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
+ lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
+ lwe_dimension, polynomial_size, grouping_factor, base_log,
+ level_count, num_samples, lut_count, lut_stride);
+ else
+ host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
+ static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
+ lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
+ lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
+ lwe_dimension, polynomial_size, grouping_factor, base_log,
+ level_count, num_samples, lut_count, lut_stride);
break;
+ }
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -639,7 +656,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -647,7 +664,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
- num_samples);
+ num_samples, lut_count, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -658,8 +675,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
- uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
- uint32_t level_count, uint32_t grouping_factor,
+ uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
@@ -670,5 +686,6 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
- uint32_t base_log, uint32_t level_count, uint32_t num_samples);
+ uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+ uint32_t lut_count, uint32_t lut_stride);
#endif
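The new case-2048 branch in cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector selects the amortized kernel only when the launch spawns enough blocks, presumably to keep the SMs busy. A standalone sketch of that decision rule, with a stand-in helper name (the 4x factor and block count come straight from the diff above):

    #include <cstdint>
    #include <cstdio>
    #include <cuda_runtime.h>

    // True when num_samples * level_count * (glwe_dimension + 1) blocks exceed
    // four blocks per SM, i.e. when the amortized kernel is worth using.
    bool prefer_amortized_kernel(int gpu_index, uint32_t num_samples,
                                 uint32_t level_count, uint32_t glwe_dimension) {
      int num_sms = 0;
      cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, gpu_index);
      uint64_t num_blocks =
          uint64_t(num_samples) * level_count * (glwe_dimension + 1);
      return 4ull * uint64_t(num_sms) < num_blocks;
    }

    int main() {
      // On a 108-SM device (e.g. an A100), 128 samples with level_count = 1 and
      // glwe_dimension = 1 give 256 blocks < 432, so the plain Degree<2048>
      // kernel would be selected instead of the amortized one.
      printf("%d\n", prefer_amortized_kernel(0, 128, 1, 1));
      return 0;
    }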
