Compare commits

..

107 Commits

Author SHA1 Message Date
Agnes Leroy
d0fdaeea75 Fix Makefile 2026-02-13 09:49:21 +01:00
Agnes Leroy
077c063c20 Add cost for 8xL40 2026-02-13 09:01:10 +01:00
Agnes Leroy
565b903534 Add decomp_sns_comp bench to summary 2026-02-12 17:44:24 +01:00
Agnes Leroy
0aaae99b56 Add 8xL40 to slab config 2026-02-12 16:34:03 +01:00
Agnes Leroy
7bdd0adeff Fix Makefile 2026-02-12 15:05:00 +01:00
Guillermo Oyarzun
d8d155e484 feat(gpu): implement specialized pbs for any architecture 2026-02-12 11:46:14 +01:00
Agnes Leroy
3169ee8093 chore(gpu): bench classic or multi-bit params for compress/decompress 2026-02-12 11:45:44 +01:00
David Testé
8119c2287b WIP: run classic and multi_bit benches on gpu 2026-02-12 11:42:48 +01:00
David Testé
e4c7f83e17 WIP: use placeholder workflow to run benches 2026-02-10 16:09:26 +01:00
David Testé
349846bc11 WIP: add profile selection for GPU bench 2026-02-10 16:09:25 +01:00
David Testé
b1fb4b2ae0 WIP: adding missing DEX bench 2026-02-10 16:09:25 +01:00
David Testé
da3c55c50b WIP: implement bench workflow testing cpu and gpu 2026-02-10 16:09:22 +01:00
Agnes Leroy
7593093d8f chore(gpu): add classic params to zk and sns benches 2026-02-10 16:07:00 +01:00
David Testé
344ea55426 chore(bench): allow result parsing of kv store benchmarks
Naming has been reworked to stick with the future naming standard.
Call to write_to_json() has been added to be able to parse the
results and send them to database.
2026-02-10 16:02:27 +01:00
David Testé
53de52c9fd chore(bench): move kv store benchmarks to their own file 2026-02-10 16:02:27 +01:00
Arthur Meyre
14870536e4 chore: update webpack to 5.105.0 which is signed and has provenance 2026-02-10 14:38:22 +01:00
Arthur Meyre
35cc35bd96 chore: revert backward compatibility change
- this change should not have been needed and poses risks for backward
compatibility
- HL CUDA: use dedicated type for the ReRand
2026-02-10 14:25:25 +01:00
Guillermo Oyarzun
20403d6325 fix(gpu): avoid unspecified behavior during the reduction 2026-02-10 11:33:40 +01:00
Andrei Stoian
71f52b2853 fix(gpu): more refactor 2026-02-09 22:32:57 +01:00
Andrei Stoian
d83e57f29a fix(gpu): encapsulate lut generation 2026-02-09 22:32:57 +01:00
Agnes Leroy
9592445bd8 fix(gpu): revert changes in compression
This reverts commit 91a927e09e and
7fac0bf3b2.
2026-02-09 17:04:17 +01:00
David Testé
035285fcbe chore(ci): rollback slsa-github-generator action version pinning
According to the action documentation, pinning to a commit cannot be done yet.
2026-02-09 10:20:56 +01:00
David Testé
650f8a400f chore(ci): add runs-on configuration file
This is done before migrating the CI running on AWS to avoid inline runs-on configuration in each workflow file.
2026-02-09 09:35:29 +01:00
Pedro Alves
7fac0bf3b2 fix(gpu): use correct lwe_per_glwe value in compression metadata 2026-02-08 12:14:07 -03:00
Pedro Alves
91a927e09e fix(gpu): add lwe_per_glwe consistency checks between scratch and host functions 2026-02-08 12:14:07 -03:00
Pedro Alves
87cf5dd8a0 fix(gpu): remove unused number_bits_to_unpack initialization in host_extract 2026-02-08 12:14:07 -03:00
Pedro Alves
52d90e3c62 fix(gpu): assert all items have same shape in from_vec_cuda_lwe_ciphertexts_list 2026-02-08 12:14:07 -03:00
Pedro Alves
4abaf92dbd fix(gpu): assert exactly one element in CudaLweCiphertextList::into_lwe_ciphertext 2026-02-08 12:14:07 -03:00
Pedro Alves
140d27f11d fix(gpu): use lwe_compact_ciphertext_list_size for validation in CudaLweCompactCiphertextList::from_d_vec 2026-02-08 12:14:07 -03:00
Agnes Leroy
cd4f677248 chore(gpu): improve checks in device code 2026-02-06 18:31:29 +01:00
Agnes Leroy
6ad8f30e3f fix(gpu): fix logic in lwe ciphertext list 2026-02-06 18:18:14 +01:00
David Testé
58f075b669 chore(ci): run workflows on push only on public repository 2026-02-06 18:00:55 +01:00
David Testé
4145497a47 chore(ci): fix parameters check workflow setup condition
Prior to this, parameters_check workflow could never run on pull-request event even if one of the paths was matched.
2026-02-06 18:00:55 +01:00
Pedro Alves
c939687351 chore(gpu): replaces (a + b - 1) / b patterns in the cuda backend by CEIL_DIV
- also, this commit renames kernel_dimensions.cuh to helper.cuh and copies the content of the older helper.cuh into helper_debug.cuh
2026-02-06 16:48:25 +01:00
Agnes Leroy
316c345d0a chore(gpu): add some missing checks in core crypto 2026-02-06 15:49:17 +01:00
Agnes Leroy
dcb0f892ef chore(gpu): cleanup device.cu binding, remove _async functions from core crypto 2026-02-06 15:49:17 +01:00
Thomas Montaigu
4e1ab7f769 chore: move shortint expanded types into shortint mod
The expanded types definitions were in the high level API
as it was originally related to the XofKeySet feature.

However, since it's now used even in non-xof setting
we decided to move these types to shortint module
where they conceptually belong
2026-02-06 15:49:01 +01:00
Thomas Montaigu
b8843352a5 feat(hlapi): add is_conformant for CompressedXofKeySet 2026-02-06 15:48:42 +01:00
Thomas Montaigu
8f9571dc64 fix(xofkeyset): generate multibit decompression key when params are multibit 2026-02-06 15:48:42 +01:00
Thomas Montaigu
1d7c7dfa98 fix(conformance): GGSW list had wrong group count 2026-02-06 15:48:42 +01:00
Thomas Montaigu
b3029d7296 chore(xof_key_set): make generate_with_pre_seeded_generator public
MPC teams need to be able to generate a CompressedXofKeySet
from an existing ClientKey
2026-02-06 13:29:03 +01:00
Agnes Leroy
b4c8f782c4 chore(gpu): add fallback for 4-l40 in CI 2026-02-06 12:00:25 +01:00
David Testé
94fb1c61fe doc: fix specs description of aws hpc7a.96xlarge instance
These instances have two sockets, each equipped with a 96-core CPU.
2026-02-06 10:50:20 +01:00
Guillermo Oyarzun
9a870652dd fix(gpu): clean unused variables in specialized classical pbs 2026-02-06 09:59:07 +01:00
David Testé
b71799de2f chore(ci): add svg generation for erc20 benchmarks
This commit introduces the concept of a benchmark subset in the
data_extractor. This allows a user to fetch only part of the
benchmark results on a given layer. For now only HLAPI ERC20
benchmarks handling is implemented.

Also, the benchmark type 'both' has been added. It allows a user
to fetch both latency and throughput results in the database.
This is used in ERC20 SVG generation to display these two
benchmark types within the same table.
2026-02-05 18:24:12 +01:00
Agnes Leroy
869af08f1e chore(gpu): make valgrind run weekly 2026-02-05 17:46:10 +01:00
Agnes Leroy
4a2eac2990 chore(gpu): extend time for 4090 tests now that tests take longer 2026-02-05 16:51:52 +01:00
David Testé
bbe62324fa chore(ci): pin slsa-github-generator to a specific commit
Zizmor analysis didn't pick up this non-pinned action usage. It's now pinned to the commit of the v2.1.0 of the action.
2026-02-05 16:11:14 +01:00
Andrei Stoian
bdc5d8597e fix(gpu): valgrind improve error return 2026-02-05 13:55:45 +01:00
Andrei Stoian
cba7f17c12 fix(gpu): deliberate leak 2026-02-05 13:55:45 +01:00
Andrei Stoian
74bbdf9038 fix(gpu): valgrind error on leaks 2026-02-05 13:55:45 +01:00
Guillermo Oyarzun
7da02520dd feat(gpu): create different threshold for multi-gpu pbs128 2026-02-05 13:18:30 +01:00
Theo Souchon
0398dccf29 chore(bench): add dedup operations to reduce ci time 2026-02-05 13:07:56 +01:00
Theo Souchon
e3ad38d077 chore(bench): add throughput mode on hlapi operations 2026-02-05 13:07:56 +01:00
Guillermo Oyarzun
c471c3f687 fix(gpu): fix race condition in tbc implementations 2026-02-05 12:44:50 +01:00
David Testé
e40e127393 chore(bench): add fast benchmark capability to integer zk
Now, one can run a fast ZK benchmark. This would run only the compute load 'verify' with 64bit and 256bit to prove using a 2048bit CRS.
Fast benchmark can be triggered by using the make environment variable 'BENCH_OP_FLAVOR' with the value 'fast_default' or 'fast'.
2026-02-05 11:52:32 +01:00
Agnes Leroy
9ad43d62c3 chore(gpu): split fast h100 workflow into core and hlapi 2026-02-05 11:46:41 +01:00
Guillermo Oyarzun
8daccd1d85 feat(gpu): avoid register spilling memory in ff128 2026-02-04 18:08:42 +01:00
Guillermo Oyarzun
ed117630e8 feat(gpu): use 512 threads for pbs128 flavors 2026-02-04 18:08:42 +01:00
Enzo Di Maria
267ad280f2 feat(gpu): kreyvium 2026-02-04 15:14:55 +01:00
Agnes Leroy
9a556bc517 chore(gpu): reduce hl test time in CI 2026-02-04 13:54:37 +01:00
Agnes Leroy
5d651c0639 fix(gpu): fix small cpu memory leak 2026-02-04 11:42:00 +01:00
Thomas Montaigu
96b7c6ee60 refactor(csprng): move Seed backward compat in csprng
XofSeed had its backward_compatibility in the csprng
crate where it originates from. The Seed type did not,
and core_crypto had to do extra work to have this backward
compatibility.

So we move the backward compatibility into csprng for consistency.

Also, this removes a wrong re-export of serde
as there was

```
// generator.rs
pub mod serialization_proxy {
    pub(crate) use serde::{Deserialize, Serialize};
    // ...
}

// random/mod.rs
pub use generator::*;
```
which led to Deserialize, Serialize being re-exported crate-wide, and some
other files of the lib imported these traits from there instead of from serde
2026-02-04 09:37:59 +01:00
Thomas Montaigu
0340869347 feat(hlapi): add decompress_to_gpu for CompressedXofKeySet
Done using the newly added expand + convert pattern
2026-02-04 09:36:32 +01:00
Thomas Montaigu
daff533de4 refactor(hlapi): add IntegerExpandedServerKey::convert_to_gpu
And use it to convert from CompressedServerKey to CudaServerKey.
2026-02-04 09:36:32 +01:00
Thomas Montaigu
155c575bd9 refactor(hlapi): split gpu key conversion in expand/convert
converting from CompressedServerKey (Cpu) to CudaServerKey
was done via decompress_from_cpu/decompress_to_cuda methods.

We refactor to split these functions in 2: one that converts from cpu
(input in std domain for bootstrap keys), the other that
decompress/expand then calls convert
2026-02-04 09:36:32 +01:00
Thomas Montaigu
e942c22bc1 refactor(hlapi): add expand() method to CompressedServerKey
Add an expand method to CompressedServerKey that returns an
IntegerExpandedServerKey. Refactor decompress() to use the new expand()
then convert_to_cpu().

This will allow later to refactor the conversion from
CompressedServerKey to CudaServerKey to follow a similar pattern,
meaning we will be able to share the code that converts keys from CPU to
GPU between the normal server key and the xof server key
2026-02-04 09:36:32 +01:00
Thomas Montaigu
e620768e3c fix(hlapi-gpu): correct state of post decompression noise squashed ct
The GPU part did not set the correct state

This problem is sort of rare to encounter as generally a
CompressedSquashedNoiseCiphertextList is deserialized then used.
When deserialized it's on the CPU, so calls to `get` use CPU
code which correctly sets the state.

This problem is thus visible when either:
- safe_deserializing and manually moving the list to GPU
- deserialize
- directly expanding after the creation of the list
2026-02-04 09:36:32 +01:00
Thomas Montaigu
ebd6c06d7b fix(hlapi): return error when trying to expand non packed list 2026-02-04 09:36:32 +01:00
David Testé
fb630d5cd2 chore(bench): use tfhe-benchmark id format pattern in wasm
This brings consistency across benchmark ID naming and eases work
for data consumers like data_extractor.
2026-02-03 14:24:11 +01:00
Agnes Leroy
c2bcb10702 fix(gpu): fix noise level in match value 2026-02-03 09:05:06 +01:00
David Testé
a1f17f4a00 doc: add svg tables to zero-knowledge benchmarks
This removes the embedded GSheet.
These SVGs display more operations and inputs that reflect
real-use cases.
Throughput is available only for server-side computation as
it's meaningless to perform multiple proofs in parallel on
the client-side.
2026-02-02 15:59:33 +01:00
Andrei Stoian
8f35a3a3d6 chore(gpu): refactor lut generation 2026-02-02 15:01:41 +01:00
Beka Barbakadze
2f111bc413 feat(gpu): add bit-wise consistency test for forward FFT128 2026-02-02 17:06:14 +04:00
dependabot[bot]
ee9a95f8dd chore(deps): bump actions/setup-python from 6.1.0 to 6.2.0
Bumps [actions/setup-python](https://github.com/actions/setup-python) from 6.1.0 to 6.2.0.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](83679a892e...a309ff8b42)

---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-version: 6.2.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-02-02 11:41:04 +01:00
dependabot[bot]
16bdbeb0ea chore(deps): bump zizmorcore/zizmor-action from 0.3.0 to 0.4.1
Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.3.0 to 0.4.1.
- [Release notes](https://github.com/zizmorcore/zizmor-action/releases)
- [Commits](e639db9933...135698455d)

---
updated-dependencies:
- dependency-name: zizmorcore/zizmor-action
  dependency-version: 0.4.1
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-02-02 11:40:54 +01:00
dependabot[bot]
75667b79ca chore(deps): bump JS-DevTools/npm-publish from 4.1.3 to 4.1.4
Bumps [JS-DevTools/npm-publish](https://github.com/js-devtools/npm-publish) from 4.1.3 to 4.1.4.
- [Release notes](https://github.com/js-devtools/npm-publish/releases)
- [Changelog](https://github.com/JS-DevTools/npm-publish/blob/main/CHANGELOG.md)
- [Commits](d2fef917d9...4ce4bd0f33)

---
updated-dependencies:
- dependency-name: JS-DevTools/npm-publish
  dependency-version: 4.1.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-02-02 11:40:40 +01:00
dependabot[bot]
8b7f8ead23 chore(deps): bump foundry-rs/foundry-toolchain from 1.6.0 to 1.7.0
Bumps [foundry-rs/foundry-toolchain](https://github.com/foundry-rs/foundry-toolchain) from 1.6.0 to 1.7.0.
- [Release notes](https://github.com/foundry-rs/foundry-toolchain/releases)
- [Changelog](https://github.com/foundry-rs/foundry-toolchain/blob/master/RELEASE.md)
- [Commits](8b0419c685...8789b3e21e)

---
updated-dependencies:
- dependency-name: foundry-rs/foundry-toolchain
  dependency-version: 1.7.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-02-02 11:40:25 +01:00
David Testé
03aa3ddec2 chore(ci): remove pull-request opening option in workflow
In case of documentation benchmarks results, we want the upload to be manual.
2026-02-02 11:38:42 +01:00
Mayeul@Zama
40a439620f chore(core): remove PlanWrapper 2026-02-02 09:42:13 +01:00
Mayeul@Zama
0fca1796f3 feat(core): add GenericPlanMap::set 2026-02-02 09:42:13 +01:00
Mayeul@Zama
3fb5eb7a21 refactor(core): use typed PolynomialSize for Plan 2026-02-02 09:42:13 +01:00
Mayeul@Zama
d1f77b23f3 refactor(core): simplify PlanMap 2026-02-02 09:42:13 +01:00
Mayeul@Zama
e75bca1b1d refactor(core): factorize plan map management 2026-02-02 09:42:13 +01:00
Guillermo Oyarzun
aa1e9ffdfa chore(gpu): avoid running gpu noise in integer tests 2026-02-02 09:24:04 +01:00
Theo Souchon
45d76cf079 chore(bench): remove bitnot from hlapi bench because already tested 2026-01-30 15:36:08 +01:00
Beka Barbakadze
f4713ceeaa fix(gpu): add __syncthread and threadIdx condition for sample_extract_body in all pbs versions 2026-01-30 15:31:53 +01:00
Pedro Alves
600532e8f7 chore(gpu): add test for single-item compact ciphertext list expand 2026-01-30 09:37:58 +01:00
Andrei Stoian
7bfb236543 fix(gpu): more crypto param checks in cuda backend 2026-01-30 09:32:21 +01:00
Beka Barbakadze
7c0ecf48f2 refactor(gpu): refactor f128 and fft128 to prevent possible precision losses and improve performance 2026-01-30 08:55:22 +01:00
Pedro Alves
66b357d869 fix(gpu): fix some inconsistencies in decompression that could enable access of not allocated memory
- also adds a diagram explaining how compression / decompression work
2026-01-29 16:35:03 -03:00
Agnes Leroy
b7786afaf5 fix(gpu): add check on nullptr for dest_indexes in many lut gather 2026-01-29 10:08:06 +01:00
Guillermo Oyarzun
84931c420f fix(gpu): handling temporary events destruction 2026-01-28 15:46:08 +01:00
Pedro Alves
93201d6afb fix(gpu): fix an invalid access in expand when the number of LWEs is odd
- also improves test_expander_length_matches_data_items
- adds diagrams and explanations about GPU's expand
2026-01-28 14:32:56 +01:00
Arthur Meyre
1f6681ec39 chore: remove milestone reminder on PRs 2026-01-28 13:03:53 +01:00
Theo Souchon
31a1e977b2 fix(wasm): benchmark compilation for wasm 2026-01-27 15:45:38 +01:00
David Testé
2cfa0f74d1 chore(bench): use standard separator for zk_pke benchmark ids
Parameters and benchmark case configuration must be separated
with a '::' instead of an underscore to ease parsing by automated
tools like data_extractor.
2026-01-27 15:07:06 +01:00
David Testé
17fd3e9db5 chore(bench): add layer name to zk_pke benchmark ids
Layer name 'integer' was missing in benchmark IDs. This made it difficult for automated tools like data_extractor to handle ZK benchmark results.
2026-01-27 15:07:06 +01:00
David Testé
59646a76e4 chore(ci): ensure no credentials are leaked in action log
When using direct git command, credentials are exposed in the console logs. Despite the fact GitHub is redacting its secrets, adding --quiet flag ensures that, even if this redaction feature is flawed, we don't leak secrets in the action log.
To go further, we also shred the local git configuration file to remove any trace of the remote branch that contains credentials.
2026-01-27 15:06:48 +01:00
Theo Souchon
406055671b chore(bench): add missing operation in hlapi benches 2026-01-27 11:23:13 +01:00
David Testé
87bb4d99d3 chore(ci): update slab-github-runner action to v1.5.0
This new version improves handling of asynchronous tasks related
to Slab and GitHub API.
2026-01-27 10:36:04 +01:00
Guillermo Oyarzun
9f9b54dcb8 fix(gpu): add panic for 32-bit Torus calls 2026-01-27 09:42:11 +01:00
Agnes Leroy
a8a796de6c chore(gpu): fix logic to check ptr validity in device.cu 2026-01-27 09:19:48 +01:00
Agnes Leroy
7b4093b572 chore(gpu): stop trying to enable NVlink since we don't use it 2026-01-27 09:19:40 +01:00
Thomas Montaigu
f52eb16581 refactor(xof_key_set): split decompression into expansion and conversion
Introduce IntegerExpandedServerKey as an intermediate representation
between compressed (seeded) keys and backend-specific formats. Decompression
is now a two-step process:

1. Seed expansion: decompress seeded keys into standard domain representations
   (e.g., LweBootstrapKey instead of FourierLweBootstrapKey)
2. Backend conversion: convert to target backend format (CPU Fourier, GPU, etc.)

This separation allows sharing the expansion step across backends while
specializing only the final conversion, as for this XOF based expansion
the order is important

Changes:
- Split xof_key_set.rs into module structure (mod.rs, internal.rs, test.rs)
- Add intermediate types, that contains the expanded, but not converted
  data
2026-01-26 18:53:53 +01:00
dependabot[bot]
96622506c5 chore(deps): bump actions/checkout from 6.0.1 to 6.0.2
Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.1 to 6.0.2.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](8e8c483db8...de0fac2e45)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 6.0.2
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2026-01-26 17:49:17 +01:00
David Testé
ce73b934b2 chore(bench): add params type selection from env variable on gpu
Prior to this, multi-bit integer benchmarks on GPU could only be
launched from the make recipe 'bench_[signed_]integer_multi_bit'.
Adding the parameters selection to 'bench_[signed_]integer_gpu'
allows benchmark workflows to work as they are designed.
2026-01-26 17:08:30 +01:00
284 changed files with 15194 additions and 5777 deletions

15
.github/runs-on.yml vendored Normal file
View File

@@ -0,0 +1,15 @@
runners:
cpu-big:
family: m6i.32xlarge
image: cpu-tests-eu-west-3
volume: 200gb
spot: false
cpu-small:
family: m6i.4xlarge
image: cpu-tests-eu-west-3
volume: 200gb
spot: false
images:
cpu-tests-eu-west-3:
ami: "ami-0a786ffdb1411fac4" # Ubuntu 24.04

View File

@@ -34,6 +34,9 @@ permissions:
jobs:
setup-instance:
name: aws_tfhe_backward_compat_tests/setup-instance
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name != 'push'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -41,7 +44,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -66,7 +69,7 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}
@@ -141,7 +144,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -86,7 +86,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -51,7 +51,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -72,7 +72,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -155,7 +155,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -16,10 +16,12 @@ on:
- integer_zk
- shortint
- shortint_oprf
- hlapi
- hlapi_unsigned
- hlapi_signed
- hlapi_erc20
- hlapi_dex
- hlapi_noise_squash
- hlapi_kvstore
- tfhe_zk_pok
- boolean
- pbs

View File

@@ -126,7 +126,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -261,7 +261,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -24,6 +24,7 @@ permissions: {}
jobs:
prepare-inputs:
name: benchmark_cpu_weekly/prepare-inputs
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
outputs:
is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -72,8 +73,7 @@ jobs:
run-benchmarks-integer:
name: benchmark_cpu_weekly/run-benchmarks-integer
if: github.repository == 'zama-ai/tfhe-rs'
&& (needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -92,8 +92,7 @@ jobs:
run-benchmarks-integer-zk-pke:
name: benchmark_cpu_weekly/run-benchmarks-integer-zk-pke
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -111,8 +110,7 @@ jobs:
run-benchmarks-hlapi-erc20:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -130,8 +128,7 @@ jobs:
run-benchmarks-hlapi-dex:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-dex
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -149,8 +146,7 @@ jobs:
run-benchmarks-core-crypto:
name: benchmark_cpu_weekly/run-benchmarks-core-crypto
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -167,8 +163,7 @@ jobs:
run-benchmarks-shortint:
name: benchmark_cpu_weekly/run-benchmarks-shortint
if: github.repository == 'zama-ai/tfhe-rs'
&& (needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -186,8 +181,7 @@ jobs:
run-benchmarks-boolean:
name: benchmark_cpu_weekly/run-benchmarks-boolean
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -206,8 +200,7 @@ jobs:
run-benchmarks-tfhe-zk-pok:
name: benchmark_cpu_weekly/run-benchmarks-tfhe-zk-pok
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:

View File

@@ -33,7 +33,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,10 +25,6 @@ on:
description: "Generate SVG tables"
type: boolean
default: true
open-pr:
description: "Open a PR with the benchmark results"
type: boolean
default: false
permissions: {}
@@ -166,54 +162,3 @@ jobs:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
open-pr:
name: benchmark-documentation/open-pr
needs: [ generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run ]
if: ${{ always() && inputs.open-pr &&
(needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
runs-on: ubuntu-latest
permissions:
contents: write # Needed to create a commit
pull-requests: write # Needed to open a pull-request
env:
PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
- name: Download SVG tables
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
with:
path: svg_tables
merge-multiple: 'true'
# Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
- name: Copy SVG tables to documentation location
run: |
cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null
- name: Get current date
id: get-date
run: |
echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
- name: Create pull-request
uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
with:
sign-commits: true # Commit will be signed by github-actions bot
add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
commit-message: |
chore(docs): update benchmark results for all backends
Automated documentation update from tfhe-rs CI pipeline.
title: |
[CI] chore(docs): update benchmark results for all backends
body: |
Documentation update triggered by GitHub workflow.
labels: documentation

View File

@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab

View File

@@ -134,7 +134,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -324,7 +324,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -50,6 +50,8 @@ env:
jobs:
parse-inputs:
name: benchmark_gpu_coprocessor/parse-inputs
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
contents: 'read'
@@ -92,7 +94,7 @@ jobs:
steps:
- name: Start remote instance
id: start-remote-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +132,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
path: tfhe-rs
persist-credentials: false
@@ -141,7 +143,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -192,7 +194,7 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
- name: Cache cargo
uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
@@ -299,7 +301,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -324,7 +326,7 @@ jobs:
steps:
- name: Stop remote instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,6 +25,7 @@ permissions: {}
jobs:
prepare-inputs:
name: benchmark_cpu_weekly/prepare-inputs
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
outputs:
is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -49,8 +50,7 @@ jobs:
run-benchmarks-8-h100-sxm5-integer:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -72,8 +72,7 @@ jobs:
run-benchmarks-8-h100-sxm5-integer-compression:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -95,8 +94,7 @@ jobs:
run-benchmarks-8-h100-sxm5-integer-zk-aes:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -118,8 +116,7 @@ jobs:
run-benchmarks-8-h100-sxm5-noise-squash:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -141,8 +138,7 @@ jobs:
run-benchmarks-1-h100-core-crypto:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -166,8 +162,7 @@ jobs:
run-benchmarks-1-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -187,8 +182,7 @@ jobs:
run-benchmarks-2-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -208,8 +202,7 @@ jobs:
run-benchmarks-8-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -233,8 +226,7 @@ jobs:
run-benchmarks-1-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -254,8 +246,7 @@ jobs:
run-benchmarks-2-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -275,8 +266,7 @@ jobs:
run-benchmarks-8-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:

View File

@@ -12,7 +12,8 @@ on:
default: integer
options:
- integer
- hlapi
- hlapi_unsigned
- hlapi_signed
- hlapi_erc20
op_flavor:
description: "Operations set to run"

View File

@@ -126,7 +126,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab

View File

@@ -50,7 +50,7 @@ jobs:
pull-requests: write # Needed to write a comment in a pull-request
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0 # Needed to get commit hash
persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -305,13 +305,13 @@ jobs:
REF_NAME: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install recent Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.12'
pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

126
.github/workflows/benchmark_summary.yml vendored Normal file
View File

@@ -0,0 +1,126 @@
# Run all benchmarks displayed in the internal documentation.
#
# Manually triggered workflow that fans out to the shared CPU and GPU benchmark
# workflows with the `summary` command. Each backend can be toggled with its own
# boolean input.
name: benchmark_summary

run-name: Benchmark Summary

on:
  workflow_dispatch:
    inputs:
      run-cpu-benchmarks:
        description: "Run CPU benchmarks"
        type: boolean
        default: true
      run-gpu-benchmarks:
        description: "Run GPU benchmarks"
        type: boolean
        default: true
      # Each option is formatted as "<slab profile> (<hardware name>)".
      # Both parts are extracted by the parse-gpu-inputs job.
      gpu-profile:
        description: "GPU Instance type"
        required: true
        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
        type: choice
        options:
          - "l40 (n3-L40x1)"
          - "4-l40 (n3-L40x4)"
          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
      # NOTE: this input currently has no effect, the run-benchmarks-hpu job
      # at the bottom of this file is still commented out.
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
        default: true

permissions: {}

# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow

jobs:
  # Split the "<profile> (<hardware name>)" choice into two separate outputs
  # consumed by the GPU benchmark job.
  parse-gpu-inputs:
    name: benchmark_summary/parse-gpu-inputs
    if: inputs.run-gpu-benchmarks
    runs-on: ubuntu-latest
    outputs:
      profile: ${{ steps.parse_profile.outputs.profile }}
      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    env:
      # Passed through the environment rather than interpolated in the scripts.
      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
    steps:
      # Keeps the part before the parenthesised hardware name,
      # e.g. "l40 (n3-L40x1)" -> "l40".
      - name: Parse profile
        id: parse_profile
        run: |
          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
          # shellcheck disable=SC2001
          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"

      # Keeps the part inside the parentheses,
      # e.g. "l40 (n3-L40x1)" -> "n3-L40x1".
      - name: Parse hardware name
        id: parse_hardware_name
        run: |
          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
          # shellcheck disable=SC2001
          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"

  # CPU summary benchmarks, delegated to the shared CPU benchmark workflow.
  run-benchmarks-cpu:
    name: benchmark_summary/run-benchmarks-cpu
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
      command: summary
      bench_type: both
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  # GPU summary benchmarks, delegated to the shared GPU benchmark workflow and
  # run on the instance selected through the gpu-profile input.
  run-benchmarks-gpu:
    name: benchmark_summary/run-benchmarks-gpu
    uses: ./.github/workflows/benchmark_gpu_common.yml
    if: inputs.run-gpu-benchmarks
    needs: parse-gpu-inputs
    with:
      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
      command: summary
      bench_type: both
      params_type: classical + multi_bit
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  # TODO add make recipe for HPU benchmarks
  # run-benchmarks-hpu:
  #   name: benchmark_summary/run-benchmarks-hpu
  #   uses: ./.github/workflows/benchmark_hpu_common.yml
  #   if: inputs.run-hpu-benchmarks
  #   with:
  #     command: summary
  #     bench_type: both
  #     v80_pcie_dev: 24
  #     v80_serial_number: XFL12NWY3ZKG
  #   secrets:
  #     BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
  #     SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  #     REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
  #     JOB_SECRET: ${{ secrets.JOB_SECRET }}
  #     SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
  #     SLAB_URL: ${{ secrets.SLAB_URL }}
  #     SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
  #     SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}

View File

@@ -31,13 +31,16 @@ permissions: {}
jobs:
setup-instance:
name: benchmark_tfhe_fft/setup-instance
if:
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -134,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -31,13 +31,16 @@ permissions: {}
jobs:
setup-instance:
name: benchmark_tfhe_ntt/setup-instance
if:
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -55,7 +58,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -134,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -31,15 +31,14 @@ jobs:
name: benchmark_wasm_client/should-run
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs')
permissions:
pull-requests: read # Needed to check for file change
outputs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -71,7 +70,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -91,7 +90,7 @@ jobs:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -181,7 +180,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: zama-ai/slab
path: slab
@@ -213,7 +212,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -24,9 +24,11 @@ permissions: {}
jobs:
audit:
name: cargo_audit/audit
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
outputs:
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -80,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
result: ${{ steps.set_builds_result.outputs.result }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -2,6 +2,7 @@
name: cargo_test_fft
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -22,6 +23,8 @@ permissions:
jobs:
should-run:
name: cargo_test_fft/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -29,7 +32,7 @@ jobs:
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -56,7 +59,7 @@ jobs:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +95,7 @@ jobs:
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -2,6 +2,7 @@
name: cargo_test_ntt
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -24,6 +25,8 @@ permissions:
jobs:
should-run:
name: cargo_test_ntt/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -31,7 +34,7 @@ jobs:
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: "false"
@@ -60,7 +63,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -87,7 +90,7 @@ jobs:
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -143,7 +146,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,7 +43,7 @@ jobs:
echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"
- name: Check workflows security
uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
with:
advanced-security: 'false' # Print results directly in logs
persona: pedantic

View File

@@ -31,7 +31,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'

View File

@@ -19,8 +19,8 @@ on:
pull_request:
types: [ labeled ]
schedule:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
# Every odd-numbered day of the month at 1AM (roughly every other day)
- cron: "0 1 */2 * *"
permissions:
contents: read
@@ -37,11 +37,11 @@ jobs:
group: ${{ github.workflow_ref }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 1440 # 24 hours
timeout-minutes: 2880 # 48 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -23,8 +23,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# every month
- cron: "0 0 1 * *"
# every friday noon
- cron: "0 12 * * 5"
permissions:
contents: read
@@ -35,15 +35,15 @@ jobs:
setup-instance:
name: gpu_code_validation_tests/setup-instance
runs-on: ubuntu-latest
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: gpu_fast_h100_tests
name: gpu_core_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -32,7 +32,7 @@ permissions:
jobs:
should-run:
name: gpu_fast_h100_tests/should-run
name: gpu_core_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -61,15 +61,14 @@ jobs:
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_fast_h100_tests.yml'
- '.github/workflows/gpu_core_h100_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml
setup-instance:
name: gpu_fast_h100_tests/setup-instance
name: gpu_core_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -87,7 +86,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -111,7 +110,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: gpu_fast_h100_tests/cuda-tests-linux
name: gpu_core_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -129,7 +128,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -155,20 +154,8 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: gpu_fast_h100_tests/slack-notify
name: gpu_core_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -187,10 +174,10 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
SLACK_MESSAGE: "Core H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: gpu_fast_h100_tests/teardown-instance
name: gpu_core_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
@@ -198,7 +185,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -79,7 +79,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +151,7 @@ jobs:
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
make test_high_level_api_gpu_fast
slack-notify:
name: gpu_fast_tests/slack-notify
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +154,7 @@ jobs:
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu
make test_high_level_api_gpu_fast
slack-notify:
name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +187,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -0,0 +1,209 @@
# Run the user docs, C API and High Level API GPU tests (tfhe-cuda-backend) on an H100 VM on Hyperstack
name: gpu_hlapi_h100_tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACKIFY_MARKDOWN: true
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
# Secrets will be available only to zama-ai organization members
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
permissions:
contents: read
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
jobs:
should-run:
name: gpu_hlapi_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_hlapi_h100_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml
setup-instance:
name: gpu_hlapi_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
# Use the permanent remote instance label first, because the on-demand remote instance label output is set before the end of the start-remote-instance step.
# If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
# Since the on-demand remote label is set before the failure, the logical OR must be done in this order;
# otherwise we would try to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
# This allows falling back on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance is spawned specifically for pull requests from forked repositories.
- name: Start GitHub instance
id: start-github-instance
if: env.SECRETS_AVAILABLE == 'false'
run: |
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: gpu_hlapi_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.8"
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
- name: Enable nvidia multi-process service
run: |
nvidia-cuda-mps-control -d
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: gpu_hlapi_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "HL API H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: gpu_hlapi_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -38,7 +38,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,7 +74,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -2,6 +2,7 @@
name: hpu_hlapi_tests
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -25,6 +26,8 @@ permissions: {}
jobs:
should-run:
name: hpu_hlapi_tests/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -32,7 +35,7 @@ jobs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -62,7 +65,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +86,7 @@ jobs:
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -114,7 +117,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -34,7 +34,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -52,7 +52,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -75,6 +75,7 @@ jobs:
name: make_release_common/provenance
if: ${{ !inputs.dry-run }}
needs: package
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
actions: read # Needed to detect the GitHub Actions environment
@@ -93,7 +94,7 @@ jobs:
id-token: write # Needed for OIDC token exchange on crates.io
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
persist-credentials: 'false'

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
persist-credentials: "false"
@@ -117,6 +117,7 @@ jobs:
name: make_release_cuda/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
actions: read # Needed to detect the GitHub Actions environment
@@ -221,7 +222,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -92,7 +92,7 @@ jobs:
- name: Publish web package
if: ${{ inputs.push_web_package }}
uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
with:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:
- name: Publish Node package
if: ${{ inputs.push_node_package }}
uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
with:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}

View File

@@ -30,7 +30,7 @@ jobs:
name: parameters_check/setup-instance
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch'
github.event_name != 'push'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
toolchain: stable
- name: Checkout lattice-estimator
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
repository: malb/lattice-estimator
path: lattice_estimator
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,18 +1,127 @@
# Placeholder workflow file allowing running it without having to merge to main first
name: placeholder_workflow
run-name: Summary benchs tests
on:
workflow_dispatch:
inputs:
run-cpu-benchmarks:
description: "Run CPU benchmarks"
type: boolean
default: true
run-gpu-benchmarks:
description: "Run GPU benchmarks"
type: boolean
default: true
gpu-profile:
description: "GPU Instance type"
required: true
default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "8-l40 (n3-L40x8)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100-SXM5x8)"
run-hpu-benchmarks:
description: "Run HPU benchmarks"
type: boolean
default: true
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
placeholder:
name: placeholder_workflow/placeholder
parse-gpu-inputs:
name: benchmark_summary/parse-gpu-inputs
if: inputs.run-gpu-benchmarks
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
env:
INPUTS_PROFILE: ${{ inputs.gpu-profile }}
steps:
- run: |
echo "Hello this is a Placeholder Workflow"
- name: Parse profile
id: parse_profile
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
- name: Parse hardware name
id: parse_hardware_name
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
run-benchmarks-cpu:
name: benchmark_documentation/run-benchmarks-cpu-integer
uses: ./.github/workflows/benchmark_cpu_common.yml
if: inputs.run-cpu-benchmarks
with:
command: summary
bench_type: both
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-gpu:
name: benchmark_documentation/run-benchmarks-gpu
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-benchmarks
needs: parse-gpu-inputs
with:
profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
command: summary
bench_type: both
params_type: classical + multi_bit
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
# TODO add make recipe for HPU benchmarks
# run-benchmarks-hpu:
# name: benchmark_documentation/run-benchmarks-hpu
# uses: ./.github/workflows/benchmark_hpu_common.yml
# if: inputs.run-hpu-benchmarks
# with:
# command: summary
# bench_type: both
# v80_pcie_dev: 24
# v80_serial_number: XFL12NWY3ZKG
# secrets:
# BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
# JOB_SECRET: ${{ secrets.JOB_SECRET }}
# SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
# SLAB_URL: ${{ secrets.SLAB_URL }}
# SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
# SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}

View File

@@ -1,67 +0,0 @@
name: pr_milestone_check
on:
pull_request:
types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
# external contributors workflows are manually approved
jobs:
check-empty-milestone:
name: pr_milestone_check/check-empty-milestone
runs-on: ubuntu-latest
if: github.event.pull_request.milestone == null
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone Missing
Please assign a milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is missing. This check is failing."
exit 1
check-milestone-open:
name: pr_milestone_check/check-milestone-open
runs-on: ubuntu-latest
if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone is closed
Please assign an open milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is closed. This check is failing."
exit 1

View File

@@ -30,7 +30,7 @@ jobs:
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,6 +47,8 @@ jobs:
echo ">>> Pushing all LFS items..."
git lfs push --all destination "${DESTINATION_BRANCH}"
shred --remove .git/config
- name: git-sync-tags
env:
@@ -59,7 +61,7 @@ jobs:
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -70,3 +72,5 @@ jobs:
echo ">>> Pushing git changes..."
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
shred --remove .git/config

View File

@@ -12,6 +12,7 @@ permissions: {}
jobs:
stale:
name: unverified_prs/stale
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
permissions:
issues: read # Needed to fetch all issues

161
Makefile
View File

@@ -733,11 +733,12 @@ test_core_crypto_gpu:
--features=gpu -p tfhe -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
test_integer_gpu: install_cargo_nextest
TEST_THREADS=2 \
DOCTEST_THREADS=4 \
./scripts/integer-tests.sh \
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
--tfhe-package "tfhe" --all-but-noise
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
test_integer_gpu_debug:
@@ -1049,10 +1050,16 @@ test_high_level_api:
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
-- high_level_api::
test_high_level_api_gpu: install_cargo_nextest
test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"
-E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"
test_list_gpu: install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1371,6 +1378,9 @@ clippy_bench: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=shortint,internal-keycache \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
clippy_bench_gpu: install_rs_check_toolchain
@@ -1405,14 +1415,14 @@ bench_signed_integer: install_rs_check_toolchain
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
bench_signed_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1428,14 +1438,14 @@ bench_integer_hpu: install_rs_check_toolchain
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1449,7 +1459,8 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
.PHONY: bench_integer_zk_gpu # Run benchmarks for integer encryption with ZK proofs on GPU backend
# Bench type, bit-size set and operation flavor are handed to the benchmark
# through the __TFHE_RS_* environment variables set on the cargo command line.
bench_integer_zk_gpu: install_rs_check_toolchain
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-zk-pke \
	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
@@ -1475,6 +1486,13 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
--bench integer-trivium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
bench_integer_kreyvium_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-kreyvium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1509,7 +1527,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
bench_integer_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1655,11 +1673,18 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_unsafe_coop_firefox
.PHONY: bench_hlapi # Run benchmarks for integer operations
bench_hlapi: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
bench_hlapi_unsigned: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi \
--bench hlapi_unsigned \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_signed # Run benchmarks for signed integer operations
bench_hlapi_signed: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi_signed \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
@@ -1749,6 +1774,108 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_kvstore # Run benchmarks for Key-Value Store operations
bench_hlapi_kvstore: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-kvstore \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_summary # Run summary benchmarks
# CPU summary: runs a selection of benchmarks (arithmetic, noise squash, ERC20,
# DEX, ZK, compression). No step enables the gpu feature, and every step uses
# the default bench profile, like the standalone CPU bench targets.
bench_summary: install_rs_check_toolchain
	# Arithmetic operations: addition, multiplication, division, comparison
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi_unsigned \
	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
	# Noise squash
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-noise-squash \
	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
	# ERC20
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-erc20 \
	--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
	# DEX
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-dex \
	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
	# ZK
	# Proof benchmarks only (selected by the pke_zk_proof filter)
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-zk-pke \
	--features=integer,internal-keycache,zk-pok,pbs-stats \
	-p tfhe-benchmark -- '::pke_zk_proof'
	# Second ZK run, also a CPU build (the gpu feature is not enabled here)
	# NOTE(review): no bench filter is passed, so this run is not restricted to
	# verification benchmarks; confirm this is intended
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-zk-pke \
	--features=integer,internal-keycache,pbs-stats,zk-pok -p tfhe-benchmark --
	# Compression
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-glwe_packing_compression \
	--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
# GPU summary: runs a selection of benchmarks (arithmetic, noise squash, ERC20,
# DEX, ZK, compression). Every step built with the gpu feature uses the
# release_lto_off profile; the ZK proof step is the only one built without the
# gpu feature.
bench_summary_gpu: install_rs_check_toolchain
	# Arithmetic operations: addition, multiplication, division, comparison
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer \
	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
	# Noise squash
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-noise-squash \
	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
	# Noise squash and compression
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-noise-squash \
	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
	# ERC20
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-erc20 \
	--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
	# DEX
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench hlapi-dex \
	--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
	# ZK
	# Proof is done on CPU node of the instance (built without the gpu feature)
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-zk-pke \
	--features=integer,internal-keycache,zk-pok,pbs-stats \
	-p tfhe-benchmark -- '::pke_zk_proof'
	# Verify is done on GPUs
	# NOTE(review): no bench filter is passed, so this run is not restricted to
	# verification benchmarks; confirm this is intended
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-zk-pke \
	--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
	# Compression
	RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
	cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
	--bench integer-glwe_packing_compression \
	--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_custom # Run benchmarks with a user-defined command
bench_custom: install_rs_check_toolchain

View File

@@ -87,6 +87,7 @@ fn main() {
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/trivium/trivium.h",
"cuda/include/kreyvium/kreyvium.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -29,15 +29,13 @@ template <typename Torus> struct int_aes_lut_buffers {
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return a & b; };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and_lut, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
@@ -46,14 +44,11 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush_lut = streams.active_gpu_subset(
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush_lut, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->carry_lut = new int_radix_lut<Torus>(
@@ -61,14 +56,11 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
auto active_streams_carry_lut =
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->generate_and_broadcast_lut(
active_streams_carry_lut, {0}, {carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -10,11 +10,7 @@ extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
}
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
// Define a variant type that can be either a vector or a single pointer
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
// Returns the GPU count that active_gpu_subset() passes on when it builds a
// CudaStreams subset for `num_inputs` inputs, given `gpu_count` GPUs and the
// PBS type.
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
// Same role as get_active_gpu_count, but consumed by
// active_gpu_subset_u128() for the pbs128 path.
uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
// NOTE(review): presumably the number of inputs assigned to GPU `gpu_index`
// when `total_num_inputs` inputs are split across `gpu_count` GPUs — confirm
// against the definition.
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -80,7 +78,15 @@ public:
_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
}
// Builds the active subset of this stream set for pbs128 workloads. "Active"
// means the subset is only used temporarily, while some computation runs.
// pbs128 gets its own GPU-count computation because the original threshold
// was designed for 2_2 params.
CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
  const uint32_t active_gpu_count =
      get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type);
  return CudaStreams(_streams, _gpu_indexes, active_gpu_count);
}
// Returns a CudaStreams struct containing only the ith stream
CudaStreams get_ith(int i) const {
return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -144,9 +150,9 @@ public:
_gpu_count(src._gpu_count), _owns_streams(false) {}
CudaStreams &operator=(CudaStreams const &other) {
PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");
/* PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");*/
this->_streams = other._streams;
this->_gpu_indexes = other._gpu_indexes;
this->_gpu_count = other._gpu_count;

View File

@@ -45,12 +45,9 @@ template <typename Torus> struct boolean_bitop_buffer {
// BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
// only lut for degree = 1 is generated
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
{lut_bivariate_f},
LUT_0_FOR_ALL_BLOCKS, {}, 2);
}
break;
default:
@@ -65,14 +62,8 @@ template <typename Torus> struct boolean_bitop_buffer {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
message_extract_lut->broadcast_lut(active_streams);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
}
tmp_lwe_left = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -142,12 +133,8 @@ template <typename Torus> struct int_bitop_buffer {
}
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {lut_bivariate_f}, LUT_0_FOR_ALL_BLOCKS);
}
break;
default:
@@ -156,6 +143,8 @@ template <typename Torus> struct int_bitop_buffer {
num_radix_blocks, allocate_gpu_memory,
size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
@@ -171,14 +160,13 @@ template <typename Torus> struct int_bitop_buffer {
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(active_streams);
lut_funcs.push_back(lut_univariate_scalar_f);
lut_indices.push_back(i);
}
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
LUT_0_FOR_ALL_BLOCKS);
}
}
@@ -211,16 +199,11 @@ template <typename Torus> struct boolean_bitnot_buffer {
return x % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
}
}

View File

@@ -28,20 +28,16 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->generate_and_broadcast_lut(
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
},
allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
}},
LUT_0_FOR_ALL_BLOCKS);
this->last_block = new CudaRadixCiphertextFFI;

View File

@@ -85,42 +85,28 @@ template <typename Torus> struct int_cmux_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams_pred =
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
predicate_lut->broadcast_lut(active_streams_pred);
auto lut_index_generator = [num_radix_blocks](Torus *h_lut_indexes,
uint32_t num_indexes) {
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
};
predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
lut_index_generator);
auto active_streams_msg =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams_msg);
message_extract_lut->generate_and_broadcast_lut(
active_streams_msg, {0}, {message_extract_lut_f}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {

View File

@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
Torus total_modulus = params.message_modulus * params.carry_modulus;
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
tmp_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -39,22 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(max_chunks, params.pbs_type);
is_max_value->broadcast_lut(active_streams);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
is_max_value->generate_and_broadcast_lut(
active_streams, {0}, {is_max_value_f}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {
@@ -103,15 +102,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->broadcast_lut(active_streams);
is_non_zero_lut->generate_and_broadcast_lut(
active_streams, {0}, {is_non_zero_lut_f}, LUT_0_FOR_ALL_BLOCKS);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -129,32 +123,27 @@ template <typename Torus> struct int_comparison_eq_buffer {
return (lhs == rhs);
}
};
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < total_modulus; i++) {
auto lut_f = [i, operator_f](Torus x) -> Torus {
return operator_f(i, x);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
scalar_comparison_luts->get_lut(0, i),
scalar_comparison_luts->get_degree(i),
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
lut_funcs.push_back(lut_f);
lut_indices.push_back(i);
}
scalar_comparison_luts->broadcast_lut(active_streams);
scalar_comparison_luts->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, LUT_0_FOR_ALL_BLOCKS);
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
operator_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(active_streams);
operator_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {operator_f}, LUT_0_FOR_ALL_BLOCKS);
} else {
operator_lut = nullptr;
}
@@ -221,9 +210,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tree_last_leaf_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -234,15 +220,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->broadcast_lut(active_streams);
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {
@@ -426,12 +411,8 @@ template <typename Torus> struct int_comparison_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(active_streams);
identity_lut->generate_and_broadcast_lut(
active_streams, {0}, {identity_lut_f}, LUT_0_FOR_ALL_BLOCKS);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -441,13 +422,8 @@ template <typename Torus> struct int_comparison_buffer {
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(active_streams);
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
LUT_0_FOR_ALL_BLOCKS);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -522,13 +498,9 @@ template <typename Torus> struct int_comparison_buffer {
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->broadcast_lut(active_streams);
signed_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {signed_lut_f}, LUT_0_FOR_ALL_BLOCKS);
}
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

View File

@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
Torus *tmp_glwe_array_out;
bool gpu_memory_allocated;
uint32_t lwe_per_glwe;
uint32_t max_num_glwes;
// num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
// compress.
// lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size), defined by the
// chosen parameter set.
int_compression(CudaStreams streams, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
// Calculate the actual number of GLWEs needed based on total radix blocks.
// This ensures we allocate enough memory when num_radix_blocks >
// lwe_per_glwe.
max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
allocate_gpu_memory));
tmp_glwe_array_out =
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
max_num_glwes * glwe_accumulator_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
@@ -106,19 +116,13 @@ template <typename Torus> struct int_decompression {
encryption_params.carry_modulus;
auto effective_compression_carry_modulus = 1;
generate_device_accumulator_with_encoding<Torus>(
streams.stream(0), streams.gpu_index(0),
decompression_rescale_lut->get_lut(0, 0),
decompression_rescale_lut->get_degree(0),
decompression_rescale_lut->get_max_degree(0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
effective_compression_message_modulus,
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
decompression_rescale_lut->broadcast_lut(active_streams);
decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
active_streams, {0}, {decompression_rescale_f},
effective_compression_message_modulus,
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus);
}
}
void release(CudaStreams streams) {

View File

@@ -283,12 +283,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
zero_out_if_not_1_lut_2};
size_t lut_gpu_indexes[2] = {0, 3};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_1_lut_f},
LUT_0_FOR_ALL_BLOCKS);
}
luts[0] = zero_out_if_not_2_lut_1;
@@ -296,12 +293,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
lut_gpu_indexes[0] = 1;
lut_gpu_indexes[1] = 2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_2_lut_f},
LUT_0_FOR_ALL_BLOCKS);
}
quotient_lut_1 =
@@ -321,21 +315,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
generate_device_accumulator<Torus>(
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
quotient_lut_1->generate_and_broadcast_lut(
streams.get_ith(2), {0}, {quotient_lut_1_f}, LUT_0_FOR_ALL_BLOCKS);
quotient_lut_2->generate_and_broadcast_lut(
streams.get_ith(1), {0}, {quotient_lut_2_f}, LUT_0_FOR_ALL_BLOCKS);
quotient_lut_3->generate_and_broadcast_lut(
streams.get_ith(0), {0}, {quotient_lut_3_f}, LUT_0_FOR_ALL_BLOCKS);
message_extract_lut_1 = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -350,15 +335,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[0] = message_extract_lut_1;
luts[1] = message_extract_lut_2;
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
luts[j]->broadcast_lut(active_streams);
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
}
}
@@ -503,29 +485,35 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = 2;
h_lut_indexes[index] = index_in_grouping + group_size;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
};
generate_lut_indexes<Torus>(streams, index_generator,
first_indexes_for_overflow_sub_gpu_0[nb - 1],
nb, 2 * group_size + 1, h_lut_indexes,
allocate_gpu_memory);
}
// Extra indexes for the luts in second step
uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub_gpu_0[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
@@ -536,24 +524,37 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
};
generate_lut_indexes<Torus>(streams, index_generator,
second_indexes_for_overflow_sub_gpu_0[nb - 1],
nb, num_luts_second_step, h_lut_indexes,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
@@ -567,10 +568,6 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar, nb * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1007,24 +1004,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i] = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->broadcast_lut(active_streams_1);
masking_luts_1[i]->generate_and_broadcast_lut(
active_streams_1, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_2 =
streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->broadcast_lut(active_streams_2);
masking_luts_2[i]->generate_and_broadcast_lut(
active_streams_2, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1042,15 +1029,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(active_streams);
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
}
// Give name to closures to improve readability
@@ -1076,24 +1060,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
zero_out_if_overflow_did_not_happen[0]
->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 2);
zero_out_if_overflow_did_not_happen[1]
->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 1);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1110,24 +1084,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_happened[0]->get_lut(0, 0),
zero_out_if_overflow_happened[0]->get_degree(0),
zero_out_if_overflow_happened[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_happened[1]->get_lut(0, 0),
zero_out_if_overflow_happened[1]->get_degree(0),
zero_out_if_overflow_happened[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 2);
zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 1);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -1141,14 +1103,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
merge_overflow_flags_luts[i]->get_lut(0, 0),
merge_overflow_flags_luts[i]->get_degree(0),
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
active_gpu_count_for_bits, {0}, {lut_f_bit}, LUT_0_FOR_ALL_BLOCKS);
}
}
@@ -1220,29 +1176,34 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = 2;
h_lut_indexes[index] = index_in_grouping + group_size;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
};
generate_lut_indexes<Torus>(
streams, index_generator, first_indexes_for_overflow_sub[nb - 1], nb,
2 * group_size + 1, h_lut_indexes, allocate_gpu_memory);
}
// Extra indexes for the luts in second step
uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
@@ -1253,24 +1214,36 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
// Host-side generator for the second-step LUT index table of an `nb`-block
// input. The second lambda argument is unnamed and ignored; the number of
// entries written is the captured `nb`.
//  - blocks of the first group        -> their position inside the group
//  - last slot of every later group   -> 2 * group_size, plus a rotating
//                                        offset when `use_seq` is set
//  - every other slot of later groups -> position inside the group +
//                                        group_size
auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
                                                 uint32_t) {
  for (int block = 0; block < nb; block++) {
    const uint32_t group = block / group_size;
    const uint32_t slot = block % group_size;

    if (group == 0) {
      h_lut_indexes[block] = slot;
      continue;
    }

    if (slot != (group_size - 1)) {
      h_lut_indexes[block] = slot + group_size;
      continue;
    }

    // Last slot of a non-first group. The modulo is only evaluated in the
    // sequential variant, exactly as before.
    int extra_offset = 0;
    if (use_seq) {
      extra_offset = (group - 1) % (group_size - 1);
    }
    h_lut_indexes[block] = extra_offset + 2 * group_size;
  }
};
generate_lut_indexes<Torus>(
streams, index_generator, second_indexes_for_overflow_sub[nb - 1], nb,
num_luts_second_step, h_lut_indexes, allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
@@ -1284,10 +1257,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1557,16 +1526,12 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
compare_signed_bits_lut->get_lut(0, 0),
compare_signed_bits_lut->get_degree(0),
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
LUT_0_FOR_ALL_BLOCKS);
}
}

View File

@@ -53,13 +53,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return count;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
univ_lut_mem->broadcast_lut(active_streams);
univ_lut_mem->generate_and_broadcast_lut(
active_streams, {0}, {generate_uni_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
auto generate_bi_lut_lambda =
[num_bits](Torus block_num_bit_count,
@@ -70,13 +65,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return 0;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
biv_lut_mem->broadcast_lut(active_streams);
biv_lut_mem->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {generate_bi_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->tmp_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -232,7 +222,7 @@ template <typename Torus> struct int_ilog2_buffer {
this->sum_output_not_propagated, counter_num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->lut_message_not =
lut_message_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> lut_message_lambda =
@@ -240,16 +230,11 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t message = x % this->params.message_modulus;
return (~message) % this->params.message_modulus;
};
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
this->lut_message_not->get_lut(0, 0),
this->lut_message_not->get_degree(0),
this->lut_message_not->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->broadcast_lut(active_streams);
lut_message_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_message_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->lut_carry_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -259,13 +244,8 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t carry = x / this->params.message_modulus;
return (~carry) % this->params.message_modulus;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0),
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_carry_lambda, allocate_gpu_memory);
lut_carry_not->broadcast_lut(active_streams);
lut_carry_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->message_blocks_not = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -37,17 +37,12 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_predicate_lut->get_lut(0, 0),
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {zero_out_predicate_lut_f},
LUT_0_FOR_ALL_BLOCKS);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -55,10 +50,7 @@ template <typename Torus> struct int_mul_memory {
return;
}
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// 'vector_result_lsb' contains blocks from all possible shifts of
// radix_lwe_left excluding zero ciphertext blocks
@@ -70,6 +62,10 @@ template <typename Torus> struct int_mul_memory {
int total_block_count = num_radix_blocks * num_radix_blocks;
GPU_ASSERT(lsb_vector_block_count + msb_vector_block_count ==
total_block_count,
"MSB and LSB vector block counts don't match");
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -91,8 +87,6 @@ template <typename Torus> struct int_mul_memory {
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -102,30 +96,21 @@ template <typename Torus> struct int_mul_memory {
return (x * y) / message_modulus;
};
// generate accumulators
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lsb_acc,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_lsb, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), msb_acc,
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_msb, gpu_memory_allocated);
// lut_indexes_vec for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default lut_indexes_vec is fine
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
luts_array->broadcast_lut(active_streams);
// Host-side generator for the LUT index table of `luts_array`: the first
// `lsb_vector_block_count` entries select LUT 0, every remaining entry
// selects LUT 1.
auto lut_index_generator = [lsb_vector_block_count](Torus *h_lut_indexes,
                                                    uint32_t num_indexes) {
  for (uint32_t block = 0; block < num_indexes; ++block) {
    if (block < lsb_vector_block_count) {
      h_lut_indexes[block] = 0;
    } else {
      h_lut_indexes[block] = 1;
    }
  }
};
luts_array->generate_and_broadcast_bivariate_lut(
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, lut_index_generator);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, params, num_radix_blocks, 2 * num_radix_blocks,

View File

@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
uint32_t calculated_active_blocks =
total_random_bits == 0
? 0
: (total_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
: CEIL_DIV(total_random_bits, message_bits_per_block);
if (num_blocks_to_process != calculated_active_blocks) {
PANIC(
"num_blocks_to_process should be equal to calculated_active_blocks");
@@ -53,6 +52,10 @@ template <typename Torus> struct int_grouped_oprf_memory {
// Pre-generate all possible LUTs.
//
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
std::vector<uint64_t> lut_degrees;
for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
++random_bit) {
uint64_t p = 1ULL << random_bit;
@@ -70,14 +73,13 @@ template <typename Torus> struct int_grouped_oprf_memory {
uint64_t degree = 0;
uint32_t lut_index = random_bit - 1;
generate_device_accumulator_no_encoding<Torus>(
streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
degree, params.message_modulus, params.carry_modulus,
params.glwe_dimension, params.polynomial_size, lut_f,
allocate_gpu_memory);
lut_funcs.push_back(lut_f);
lut_indices.push_back(lut_index);
// In OPRF the degree is hard set to p - 1 instead of the LUT degree
degree = p - 1;
*luts->get_degree(lut_index) = degree;
lut_degrees.push_back(degree);
}
// For each block, this loop determines the exact number of bits to generate
@@ -102,10 +104,6 @@ template <typename Torus> struct int_grouped_oprf_memory {
Torus plaintext_to_add = (p - 1) * delta / 2;
h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
if (bits_for_this_block < 1) {
PANIC("bits_for_this_block should be greater than 1");
}
this->h_lut_indexes[i] = bits_for_this_block - 1;
bits_processed += bits_for_this_block;
}
@@ -122,13 +120,35 @@ template <typename Torus> struct int_grouped_oprf_memory {
// Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
// GPUs.
cuda_memcpy_with_size_tracking_async_to_gpu(
luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks_to_process * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
luts->broadcast_lut(active_streams);
// No encoding for these LUTS. Generate LUT also sets LUT degrees to default
// values
// Host-side generator for the per-block LUT indexes of the grouped OPRF.
// Walks the blocks in order; each block takes
// min(message_bits_per_block, bits still to be produced) random bits, and a
// block producing `b` bits selects LUT index `b - 1`.
// Panics if the random bits are exhausted before all `num_blocks` blocks have
// been assigned, or if a block would be assigned zero bits.
auto luts_index_generator = [total_random_bits, message_bits_per_block](
                                Torus *h_lut_indexes, uint32_t num_blocks) {
  uint64_t bits_processed = 0;
  for (uint32_t i = 0; i < num_blocks; ++i) {
    if (total_random_bits <= bits_processed) {
      PANIC("total_random_bits should be greater than bits_processed");
    }
    const uint64_t bits_remaining = total_random_bits - bits_processed;
    const uint32_t bits_for_this_block = static_cast<uint32_t>(std::min(
        static_cast<uint64_t>(message_bits_per_block), bits_remaining));
    // A zero here would make `bits_for_this_block - 1` wrap around below, so
    // the valid range is >= 1 (the message used to say "greater than 1").
    if (bits_for_this_block < 1) {
      PANIC("bits_for_this_block should be at least 1");
    }
    h_lut_indexes[i] = bits_for_this_block - 1;
    bits_processed += bits_for_this_block;
  }
};
luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
luts_index_generator, false, {},
this->h_lut_indexes);
// OPRF requires custom LUT degrees
for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
*luts->get_degree(i) = lut_degrees[i];
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_corrections);
@@ -170,8 +190,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_random_input_blocks =
(num_input_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
CEIL_DIV(num_input_random_bits, message_bits_per_block);
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
streams, params, this->num_random_input_blocks, message_bits_per_block,

View File

@@ -85,15 +85,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -172,16 +168,10 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
@@ -271,16 +261,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return shifted | padding;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_last_block_lut_univariate->get_lut(0, 0),
shift_last_block_lut_univariate->get_degree(0),
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
shift_last_block_lut_univariate->generate_and_broadcast_lut(
active_streams_shift_last, {0}, {last_block_lut_f},
LUT_0_FOR_ALL_BLOCKS);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -298,15 +283,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return (params.message_modulus - 1) * x_sign_bit;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
padding_block_lut_univariate->get_lut(0, 0),
padding_block_lut_univariate->get_degree(0),
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
padding_block_lut_univariate->broadcast_lut(active_streams);
padding_block_lut_univariate->generate_and_broadcast_lut(
active_streams, {0}, {padding_block_lut_f}, LUT_0_FOR_ALL_BLOCKS);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -339,16 +317,11 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return message_of_current_block + carry_of_previous_block;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_blocks_lut_bivariate->get_lut(0, 0),
shift_blocks_lut_bivariate->get_degree(0),
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams_shift_blocks, {0}, {blocks_lut_f},
LUT_0_FOR_ALL_BLOCKS);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}

View File

@@ -113,27 +113,20 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
else
return current_bit;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->broadcast_lut(active_gpu_count_mux);
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
LUT_0_FOR_ALL_BLOCKS);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
cleaning_lut->generate_and_broadcast_lut(
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {

View File

@@ -74,45 +74,27 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_carry, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
gpu_memory_allocated);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_borrow_propagation_sum->get_lut(0, 0),
luts_borrow_propagation_sum->get_degree(0),
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_borrow_propagation_sum, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
message_acc->get_degree(0), message_acc->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_luts_borrow_propagation_sum},
LUT_0_FOR_ALL_BLOCKS);
// Host-side generator for the LUT index table of `luts_array`: entry 0
// selects LUT 0 and every following entry selects LUT 1.
auto luts_array_index_generator = [](Torus *h_lut_indexes,
                                     uint32_t num_indexes) {
  if (num_indexes == 0) {
    return;
  }
  h_lut_indexes[0] = 0;
  for (uint32_t block = 1; block < num_indexes; ++block) {
    h_lut_indexes[block] = 1;
  }
};
luts_array->generate_and_broadcast_lut(
active_streams, {0, 1},
{f_lut_does_block_generate_carry,
f_lut_does_block_generate_or_propagate},
luts_array_index_generator);
// generate luts (aka accumulators)
message_acc->generate_and_broadcast_lut(
active_streams, {0}, {f_message_acc}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {

View File

@@ -7,7 +7,8 @@
#include <functional>
#include <vector>
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
// If we use more than 5 streams the result is incorrect
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;
template <typename Torus> struct int_equality_selectors_buffer {
int_radix_params params;
@@ -60,18 +61,10 @@ template <typename Torus> struct int_equality_selectors_buffer {
fns.push_back([i](Torus x) -> Torus { return (x == i); });
}
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->comparison_luts->get_lut(0, 0),
this->comparison_luts->get_degree(0),
this->comparison_luts->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
fns, allocate_gpu_memory);
this->comparison_luts->generate_and_broadcast_many_lut(
active_streams, {0}, {fns}, LUT_0_FOR_ALL_BLOCKS);
fns.clear();
this->comparison_luts->broadcast_lut(active_streams);
this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
@@ -175,8 +168,7 @@ template <typename Torus> struct int_possible_results_buffer {
this->lut_stride =
(ciphertext_modulus / this->max_luts_per_call) * box_size;
this->num_lut_accumulators =
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);
stream_luts =
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
@@ -202,15 +194,10 @@ template <typename Torus> struct int_possible_results_buffer {
fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
}
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
current_lut->get_degree(0), current_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->generate_and_broadcast_many_lut(
streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
LUT_0_FOR_ALL_BLOCKS);
current_lut->broadcast_lut(
streams.active_gpu_subset(1, params.pbs_type));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -298,14 +285,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
LUT_0_FOR_ALL_BLOCKS);
lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->stream_identity_luts[i] = lut;
}
@@ -318,27 +301,17 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->message_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->message_extract_lut->get_lut(0, 0),
this->message_extract_lut->get_degree(0),
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->message_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
LUT_0_FOR_ALL_BLOCKS);
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->carry_extract_lut->get_lut(0, 0),
this->carry_extract_lut->get_degree(0),
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->carry_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
LUT_0_FOR_ALL_BLOCKS);
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -1185,15 +1158,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1203,14 +1170,9 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {
@@ -1376,15 +1338,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1394,14 +1350,9 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
}
void release(CudaStreams streams) {

View File

@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
int8_t **fp_ks_buffer,
bool gpu_memory_allocated);
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count);
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count);
}
#endif // CNCRT_KS_H_

View File

@@ -0,0 +1,24 @@
#ifndef KREYVIUM_H
#define KREYVIUM_H
#include "../integer/integer.h"
// C-linkage entry points for the 64-bit Kreyvium keystream generator.
// NOTE(review): only the declarations are visible in this header; the
// descriptions below are inferred from the signatures and must be confirmed
// against the definitions.
extern "C" {
// Setup entry point. Receives the stream set, the cryptographic parameters
// (GLWE/LWE dimensions, keyswitch and PBS level/base-log, grouping factor,
// message/carry moduli, PBS type, noise reduction type) and the number of
// inputs to process in parallel.
// Presumably allocates the working buffer, stores it in *mem_ptr and returns
// the tracked size in bytes — TODO confirm.
// Presumably allocate_gpu_memory == false only computes the size — TODO
// confirm.
uint64_t scratch_cuda_kreyvium_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
// Keystream generation entry point. `key` and `iv` are read-only inputs and
// `keystream_output` is the only non-const ciphertext argument.
// `mem_ptr` is presumably the buffer produced by scratch_cuda_kreyvium_64,
// and `bsks` / `ksks` presumably hold the bootstrapping and keyswitching
// keys — TODO confirm. The meaning of `num_steps` (likely the number of
// keystream steps to produce) should be verified in the implementation.
void cuda_kreyvium_generate_keystream_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
// Teardown entry point. Presumably releases the buffer referenced by
// *mem_ptr_void that was created by scratch_cuda_kreyvium_64 — TODO confirm.
void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}
#endif

View File

@@ -0,0 +1,320 @@
#ifndef KREYVIUM_UTILITIES_H
#define KREYVIUM_UTILITIES_H
#include "../integer/integer_utilities.h"
// Kreyvium specific constants
// The batch size is set to 64 to allow efficient parallel processing of 64
// steps at once.
constexpr uint32_t KREYVIUM_BATCH_SIZE = 64;
// In each Kreyvium step, there are exactly 3 non-linear AND operations:
// 1. (c109 & c108)
// 2. (a91 & a90)
// 3. (b82 & b81)
constexpr uint32_t KREYVIUM_NUM_AND_GATES = 3;
// In each Kreyvium step, there are 4 paths that require a "flush"
// to noise-cancel and extract the bit:
// 1. New bit for Register A
// 2. New bit for Register B
// 3. New bit for Register C
// 4. The Output Keystream bit
constexpr uint32_t KREYVIUM_NUM_FLUSH_PATHS = 4;
/// Struct to hold the LUTs.
template <typename Torus> struct int_kreyvium_lut_buffers {
// Bivariate AND Gate LUT:
// AND operation: f(a, b) = (a & 1) & (b & 1).
// This is a Bivariate PBS used for the non-linear parts of Kreyvium.
int_radix_lut<Torus> *and_lut;
// Univariate Flush/Identity LUT:
// MESSAGE EXTRACTION operation: f(x) = x & 1.
// This is a Univariate PBS used to "flush" the state (reset noise/carries).
int_radix_lut<Torus> *flush_lut;
int_kreyvium_lut_buffers(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_inputs,
uint64_t &size_tracker) {
uint32_t and_ops =
num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_AND_GATES;
uint32_t flush_ops =
num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_FLUSH_PATHS;
this->and_lut = new int_radix_lut<Torus>(streams, params, 1, and_ops,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and =
streams.active_gpu_subset(and_ops, params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
streams, params, 1, flush_ops, allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush =
streams.active_gpu_subset(flush_ops, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}
void release(CudaStreams streams) {
this->and_lut->release(streams);
delete this->and_lut;
this->and_lut = nullptr;
this->flush_lut->release(streams);
delete this->flush_lut;
this->flush_lut = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
/// Struct to hold the Kreyvium internal state and temporary workspaces.
template <typename Torus> struct int_kreyvium_state_workspaces {
CudaRadixCiphertextFFI *a_reg;
CudaRadixCiphertextFFI *b_reg;
CudaRadixCiphertextFFI *c_reg;
CudaRadixCiphertextFFI *k_reg;
CudaRadixCiphertextFFI *iv_reg;
// Shift Workspace
CudaRadixCiphertextFFI *shift_workspace;
// Temporary Update Buffers
CudaRadixCiphertextFFI *temp_a;
CudaRadixCiphertextFFI *temp_b;
CudaRadixCiphertextFFI *temp_c;
CudaRadixCiphertextFFI *packed_and_lhs;
CudaRadixCiphertextFFI *packed_and_rhs;
CudaRadixCiphertextFFI *packed_and_out;
// Flush/Cleanup Packing Buffers
CudaRadixCiphertextFFI *packed_flush_in;
CudaRadixCiphertextFFI *packed_flush_out;
uint32_t max_batch_blocks;
uint32_t k_offset;
uint32_t iv_offset;
int_kreyvium_state_workspaces(CudaStreams streams,
const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_inputs,
uint64_t &size_tracker) {
uint32_t batch_blocks = KREYVIUM_BATCH_SIZE * num_inputs;
this->max_batch_blocks = batch_blocks;
this->k_offset = 0;
this->iv_offset = 0;
this->a_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->b_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->c_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->k_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->k_reg, 128 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->iv_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->iv_reg, 128 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->shift_workspace = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->shift_workspace,
128 * num_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->temp_a = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_a, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->temp_b = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_b, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->temp_c = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_c, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->packed_and_lhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_lhs,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_and_rhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_rhs,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_and_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_out,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_flush_in = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_flush_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->a_reg, allocate_gpu_memory);
delete this->a_reg;
this->a_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->b_reg, allocate_gpu_memory);
delete this->b_reg;
this->b_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->c_reg, allocate_gpu_memory);
delete this->c_reg;
this->c_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->k_reg, allocate_gpu_memory);
delete this->k_reg;
this->k_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->iv_reg, allocate_gpu_memory);
delete this->iv_reg;
this->iv_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->shift_workspace, allocate_gpu_memory);
delete this->shift_workspace;
this->shift_workspace = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_a, allocate_gpu_memory);
delete this->temp_a;
this->temp_a = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_b, allocate_gpu_memory);
delete this->temp_b;
this->temp_b = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_c, allocate_gpu_memory);
delete this->temp_c;
this->temp_c = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_lhs, allocate_gpu_memory);
delete this->packed_and_lhs;
this->packed_and_lhs = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_rhs, allocate_gpu_memory);
delete this->packed_and_rhs;
this->packed_and_rhs = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_out, allocate_gpu_memory);
delete this->packed_and_out;
this->packed_and_out = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_in, allocate_gpu_memory);
delete this->packed_flush_in;
this->packed_flush_in = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_out, allocate_gpu_memory);
delete this->packed_flush_out;
this->packed_flush_out = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
/// Top-level Kreyvium buffer: keeps the parameters it was built with and owns
/// the LUT container and the state/workspace container.
template <typename Torus> struct int_kreyvium_buffer {
  int_radix_params params;
  bool allocate_gpu_memory;
  uint32_t num_inputs;
  int_kreyvium_lut_buffers<Torus> *luts;
  int_kreyvium_state_workspaces<Torus> *state;

  /// Stores the configuration, then builds the LUTs followed by the state
  /// workspaces. Both report their allocations through `size_tracker`.
  int_kreyvium_buffer(CudaStreams streams, const int_radix_params &params,
                      bool allocate_gpu_memory, uint32_t num_inputs,
                      uint64_t &size_tracker)
      : params(params), allocate_gpu_memory(allocate_gpu_memory),
        num_inputs(num_inputs),
        luts(new int_kreyvium_lut_buffers<Torus>(
            streams, params, allocate_gpu_memory, num_inputs, size_tracker)),
        state(new int_kreyvium_state_workspaces<Torus>(
            streams, params, allocate_gpu_memory, num_inputs, size_tracker)) {}

  /// Releases the LUTs, then the state workspaces, and finally synchronizes
  /// stream 0.
  void release(CudaStreams streams) {
    this->luts->release(streams);
    delete this->luts;
    this->luts = nullptr;

    this->state->release(streams, this->allocate_gpu_memory);
    delete this->state;
    this->state = nullptr;

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};
#endif

View File

@@ -429,11 +429,9 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
bool has_support_to_cuda_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory, uint32_t base_log);
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(

View File

@@ -13,12 +13,12 @@ void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
uint32_t polynomial_size, uint32_t base_log);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
uint32_t polynomial_size, uint32_t base_log);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
@@ -61,13 +61,13 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);
uint64_t scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,

View File

@@ -30,15 +30,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and =
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -50,15 +45,10 @@ template <typename Torus> struct int_trivium_lut_buffers {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush =
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
const bool *is_boolean_array, const uint32_t is_boolean_array_len,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,

View File

@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
const bool *is_boolean_array,
const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
bool allocate_gpu_memory, uint64_t &size_tracker)
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
@@ -174,40 +175,6 @@ template <typename Torus> struct zk_expand_mem {
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 0),
message_and_carry_extract_luts->get_degree(0),
message_and_carry_extract_luts->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 1),
message_and_carry_extract_luts->get_degree(1),
message_and_carry_extract_luts->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 2),
message_and_carry_extract_luts->get_degree(2),
message_and_carry_extract_luts->get_max_degree(2),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 3),
message_and_carry_extract_luts->get_degree(3),
message_and_carry_extract_luts->get_max_degree(3),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
// We are always packing two LWEs. We just need to be sure we have enough
// space in the carry part to store a message of the same size as is in the
// message part.
@@ -270,29 +237,65 @@ template <typename Torus> struct zk_expand_mem {
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
lwe_index, num_packed_msgs * num_lwes);
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
// If the input relates to a boolean, shift the LUT so the correct one
// with sanitization is used
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
// is_boolean_array tells us which input is a boolean and thus the
// related output needs boolean sanitization. It naturally has
// total_blocks entries, but h_indexes_out reaches
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
// the ceiling causes out-of-bounds access. Reading garbage "true" would
// set h_lut_indexes to an invalid index pointing to uninitialized
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
// to match.
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
"Cuda error: index %d for is_boolean_array is out of "
"bounds (len is %d)",
h_indexes_out[lwe_index], is_boolean_array_len);
}
offset += num_lwes_in_kth;
}
message_and_carry_extract_luts->set_lwe_indexes(
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
message_and_carry_extract_luts->broadcast_lut(active_streams);
// Index generator for message/carry extraction LUTs
auto index_gen = [num_compact_lists,
num_lwes_per_compact_list =
this->num_lwes_per_compact_list,
num_packed_msgs, is_boolean_array,
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth;
}
};
message_and_carry_extract_luts->generate_and_broadcast_lut(
active_streams, {0, 1, 2, 3},
{message_extract_lut_f, carry_extract_lut_f,
message_extract_and_sanitize_bool_lut_f,
carry_extract_and_sanitize_bool_lut_f},
index_gen, true, {}, h_lut_indexes);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);

View File

@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
base_log, level_count, num_lwes);
}
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count) {
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count) {
host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<const uint64_t *>(input),
static_cast<uint64_t *>(output), base_log,

View File

@@ -10,7 +10,6 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
Torus state =
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
#pragma unroll 1
for (int j = 0; j < level_count; j++) {
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
lwe_acc_out[shmem_index] = local_lwe_out;
}
if (tid <= lwe_dimension_out) {
for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
offset /= 2) {
__syncthreads();
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
__syncthreads();
if (tid <= lwe_dimension_out && threadIdx.y < offset) {
lwe_acc_out[shmem_index] +=
lwe_acc_out[shmem_index + offset * blockDim.x];
}
if (threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
if (tid <= lwe_dimension_out && threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
template <typename Torus, typename KSTorus>

View File

@@ -12,12 +12,9 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
// Finish the keyswitching operation and prepare GLWEs for accumulation.
// 1. Finish the keyswitching computation partially performed with a GEMM:
// - negate the dot product between the GLWE and KSK polynomial

View File

@@ -6,7 +6,7 @@
#include "helper_multi_gpu.h"
#include "polynomial/parameters.cuh"
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <limits>
template <typename T>

View File

@@ -1,4 +1,5 @@
#include "device.h"
#include "utils/helper.cuh"
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
@@ -6,6 +7,27 @@
#include <cuda_profiler_api.h>
#endif
/// Checks that `ptr` is non-null, refers to device memory and is reported by
/// the CUDA runtime as belonging to device `gpu_index`; panics otherwise.
void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));

  // Both conditions must hold for the pointer to be accepted.
  const bool on_expected_gpu = static_cast<uint32_t>(attr.device) == gpu_index;
  const bool is_device_memory = attr.type == cudaMemoryTypeDevice;
  if (!(on_expected_gpu && is_device_memory)) {
    PANIC("Cuda error: invalid device pointer.")
  }
}
/// Checks that `ptr` is non-null and refers to device memory; panics
/// otherwise. Returns the device index reported by the CUDA runtime for it.
int validate_device_ptr(const void *ptr) {
  GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");

  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));

  const bool is_device_memory = attr.type == cudaMemoryTypeDevice;
  if (!is_device_memory) {
    PANIC("Cuda error: invalid device pointer.")
  }

  const int owning_device = attr.device;
  return owning_device;
}
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
@@ -247,13 +269,12 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated) {
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
if (size == 0 || !gpu_memory_allocated)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}
validate_device_ptr_and_gpu_index(dest, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(
@@ -280,28 +301,16 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
uint32_t gpu_index, bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
GPU_ASSERT(dest != nullptr,
"Cuda error: trying to copy gpu->gpu to null ptr");
GPU_ASSERT(src != nullptr,
"Cuda error: trying to copy gpu->gpu from null ptr");
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
PANIC_IF_FALSE(
attr_dest.type == cudaMemoryTypeDevice,
"Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
PANIC_IF_FALSE(
attr_src.type == cudaMemoryTypeDevice,
"Cuda error: invalid src device pointer in copy from GPU to GPU.");
int src_gpu_index = validate_device_ptr(src);
int dest_gpu_index = validate_device_ptr(dest);
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
if (src_gpu_index == dest_gpu_index) {
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
} else {
check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
attr_src.device, size, stream));
check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
src_gpu_index, size, stream));
}
}
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -349,11 +358,7 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
validate_device_ptr_and_gpu_index(dest, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
}
@@ -383,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
}
cuda_set_device(gpu_index);
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
int num_blocks = CEIL_DIV(n, block_size);
// Launch the kernel
cuda_set_value_kernel<Torus>
@@ -406,13 +411,10 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
/// so it should be avoided at all costs
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}
validate_device_ptr_and_gpu_index(src, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(

View File

@@ -188,7 +188,7 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
}
Index twiddle_shift = 1;
for (Index l = LOG2_DEGREE - 1; l >= 5; --l) {
for (Index l = LOG2_DEGREE - 1; l > 5; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
@@ -221,8 +221,8 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
tid = tid + STRIDE;
}
}
for (Index l = 4; l >= 1; --l) {
__syncthreads();
for (Index l = 5; l >= 1; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
@@ -425,7 +425,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
}
Index twiddle_shift = DEGREE;
for (Index l = 1; l <= 4; ++l) {
for (Index l = 1; l <= 5; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
@@ -459,7 +459,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
}
}
for (Index l = 5; l <= LOG2_DEGREE - 1; ++l) {
for (Index l = 6; l <= LOG2_DEGREE - 1; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
@@ -467,7 +467,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
// at this point registers are ready for the butterfly
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
@@ -495,6 +495,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
tid = tid + STRIDE;
}
__syncthreads();
}
// last iteration
@@ -540,6 +541,44 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
}
}
/*
 * Batched forward FFT kernel, "classical specialized" variant.
 *
 * The transform is performed at half size: each block reads and writes
 * params::degree / 2 double2 values located at offset
 * blockIdx.x * (params::degree / 2) in d_input / d_output.
 * The kernel must be instantiated with the actual degree, and the input is
 * expected to be already compressed.
 *
 * Dynamic shared memory layout: the first params::degree / 2 entries are
 * handed to the FFT routine, and the following params::degree / 2 entries
 * receive a copy of negtwiddles.
 *
 * `buffer` and the SMD template parameter are not referenced in this body.
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_classical_specialized(double2 *d_input,
                                                   double2 *d_output,
                                                   double2 *buffer) {
  extern __shared__ double2 sharedMemoryFFT[];

  const auto half_degree = params::degree / 2;
  // Distance between two consecutive elements handled by the same thread.
  const auto thread_stride = params::degree / params::opt;
  // Offset of this block's polynomial in the batched arrays.
  const auto block_base = blockIdx.x * half_degree;

  // For specialized we will always have enough shared memory
  double2 *fft = sharedMemoryFFT;
  double2 *shared_twiddles = fft + half_degree;

  double2 fft_regs[params::opt / 2];

  // Stage the twiddles in shared memory and this thread's inputs in
  // registers.
  int idx = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    shared_twiddles[idx] = negtwiddles[idx];
    fft_regs[i] = d_input[block_base + idx];
    idx = idx + thread_stride;
  }
  // All twiddles must be visible before the transform reads them.
  __syncthreads();

  NSMFFT_direct_2_2_params<HalfDegree<params>>(fft, fft_regs, shared_twiddles);
  __syncthreads();

  // Write the transformed registers back, using the same indexing as the
  // load.
  idx = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    d_output[block_base + idx] = fft_regs[i];
    idx = idx + thread_stride;
  }
}
/*
* global batch polynomial multiplication
* only used for fft tests

View File

@@ -68,9 +68,15 @@ struct alignas(16) f128 {
auto t = two_sum(a.lo, b.lo);
double hi = s.hi;
#ifdef __CUDA_ARCH__
double lo = __dadd_rn(s.lo, t.hi);
hi = __dadd_rn(hi, lo);
lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
#else
double lo = s.lo + t.hi;
hi = hi + lo;
lo = lo - (hi - s.hi);
#endif
return f128(hi, lo + t.lo);
}
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
// Subtraction of two f128 values (hi/lo pairs of doubles): returns a - b.
// The hi and lo components are subtracted separately with two_diff and the
// partial results are then folded together with quick_two_sum.
// NOTE(review): two_diff / quick_two_sum are presumably the usual error-free
// transformations of double-double arithmetic - confirm against their
// definitions.
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
auto s = two_diff(a.hi, b.hi);
auto t = two_diff(a.lo, b.lo);
#ifdef __CUDA_ARCH__
// Device pass: the additions go through the explicit round-to-nearest
// intrinsic, which the CUDA math API documents as never being merged into a
// fused multiply-add, so the operation order below is preserved as written.
s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
#else
// Host pass: same sequence with plain double additions.
s = quick_two_sum(s.hi, s.lo + t.hi);
return quick_two_sum(s.hi, s.lo + t.lo);
#endif
}
// Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
// Subtraction
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
const f128x2 &b) {
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
return f128x2(f128::sub_estimate(a.re, b.re),
f128::sub_estimate(a.im, b.im));
}
// Multiplication (complex multiplication)
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
const f128x2 &b) {
const f128 a_im_b_im = f128::mul(a.im, b.im);
f128 real_part =
f128::add(f128::mul(a.re, b.re),
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
return f128x2(real_part, imag_part);
}
@@ -243,8 +254,8 @@ struct f128x2 {
// Subtraction-assignment operator
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
re = f128::sub_estimate(re, other.re);
im = f128::sub_estimate(im, other.im);
return *this;
}
@@ -261,12 +272,20 @@ struct f128x2 {
};
// Returns the raw 64-bit representation of the double `d` (bit-for-bit
// reinterpretation, no numeric conversion).
__host__ __device__ inline uint64_t double_to_bits(double d) {
#ifdef __CUDA_ARCH__
  // Device pass: dedicated reinterpretation intrinsic.
  uint64_t bits = __double_as_longlong(d);
#else
  // Host pass: copy the bytes instead of dereferencing a type-punned pointer.
  // Reading a double through a uint64_t* breaks the strict-aliasing rule and
  // is undefined behavior. The builtin needs no extra header, and this file
  // already relies on GCC/Clang extensions (__uint128_t).
  static_assert(sizeof(uint64_t) == sizeof(double),
                "double must be 64 bits wide");
  uint64_t bits;
  __builtin_memcpy(&bits, &d, sizeof(bits));
#endif
  return bits;
}
// Returns the double whose raw 64-bit representation is `bits` (bit-for-bit
// reinterpretation, no numeric conversion).
__host__ __device__ inline double bits_to_double(uint64_t bits) {
#ifdef __CUDA_ARCH__
  // Device pass: dedicated reinterpretation intrinsic.
  double d = __longlong_as_double(bits);
#else
  // Host pass: copy the bytes instead of dereferencing a type-punned pointer.
  // Reading a uint64_t through a double* breaks the strict-aliasing rule and
  // is undefined behavior. The builtin needs no extra header, and this file
  // already relies on GCC/Clang extensions (__uint128_t).
  static_assert(sizeof(double) == sizeof(uint64_t),
                "double must be 64 bits wide");
  double d;
  __builtin_memcpy(&d, &bits, sizeof(d));
#endif
  return d;
}
@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
const double A = ONE << 52;
const double B = ONE << 104;
const double C = ONE << 76;
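// NOTE: D is 2^128, written as a decimal literal that rounds to exactly
// 2^128. __longlong_as_double(0x37f0000000000000ULL) does not work here:
// that bit pattern encodes 2^-128 (2^128 is 0x47f0000000000000ULL), and
// __longlong_as_double is a device intrinsic while this function is also
// compiled for the host (TODO confirm this was the original failure).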
const double D = 340282366920938500000000000000000000000.;
const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = A_bits | lower64;
double l_temp = bits_to_double(bits_l);
double l = l_temp - A;
uint64_t B_bits = double_to_bits(B);
uint64_t top64 = static_cast<uint64_t>(x >> 52);
uint64_t bits_h = B_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
#else
double l = l_temp - A;
double h = h_temp - B;
return (l + h);
#endif
} else {
uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = C_bits | lower64 | mask_part;
double l_temp = bits_to_double(bits_l);
double l = l_temp - C;
uint64_t D_bits = double_to_bits(D);
uint64_t top64 = static_cast<uint64_t>(x >> 76);
uint64_t bits_h = D_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
#else
double l = l_temp - C;
double h = h_temp - D;
return (l + h);
#endif
}
}
@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
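// NOTE: normalization is 2^128, written as a decimal literal that rounds to
// exactly 2^128. __longlong_as_double(0x37f0000000000000ULL) does not work
// here: that bit pattern encodes 2^-128 (2^128 is 0x47f0000000000000ULL),
// and __longlong_as_double is a device intrinsic while this function is also
// compiled for the host (TODO confirm this was the original failure).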
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__
x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
x.lo *= normalization;
#endif
// TODO has to be round
x = f128::add_estimate(x, f128(0.5, 0.0));
x = f128::f128_floor(x);
__uint128_t x0 = f64_to_u128(x.hi);

View File

@@ -12,8 +12,9 @@
using Index = unsigned;
#define NEG_TWID(i) \
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
f128x2( \
f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])), \
f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))
#define F64x4_TO_F128x2(f128x2_reg, ind) \
f128x2_reg.re.hi = dt_re_hi[ind]; \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
tid = tid + STRIDE;
}
__syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
w = NEG_TWID(tid / lane_mask + twiddle_shift);
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
// keep one of the register for next iteration and store another one in sm
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
tid = tid + STRIDE;
}
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
tid = tid + STRIDE;
}
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re, const __uint128_t *in_im) {
const double normalization = pow(2., -128.);
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {

View File

@@ -12,7 +12,7 @@
#include "integer/subtraction.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -134,28 +134,26 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0),
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f, true,
are_all_block_true_buffer->preallocated_h_lut);
Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
for (int index = 0; index < num_chunks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else {
h_lut_indexes[index] = 0;
}
}
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto num_blocks = is_max_value_lut->num_blocks;
auto active_streams =
streams.active_gpu_subset(num_chunks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_streams);
// Index generator: last chunk uses LUT 1, others use LUT 0
auto index_gen = [num_chunks, num_blocks](Torus *h_lut_indexes,
uint32_t) {
for (uint32_t index = 0; index < num_blocks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else if (index < num_chunks - 1 || index >= num_chunks) {
h_lut_indexes[index] = 0;
}
}
};
is_max_value_lut->generate_and_broadcast_lut(
active_streams, {1}, {is_equal_to_num_blocks_lut_f}, index_gen,
true, {are_all_block_true_buffer->preallocated_h_lut});
}
lut = is_max_value_lut;
}
@@ -167,15 +165,10 @@ __host__ void are_all_comparisons_block_true(
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
is_max_value_lut->num_blocks * sizeof(Torus));
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_gpu_count_is_max = streams.active_gpu_subset(
is_max_value_lut->num_blocks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
is_max_value_lut->set_lut_indexes_and_broadcast_constant(
active_gpu_count_is_max, 0);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
@@ -222,7 +215,7 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -483,14 +476,11 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
y = x;
f = sign_handler_f;
}
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true,
tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
last_lut->broadcast_lut(active_streams);
last_lut->generate_and_broadcast_lut(active_streams, {0}, {f},
LUT_0_FOR_ALL_BLOCKS, true,
{tree_buffer->preallocated_h_lut});
// Last leaf
integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,

View File

@@ -10,7 +10,122 @@
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
/*
* =============================================================================
* GPU Compression/Decompression Algorithm: Overview
* =============================================================================
*
* The compression algorithm transforms standard LWE ciphertexts into a compact
* packed format. Decompression reverses this process.
*
* -----------------------------------------------------------------------------
* COMPRESSION INPUT (lwe_array_in)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | lwe_array_in (GPU memory) |
* +-------------------------------------------------------------------------+
* +---------------------------+---------------------------+-----------------+
* | LWE 0 | LWE 1 | ... |
* | [mask, body] | [mask, body] | |
* +---------------------------+---------------------------+-----------------+
* |<-- lwe_dimension + 1 -->|
*
* Total LWEs: total_lwe_bodies_count (num_radix_blocks)
*
* -----------------------------------------------------------------------------
* COMPRESSION PROCESS
* -----------------------------------------------------------------------------
*
* 1. Message Shift (64-bit only):
* Each LWE is multiplied by message_modulus to shift the message to MSB
*
* 2. Packing Keyswitch (LWE -> GLWE):
* Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
*
* +--------------------------------------------------------------+
* | lwe_per_glwe LWEs (input batch) |
* | LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1] |
* +--------------------------------------------------------------+
* |
* Packing Keyswitch
* v
* +--------------------------------------------------------------+
* | Single GLWE Ciphertext |
* | [A_0, A_1, ..., A_{k-1}, B] |
* | |<-- k * polynomial_size -->| |<-- polynomial_size -->| |
* +--------------------------------------------------------------+
*
* Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
* lwe_per_glwe)
*
* 3. Modulus Switch:
* Reduce precision from 64-bit torus to storage_log_modulus bits
*
* 4. Bit Packing:
* Pack multiple reduced-precision elements into dense bit representation
*
* -----------------------------------------------------------------------------
* COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | tmp_glwe_array_out (intermediate buffer) |
* +-------------------------------------------------------------------------+
* +----------------------------+----------------------------+---------------+
* | GLWE 0 | GLWE 1 | ... |
* | [A_0..A_{k-1}, B_0..B_N] | [A_0..A_{k-1}, B_0..B_N] | |
* +----------------------------+----------------------------+---------------+
* |<-- glwe_accumulator_size = (k+1)*N -->|
*
* Total size needed: num_glwes * glwe_accumulator_size elements
* Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
*
* -----------------------------------------------------------------------------
* PACKED OUTPUT (glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Packed GLWE Ciphertext List (bit-packed) |
* +-------------------------------------------------------------------------+
* +-------------------------------------------------------------------------+
* | Elements packed with storage_log_modulus bits per original element |
* | Total packed size: ceil(in_len * storage_log_modulus / 64) elements |
* +-------------------------------------------------------------------------+
*
* =============================================================================
* DECOMPRESSION (Extract) Algorithm
* =============================================================================
*
* Decompression receives an array of LWE indexes. For each index, it identifies
* the corresponding GLWE, extracts that GLWE from the packed representation,
* and then sample-extracts the requested LWE from the GLWE.
*
* -----------------------------------------------------------------------------
* EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Extracted GLWE Ciphertext |
* +-------------------------------------------------------------------------+
* +---------------------------------------+-----------------+---------------+
* | Mask (A polynomials) | Body (B) | Tail |
* | [A_0, ..., A_{k-1}] | (body_count) | (zeroed) |
* | k * polynomial_size elements | elements | elements |
* +---------------------------------------+-----------------+---------------+
* |<------------------- initial_out_len ------------------->| |
* |<------------------------ glwe_ciphertext_size ------------------------->|
*
* For the last GLWE, body_count may be less than polynomial_size (partial).
* The tail region must be zeroed to ensure defined behavior.
*
* tail_size = glwe_ciphertext_size - initial_out_len
* tail_offset = initial_out_len (NOT 0!)
*
* =============================================================================
*/
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto nbits = sizeof(Torus) * 8;
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
auto out_len = CEIL_DIV(number_bits_to_pack, nbits);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
glwe_array_out->lwe_per_glwe - 1) /
glwe_array_out->lwe_per_glwe;
PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
"Invalid number of GLWEs");
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
uint32_t num_glwes =
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);
// Compressed length of the compressed GLWE we want to extract
uint32_t body_count = 0;
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// Calculates how many bits a full-packed GLWE shall use
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Calculate how many bits a full-packed GLWE uses, to determine
// the stride between consecutive packed GLWEs in the input buffer
auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = CEIL_DIV(number_bits_to_unpack, nbits);
// Uses that length to set the input pointer
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
// The extract kernel writes initial_out_len elements starting at offset 0.
// We must zero the tail region (from initial_out_len to
// glwe_ciphertext_size)
if (initial_out_len < glwe_ciphertext_size) {
cuda_memset_async(glwe_array_out, 0,
cuda_memset_async(glwe_array_out + initial_out_len, 0,
(glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
stream, gpu_index);
}

View File

@@ -14,7 +14,6 @@
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <functional>
@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
}
}
if ((blockIdx.x / group_size + 1) <
(blocks_count + group_size - 1) / group_size) {
if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
num_radix_blocks > src1->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks should have more "
"blocks than the number used in sum in groups")
auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
if (src2->num_radix_blocks < num_groups)
PANIC("Cuda error: second input in sum in groups should have at least "
"num_groups blocks")
@@ -544,6 +542,24 @@ __host__ void integer_radix_apply_univariate_lookup_table(
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
// Verify consistency between set_lut_indexes and apply_lookup_table
GPU_ASSERT(
num_radix_blocks <= lut->last_broadcast_num_radix_blocks,
"num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
num_radix_blocks, lut->last_broadcast_num_radix_blocks);
GPU_ASSERT(active_streams.count() <= lut->last_broadcast_streams.count(),
"active_streams count (%u) must match last_broadcast_streams "
"count (%u)",
active_streams.count(), lut->last_broadcast_streams.count());
for (uint32_t i = 0; i < active_streams.count(); i++) {
GPU_ASSERT(active_streams.gpu_index(i) ==
lut->last_broadcast_streams.gpu_index(i),
"active_streams gpu_index(%u) = %u must match "
"last_broadcast_streams gpu_index(%u) = %u",
i, active_streams.gpu_index(i), i,
lut->last_broadcast_streams.gpu_index(i));
}
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -962,8 +978,9 @@ uint64_t generate_many_lookup_table(
template <typename Torus>
void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
uint32_t polynomial_size,
std::function<Torus(uint32_t)> f) {
std::function<Torus(Torus)> f) {
// accumulator number of elements is (glwe_dimension + 1) * polynomial_size
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
auto body = &acc[glwe_dimension * polynomial_size];
@@ -975,9 +992,9 @@ void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
template <typename Torus>
void generate_device_accumulator_no_encoding(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t &degree,
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
uint32_t polynomial_size, std::function<Torus(Torus)> f,
bool gpu_memory_allocated) {
Torus *h_lut =
@@ -986,7 +1003,7 @@ void generate_device_accumulator_no_encoding(
generate_lookup_table_no_encoding<Torus>(h_lut, glwe_dimension,
polynomial_size, f);
degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
*degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -1738,12 +1755,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
signs_array_in, 0, num_sign_blocks);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
reduce_two_orderings_function, true, diff_buffer->preallocated_h_lut1);
lut->broadcast_lut(lut->active_streams);
lut->generate_and_broadcast_lut(
lut->active_streams, {0}, {reduce_two_orderings_function},
LUT_0_FOR_ALL_BLOCKS, true, {diff_buffer->preallocated_h_lut1});
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
@@ -1769,12 +1783,10 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
LUT_0_FOR_ALL_BLOCKS, true,
{diff_buffer->preallocated_h_lut2});
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
@@ -1789,12 +1801,9 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
LUT_0_FOR_ALL_BLOCKS, true,
{diff_buffer->preallocated_h_lut2});
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
@@ -2344,7 +2353,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_streams = streams.active_gpu_subset(
auto active_streams = streams.active_gpu_subset_u128(
lwe_array_out->num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<InputTorus>(

Some files were not shown because too many files have changed in this diff Show More