Compare commits

...

67 Commits

Author SHA1 Message Date
David Testé
9d18c6c3d8 chore(ci): update slab-github-runner action to v1.6.0
This action version now uses node24 as runner since node20 support is
dropped on April 2026.
2026-04-01 09:47:54 +02:00
Thomas Montaigu
638f4c8bab chore: add compressed xof key set backward to 1.5 2026-03-11 17:26:14 +01:00
Thomas Montaigu
509afb0967 refactor: use div_ceil to compute public seed bytes
Coupled with the assert this ensures at least one byte for the seed.
2026-03-11 17:26:14 +01:00
Thomas Montaigu
da3d966fba feat: impl Tagged for [Compressed]XofKeySet 2026-03-11 17:26:14 +01:00
Thomas Montaigu
fa396a7e80 chore: bump tfhe-csprng to 0.8.1 2026-03-11 17:26:14 +01:00
Thomas Montaigu
f92a5267d2 fix(csprng): fix offset being lost during fork
The offset was not passed to children, and the parent 'lost' it.

Meaning that after a fork using a generator initialized with XofSeed,
after a fork, the offset was lost for both the children and parent.

Refactor the initialization chain and methods of the AesCtrGenerator
to simplify things. (Although the struct is pub, it's in a private module
thus it's not public API)

The fix is to pass the offset.
* A test to check that the parent correctly continues after the fork has
  been added
* Tests now pass a random offset to make sure this case is properly
  handled
2026-03-11 17:26:14 +01:00
Nicolas Sarlin
d76fef0d82 chore: bump tfhe to 1.5.4 2026-03-03 15:03:40 +01:00
Nicolas Sarlin
1f403aade1 fix(integer): reject packed legacy compact list 2026-03-03 10:17:40 +01:00
David Testé
081bb0f332 chore(ci): update slab-github-runner action to v1.5.1
This version adds randomization on sleep duration between calls to
GitHub API when looking for runner registration. This reduces the
risk of API rate-limiting.
2026-02-12 09:27:13 +01:00
David Testé
c8b19c3f10 chore(ci): update slab-github-runner action to v1.5.0
This new version improves handling of asynchronous tasks related
to Slab and GitHub API.
2026-02-11 13:59:49 +01:00
Agnes Leroy
7408ca1126 chore(gpu): add fallback for 4-l40 in CI 2026-02-11 13:59:49 +01:00
Arthur Meyre
8712746296 chore: bump TFHE-rs to 1.5.3 2026-02-11 13:59:49 +01:00
Arthur Meyre
271644f482 chore: bump tfhe-cuda-backend to 0.13.2 2026-02-11 13:59:49 +01:00
Arthur Meyre
5a7650a2de chore: revert backward compatibility change
- this change should not have been needed and poses risks for backward
compatibility
- HL CUDA: use dedicated type for the ReRand
2026-02-11 13:59:49 +01:00
Guillermo Oyarzun
a3a5b6fe40 fix(gpu): avoid unspecified behavior during the reduction 2026-02-11 13:55:47 +01:00
Pedro Alves
2210d637c4 fix(gpu): remove unused number_bits_to_unpack initialization in host_extract 2026-02-09 17:03:09 +01:00
Pedro Alves
1e5a38acb0 fix(gpu): assert all items have same shape in from_vec_cuda_lwe_ciphertexts_list 2026-02-09 17:03:09 +01:00
Pedro Alves
f6b08c45e9 fix(gpu): assert exactly one element in CudaLweCiphertextList::into_lwe_ciphertext 2026-02-09 17:03:09 +01:00
Pedro Alves
6145f6b680 fix(gpu): use lwe_compact_ciphertext_list_size for validation in CudaLweCompactCiphertextList::from_d_vec 2026-02-09 17:03:09 +01:00
Agnes Leroy
0d6602df84 chore(gpu): improve checks in device code 2026-02-09 17:03:09 +01:00
Agnes Leroy
a006d5d71f fix(gpu): fix logic in lwe ciphertext list 2026-02-09 17:03:09 +01:00
Pedro Alves
a7c5f2dbf8 chore(gpu): replaces (a + b - 1) / b patterns in the cuda backend by CEIL_DIV
- also, this commit renames kernel_dimensions.cuh to helper.cuh and copies the content of the older helper.cuh into helper_debug.cuh
2026-02-09 17:03:09 +01:00
Agnes Leroy
0816c331c5 chore(gpu): add some missing checks in core crypto 2026-02-09 17:03:09 +01:00
Agnes Leroy
89df44917b chore(gpu): cleanup device.cu binding, remove _async fuinctions from core crypto 2026-02-09 17:03:09 +01:00
Guillermo Oyarzun
b218c98194 fix(gpu): clean unused variables in specialized classical pbs 2026-02-09 17:03:09 +01:00
Andrei Stoian
2355cf4d89 fix(gpu): valgrind error on leaks 2026-02-09 17:03:09 +01:00
Guillermo Oyarzun
3b793273cf feat(gpu): create different threshold for multi-gpu pbs128 2026-02-09 17:03:09 +01:00
Guillermo Oyarzun
d3b52d92cb fix(gpu): fix race condition in tbc implementations 2026-02-09 17:03:09 +01:00
Guillermo Oyarzun
af7d69e16c feat(gpu): avoid register spilling memory in ff128 2026-02-09 17:03:09 +01:00
Guillermo Oyarzun
6bb211f2fd feat(gpu): use 512 threads for pbs128 flavors 2026-02-09 17:03:09 +01:00
Agnes Leroy
0fcce501a2 fix(gpu): fix small cpu memory leak 2026-02-09 17:03:09 +01:00
Agnes Leroy
204555e11c fix(gpu): fix noise level in match value 2026-02-09 17:03:09 +01:00
Arthur Meyre
b00e8bafe5 chore: bump TFHE-rs to version 1.5.2 2026-02-09 13:05:01 +01:00
Arthur Meyre
140a780bb6 chore: bump tfhe-cuda-backend to 0.13.1 2026-02-09 13:05:01 +01:00
Thomas Montaigu
e646130324 chore: move shortint expanded types into shortint mod
The expanded types definitions were in the high level API
as it was originally related to the XofKeySet feature.

However, since it's now used even in non-xof setting
we decided to move these types to shortint module
where they conceptually belong
2026-02-09 09:14:25 +01:00
Thomas Montaigu
f79204e23e feat(hlapi): add is_conformant for CompressedXofKeySet 2026-02-09 09:14:25 +01:00
Thomas Montaigu
b6b24f4fb9 fix(xofkeyset): generate multibit decompression key when params are multibit 2026-02-09 09:14:25 +01:00
Thomas Montaigu
41621dc4e3 fix(conformance): GGSW list had wrong group count 2026-02-09 09:14:25 +01:00
Thomas Montaigu
662038da19 chore(xof_key_set): make generate_with_pre_seeded_generator public
MPC teams needs to be able to generate a CompressedXofKeySet
from an existing ClientKey
2026-02-09 09:14:25 +01:00
Thomas Montaigu
9d21aecae9 feat(hlapi): add decompress_to_gpu for CompressedXofKeySet
Done using the newly added expand + convert pattern
2026-02-09 09:14:25 +01:00
Thomas Montaigu
58dbdf7dd4 refactor(hlapi): add IntegerExpandedServerKey::convert_to_gpu
And use it to convert from CompressedServerKey to CudaServerKey.
2026-02-09 09:14:25 +01:00
Thomas Montaigu
1a7b7ace47 refactor(hlapi): split gpu key conversion in expand/convert
converting from CompressedServerKey (Cpu) to CudaServerKey
was done via decompress_from_cpu/decompress_to_cuda methods.

We refactor to split these functions in 2: one that converts from cpu
(input in std domain for bootstrap keys), the other that
decompress/expand then calls convert
2026-02-09 09:14:25 +01:00
Thomas Montaigu
7797b60ef2 refactor(hlapi): add expand() method to CompressedServerKey
Add an expand method to CompressedServerKey that returns an
IntegerExpandedServerKey. Refactor decompress() to use the new expand()
then convert_to_cpu().

This will allow later to refactor the convertion from
CompressedServerKey to CudaServerKey to follow a similar pattern,
meaning we will be able to share the code that converts keys from CPU to
GPU between the normal server key and the xof server key
2026-02-09 09:14:25 +01:00
Thomas Montaigu
56c0a9fa5e fix(hlapi-gpu): correct state of post decompression noise squashed ct
The GPU part did not set the correct state

This problem is sort of rare to encounter as generally a
CompressedSquashedNoiseCiphertextList is deserialized then used.
When deserialized its on the CPU, so calls to `get` use CPU
code which correctly set the state.

This problem is thus visible when either:
- safe_deserializing and manually moving the list to GPU
- deserialize
- directly expanding after the creation of the list
2026-02-09 09:14:25 +01:00
Thomas Montaigu
37bcb7763d fix(hlapi): return error when trying to expand non packed list 2026-02-09 09:14:25 +01:00
Thomas Montaigu
ae978bedc6 refactor(xof_key_set): split decompression into expansion and conversion
Introduce IntegerExpandedServerKey as an intermediate representation
between compressed (seeded) keys and backend-specific formats. Decompression
is now a two-step process:

1. Seed expansion: decompress seeded keys into standard domain representations
   (e.g., LweBootstrapKey instead of FourierLweBootstrapKey)
2. Backend conversion: convert to target backend format (CPU Fourier, GPU, etc.)

This separation allows sharing the expansion step across backends while
specializing only the final conversion, as for this XOF based expansion
the order is important

Changes:
- Split xof_key_set.rs into module structure (mod.rs, internal.rs, test.rs)
- Add intermediate types, that contains the expanded, but not converted
  data
2026-02-09 09:14:25 +01:00
David Testé
43ebb14b9b doc: fix specs description of aws hpc7a.96xlarge instance
These instances have two sockets, each equipped with a 96-core CPU.
2026-02-06 12:25:07 +01:00
David Testé
641b394423 doc: add svg tables to zero-knowledge benchmarks
This removes the embedded GSheet.
These SVGs display more operations and inputs that reflect
real-use cases.
Throughput is available only for server-side computation as
it's meaningless to perform multiple proof in parallele on
the client-side.
2026-02-02 18:05:08 +01:00
Beka Barbakadze
72c18cc2b2 fix(gpu): add __syncthread and threadIdx condition for sample_extract_body in all pbs versions 2026-01-30 20:21:15 +01:00
Pedro Alves
71e95c435d chore(gpu): add test for single-item compact ciphertext list expand 2026-01-30 20:21:15 +01:00
Andrei Stoian
057217b7d2 fix(gpu): more crypto param checks in cuda backend 2026-01-30 20:21:15 +01:00
Beka Barbakadze
f006039916 refactor(gpu): refactor f128 and fft128 to prevent possible precision losses and improve performance 2026-01-30 20:21:15 +01:00
Pedro Alves
69df8c0310 fix(gpu): fix some inconsistencies in decompression that could enable access of not allocated memory
- also adds a diagram explaining how compression / decompression work
2026-01-30 20:21:15 +01:00
Agnes Leroy
482e49eac5 fix(gpu): add check on nullptr for dest_indexes in many lut gather 2026-01-29 15:58:18 +01:00
Guillermo Oyarzun
0dcf2ace3b fix(gpu): handling temporary events destruction 2026-01-29 15:58:18 +01:00
Pedro Alves
0483c95941 fix(gpu): fix an invalid access in expand when the number of LWEs is odd
- also improves test_expander_length_matches_data_items
- adds diagrams and explanations about GPU's expand
2026-01-29 15:58:18 +01:00
Guillermo Oyarzun
f9b292ca77 fix(gpu): add panic for 32-bit Torus calls 2026-01-29 15:58:18 +01:00
Agnes Leroy
8e194c4b65 chore(gpu): fix logic to check ptr validity in device.cu 2026-01-29 15:58:18 +01:00
Agnes Leroy
f02bba151c chore(gpu): stop trying to enable NVlink since we don't use it 2026-01-29 15:58:18 +01:00
Agnes Leroy
a477548161 chore(gpu): add comments in device.cu 2026-01-29 15:58:18 +01:00
David Testé
45a7d5217c chore(docs): update benchmark results for all backends 2026-01-26 17:08:40 +01:00
Andrei Stoian
865c0887ee fix(gpu): mutex lock 2026-01-26 11:20:54 +01:00
Agnes Leroy
0b9df1c8cd doc: add erc20 benchmark results for all backends 2026-01-26 11:20:54 +01:00
Pedro Alves
cf20e337ef fix(gpu): fix an inconsistency between CudaCompactCiphertextListExpander::len() and the CPU equivalent 2026-01-23 15:37:18 +01:00
Agnes Leroy
d7aea61eba fix(gpu): fix potential overflow in create_on_same_gpus 2026-01-23 15:37:18 +01:00
Arthur Meyre
9352ecce54 chore: bump version to 1.5.1 2026-01-20 14:17:55 +01:00
Arthur Meyre
179b52ab26 feat: add missing raw parts APIs for shortint (Compressed)DecompressionKey 2026-01-20 14:17:55 +01:00
231 changed files with 6619 additions and 3726 deletions

View File

@@ -41,7 +41,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -146,7 +146,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -86,7 +86,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -87,7 +87,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -155,7 +155,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -126,7 +126,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -261,7 +261,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -33,7 +33,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -134,7 +134,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -92,7 +92,7 @@ jobs:
steps:
- name: Start remote instance
id: start-remote-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -324,7 +324,7 @@ jobs:
steps:
- name: Stop remote instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -143,7 +143,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -383,7 +383,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -71,7 +71,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -213,7 +213,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -80,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -60,7 +60,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -31,7 +31,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -198,7 +198,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -79,7 +79,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -187,7 +187,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -34,7 +34,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -221,7 +221,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
uses: zama-ai/slab-github-runner@5aee5d157f4a0201e5eaefc9cc648e5f9f5472a5 # v1.6.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.13.0"
version = "0.13.2"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -10,11 +10,7 @@ extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
}
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
// Define a variant type that can be either a vector or a single pointer
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -42,6 +38,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -70,7 +68,7 @@ public:
// Construct an empty set. Invalid use of an empty set should raise an error
// right away through asserts or because of a nullptr dereference
CudaStreams()
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count(0),
_owns_streams(false) {}
// Returns a subset of this set as an active subset. An active subset is one
@@ -80,7 +78,15 @@ public:
_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a subset of this set as an active subset for pbs128. An active
// subset is one that is temporarily used to perform some computation. For
// pbs128, the threshold is different, because the original threshold was
// designed for 2_2 params.
CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
return CudaStreams(
_streams, _gpu_indexes,
get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a CudaStreams struct containing only the ith stream
CudaStreams get_ith(int i) const {
return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -114,11 +120,13 @@ public:
// streams on the same GPU
void create_on_same_gpus(const CudaStreams &other) {
PANIC_IF_FALSE(_streams == nullptr,
"Assign clone to non-empty cudastreams");
"Cuda error: Assign clone to non-empty CudaStreams");
PANIC_IF_FALSE(_gpu_count <= 8,
"Cuda error: GPU count should be in the interval [0, 8]");
cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];
uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
for (uint32_t i = 0; i < other._gpu_count; ++i) {
new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
gpu_indexes_clone[i] = other._gpu_indexes[i];
@@ -170,6 +178,7 @@ public:
_streams = nullptr;
delete[] _gpu_indexes;
_gpu_indexes = nullptr;
_gpu_count = 0;
}
}
@@ -483,4 +492,38 @@ public:
}
};
// Event pool for managing temporary CUDA events in scatter/gather operations
struct CudaEventPool {
private:
std::vector<cudaEvent_t> _events;
std::vector<uint32_t> _gpu_indices;
public:
CudaEventPool() {}
// Requests a new event from the pool (creates and stores it)
cudaEvent_t request_event(uint32_t gpu_index) {
cudaEvent_t event = cuda_create_event(gpu_index);
_events.push_back(event);
_gpu_indices.push_back(gpu_index);
return event;
}
// Releases all pooled events
// This should always be called in the release of the LUT, so streams
// are already synchronized
void release() {
for (size_t i = 0; i < _events.size(); i++) {
cuda_event_destroy(_events[i], _gpu_indices[i]);
}
_events.clear();
_gpu_indices.clear();
}
~CudaEventPool() {
GPU_ASSERT(_events.empty(),
"CudaEventPool: must call release before destruction");
}
};
#endif

View File

@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
Torus total_modulus = params.message_modulus * params.carry_modulus;
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
tmp_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,

View File

@@ -11,16 +11,26 @@ template <typename Torus> struct int_compression {
Torus *tmp_glwe_array_out;
bool gpu_memory_allocated;
uint32_t lwe_per_glwe;
uint32_t max_num_glwes;
// num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
// compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
// defined by the chosen parameter set
int_compression(CudaStreams streams, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
// Calculate the actual number of GLWEs needed based on total radix blocks.
// This ensures we allocate enough memory when num_radix_blocks >
// lwe_per_glwe.
max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
@@ -28,7 +38,7 @@ template <typename Torus> struct int_compression {
allocate_gpu_memory));
tmp_glwe_array_out =
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
max_num_glwes * glwe_accumulator_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));

View File

@@ -34,8 +34,10 @@ public:
} else if ((msg_mod) == 4 && (carry_mod) == 4) { \
constexpr int max_noise_level = 5; \
if ((noise_level_expr) > max_noise_level) \
PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 " \
"parameters"); \
PANIC( \
"Cuda error: noise %d exceeds maximum authorized value 5 for 2_2" \
" parameters", \
noise_level_expr); \
} else if ((msg_mod) == 8 && (carry_mod) == 8) { \
constexpr int max_noise_level = 9; \
if ((noise_level_expr) > max_noise_level) \
@@ -349,6 +351,7 @@ struct int_radix_lut_custom_input_output {
CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
CudaStreamsBarrier multi_gpu_gather_barrier;
CudaEventPool event_pool;
// Setup the LUT configuration:
// input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
@@ -371,8 +374,13 @@ struct int_radix_lut_custom_input_output {
this->num_input_blocks = num_input_blocks;
this->gpu_memory_allocated = allocate_gpu_memory;
this->active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (sizeof(OutputTorus) == 16) {
this->active_streams =
streams.active_gpu_subset_u128(num_radix_blocks, params.pbs_type);
} else {
this->active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
}
}
void setup_degrees() {
@@ -384,9 +392,13 @@ struct int_radix_lut_custom_input_output {
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
int classical_threshold =
sizeof(OutputTorus) == 16
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
: classical_threshold;
for (uint i = 0; i < active_streams.count(); i++) {
cuda_set_device(active_streams.gpu_index(i));
@@ -458,11 +470,14 @@ struct int_radix_lut_custom_input_output {
lwe_trivial_indexes, num_radix_blocks,
allocate_gpu_memory);
}
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
int classical_threshold =
sizeof(OutputTorus) == 16
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
: classical_threshold;
auto inputs_on_gpu = std::min(
(int)num_input_blocks,
@@ -809,10 +824,13 @@ struct int_radix_lut_custom_input_output {
void allocate_lwe_vector_for_non_trivial_indexes(
CudaStreams streams, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
int classical_threshold =
sizeof(OutputTorus) == 16
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
: classical_threshold;
// We need to create the auxiliary array only in GPU 0
if (active_streams.count() > 1) {
@@ -865,6 +883,7 @@ struct int_radix_lut_custom_input_output {
if (active_streams.count() > 1) {
active_streams.synchronize();
event_pool.release();
multi_gpu_gather_barrier.release();
multi_gpu_broadcast_barrier.release();
multi_gpu_scatter_barrier.release();
@@ -2055,7 +2074,7 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
num_groups = CEIL_DIV(num_radix_blocks, grouping_size);
num_many_lut = 2; // many luts apply 2 luts
uint32_t box_size = polynomial_size / block_modulus;
@@ -2489,7 +2508,7 @@ template <typename Torus> struct int_borrow_prop_memory {
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
group_size = grouping_size;
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
num_groups = CEIL_DIV(num_radix_blocks, grouping_size);
num_many_lut = 2; // many luts apply 2 luts
uint32_t box_size = polynomial_size / block_modulus;
@@ -2563,7 +2582,9 @@ template <typename Torus> struct int_borrow_prop_memory {
void release(CudaStreams streams) {
shifted_blocks_borrow_state_mem->release(streams);
delete shifted_blocks_borrow_state_mem;
prop_simu_group_carries_mem->release(streams);
delete prop_simu_group_carries_mem;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
overflow_block, gpu_memory_allocated);

View File

@@ -22,8 +22,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
uint32_t calculated_active_blocks =
total_random_bits == 0
? 0
: (total_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
: CEIL_DIV(total_random_bits, message_bits_per_block);
if (num_blocks_to_process != calculated_active_blocks) {
PANIC(
"num_blocks_to_process should be equal to calculated_active_blocks");
@@ -170,8 +169,7 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_random_input_blocks =
(num_input_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
CEIL_DIV(num_input_random_bits, message_bits_per_block);
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
streams, params, this->num_random_input_blocks, message_bits_per_block,

View File

@@ -7,7 +7,8 @@
#include <functional>
#include <vector>
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
// If we use more than 5 streams the result is incorrect
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;
template <typename Torus> struct int_equality_selectors_buffer {
int_radix_params params;
@@ -175,8 +176,7 @@ template <typename Torus> struct int_possible_results_buffer {
this->lut_stride =
(ciphertext_modulus / this->max_luts_per_call) * box_size;
this->num_lut_accumulators =
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);
stream_luts =
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];

View File

@@ -73,9 +73,10 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
int8_t **fp_ks_buffer,
bool gpu_memory_allocated);
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count);
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count);
}
#endif // CNCRT_KS_H_

View File

@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
const bool *is_boolean_array, const uint32_t is_boolean_array_len,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,

View File

@@ -118,7 +118,8 @@ template <typename Torus> struct zk_expand_mem {
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
const bool *is_boolean_array,
const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
bool allocate_gpu_memory, uint64_t &size_tracker)
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
@@ -270,14 +271,36 @@ template <typename Torus> struct zk_expand_mem {
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
lwe_index, num_packed_msgs * num_lwes);
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
// If the input relates to a boolean, shift the LUT so the correct one
// with sanitization is used
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
// is_boolean_array tells us which input is a boolean and thus the
// related output needs boolean sanitization. It naturally has
// total_blocks entries, but h_indexes_out reaches
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
// the ceiling causes out-of-bounds access. Reading garbage "true" would
// set h_lut_indexes to an invalid index pointing to uninitialized
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
// to match.
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
"Cuda error: index %d for is_boolean_array is out of "
"bounds (len is %d)",
h_indexes_out[lwe_index], is_boolean_array_len);
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
PANIC_IF_FALSE(
h_lut_indexes[lwe_index] < 4,
"Cuda error: lut index is greater than the max possible value (3)");
}
offset += num_lwes_in_kth;
}

View File

@@ -183,9 +183,10 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
base_log, level_count, num_lwes);
}
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count) {
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count) {
host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<const uint64_t *>(input),
static_cast<uint64_t *>(output), base_log,

View File

@@ -10,7 +10,6 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -351,6 +350,7 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
Torus state =
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
#pragma unroll 1
for (int j = 0; j < level_count; j++) {
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,16 +363,15 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
lwe_acc_out[shmem_index] = local_lwe_out;
}
if (tid <= lwe_dimension_out) {
for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
offset /= 2) {
__syncthreads();
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
__syncthreads();
if (tid <= lwe_dimension_out && threadIdx.y < offset) {
lwe_acc_out[shmem_index] +=
lwe_acc_out[shmem_index + offset * blockDim.x];
}
if (threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
if (tid <= lwe_dimension_out && threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
template <typename Torus, typename KSTorus>

View File

@@ -12,12 +12,9 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
// Finish the keyswitching operation and prepare GLWEs for accumulation.
// 1. Finish the keyswitching computation partially performed with a GEMM:
// - negate the dot product between the GLWE and KSK polynomial

View File

@@ -6,7 +6,7 @@
#include "helper_multi_gpu.h"
#include "polynomial/parameters.cuh"
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <limits>
template <typename T>

View File

@@ -1,4 +1,5 @@
#include "device.h"
#include "utils/helper.cuh"
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
@@ -6,6 +7,27 @@
#include <cuda_profiler_api.h>
#endif
void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer.")
}
}
int validate_device_ptr(const void *ptr) {
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer.")
}
return attr.device;
}
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
@@ -30,8 +52,9 @@ bool mem_pools_enabled = false;
// better results.
void cuda_setup_mempool(uint32_t caller_gpu_index) {
if (!mem_pools_enabled) {
pool_mutex.lock();
if (mem_pools_enabled)
std::lock_guard lock(pool_mutex);
if (mem_pools_enabled) // double-check - mem_pools_enabled might have been
// changed in a different thread
return; // If mem pools are already enabled, we don't need to do anything
// We do it only once for all GPUs
@@ -78,7 +101,6 @@ void cuda_setup_mempool(uint32_t caller_gpu_index) {
}
// We return to the original gpu_index
cuda_set_device(caller_gpu_index);
pool_mutex.unlock();
}
}
@@ -234,60 +256,61 @@ bool cuda_check_support_thread_block_clusters() {
#endif
}
/// Copy memory to the GPU asynchronously
/// Copy memory from the CPU to a GPU with size tracking.
/// This copy is asynchronous only if the CPU memory was pinned, i.e.
/// allocated using cudaMallocHost. This was shown to come with a performance
/// penalty if we allocate all CPU data in this way in the backend, so
/// cudaMallocHost is only used in specific places where we need an
/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
/// example to copy the bootstrapping key).
/// The copy only happens if gpu_memory_allocated is true.
void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
uint64_t size,
cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated) {
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
if (size == 0 || !gpu_memory_allocated)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}
validate_device_ptr_and_gpu_index(dest, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream));
}
/// Copy memory to the GPU asynchronously
/// Copy memory from the CPU to a GPU.
/// This copy is asynchronous only if the CPU memory was pinned, i.e.
/// allocated using cudaMallocHost. This was shown to come with a performance
/// penalty if we allocate all CPU data in this way in the backend, so
/// cudaMallocHost is only used in specific places where we need an
/// asynchronous data copy from the CPU to all the GPUs simultaneously (for
/// example to copy the bootstrapping key).
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_memcpy_with_size_tracking_async_to_gpu(dest, src, size, stream,
gpu_index, true);
}
/// Copy memory within a GPU asynchronously
/// Copy memory within a GPU asynchronously.
/// The copy only happens if gpu_memory_allocated is true
void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
void *dest, void const *src, uint64_t size, cudaStream_t stream,
uint32_t gpu_index, bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
GPU_ASSERT(dest != nullptr,
"Cuda error: trying to copy gpu->gpu to null ptr");
GPU_ASSERT(src != nullptr,
"Cuda error: trying to copy gpu->gpu from null ptr");
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
PANIC_IF_FALSE(
attr_dest.type == cudaMemoryTypeDevice,
"Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
PANIC_IF_FALSE(
attr_src.type == cudaMemoryTypeDevice,
"Cuda error: invalid src device pointer in copy from GPU to GPU.");
int src_gpu_index = validate_device_ptr(src);
int dest_gpu_index = validate_device_ptr(dest);
cuda_set_device(gpu_index);
if (attr_src.device == attr_dest.device) {
if (src_gpu_index == dest_gpu_index) {
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
} else {
check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
attr_src.device, size, stream));
check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
src_gpu_index, size, stream));
}
}
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -327,21 +350,20 @@ void cuda_synchronize_device(uint32_t gpu_index) {
check_cuda_error(cudaDeviceSynchronize());
}
/// cuda_memset sets bytes, we basically only use it to initialize data to 0
/// The memset only happens if gpu_memory_allocated is true
void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
uint64_t size, cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
validate_device_ptr_and_gpu_index(dest, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
}
/// cuda_memset sets bytes, we basically only use it to initialize data to 0
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
cuda_memset_with_size_tracking_async(dest, val, size, stream, gpu_index,
@@ -366,7 +388,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
}
cuda_set_device(gpu_index);
int block_size = 256;
int num_blocks = (n + block_size - 1) / block_size;
int num_blocks = CEIL_DIV(n, block_size);
// Launch the kernel
cuda_set_value_kernel<Torus>
@@ -384,15 +406,15 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
uint32_t n);
/// Copy memory to the CPU asynchronously
/// This comes with a big penalty on performance even if the CPU
/// memory is pinned (using cudaMallocHost for the CPU allocation),
/// so it should be avoided at all costs
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
if (size == 0)
return;
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}
validate_device_ptr_and_gpu_index(src, gpu_index);
cuda_set_device(gpu_index);
check_cuda_error(

View File

@@ -68,9 +68,15 @@ struct alignas(16) f128 {
auto t = two_sum(a.lo, b.lo);
double hi = s.hi;
#ifdef __CUDA_ARCH__
double lo = __dadd_rn(s.lo, t.hi);
hi = __dadd_rn(hi, lo);
lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
#else
double lo = s.lo + t.hi;
hi = hi + lo;
lo = lo - (hi - s.hi);
#endif
return f128(hi, lo + t.lo);
}
@@ -104,8 +110,13 @@ struct alignas(16) f128 {
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
auto s = two_diff(a.hi, b.hi);
auto t = two_diff(a.lo, b.lo);
#ifdef __CUDA_ARCH__
s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
#else
s = quick_two_sum(s.hi, s.lo + t.hi);
return quick_two_sum(s.hi, s.lo + t.lo);
#endif
}
// Multiplication
@@ -220,16 +231,16 @@ struct f128x2 {
// Subtraction
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
const f128x2 &b) {
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
return f128x2(f128::sub_estimate(a.re, b.re),
f128::sub_estimate(a.im, b.im));
}
// Multiplication (complex multiplication)
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
const f128x2 &b) {
const f128 a_im_b_im = f128::mul(a.im, b.im);
f128 real_part =
f128::add(f128::mul(a.re, b.re),
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
return f128x2(real_part, imag_part);
}
@@ -243,8 +254,8 @@ struct f128x2 {
// Subtraction-assignment operator
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
re = f128::sub_estimate(re, other.re);
im = f128::sub_estimate(im, other.im);
return *this;
}
@@ -261,12 +272,20 @@ struct f128x2 {
};
__host__ __device__ inline uint64_t double_to_bits(double d) {
#ifdef __CUDA_ARCH__
uint64_t bits = __double_as_longlong(d);
#else
uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
#endif
return bits;
}
__host__ __device__ inline double bits_to_double(uint64_t bits) {
#ifdef __CUDA_ARCH__
double d = __longlong_as_double(bits);
#else
double d = *reinterpret_cast<double *>(&bits);
#endif
return d;
}
@@ -275,6 +294,8 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
const double A = ONE << 52;
const double B = ONE << 104;
const double C = ONE << 76;
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
// does not work here
const double D = 340282366920938500000000000000000000000.;
const __uint128_t threshold = (ONE << 104);
@@ -288,15 +309,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = A_bits | lower64;
double l_temp = bits_to_double(bits_l);
double l = l_temp - A;
uint64_t B_bits = double_to_bits(B);
uint64_t top64 = static_cast<uint64_t>(x >> 52);
uint64_t bits_h = B_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
#else
double l = l_temp - A;
double h = h_temp - B;
return (l + h);
#endif
} else {
uint64_t C_bits = double_to_bits(C);
@@ -310,15 +336,20 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = C_bits | lower64 | mask_part;
double l_temp = bits_to_double(bits_l);
double l = l_temp - C;
uint64_t D_bits = double_to_bits(D);
uint64_t top64 = static_cast<uint64_t>(x >> 76);
uint64_t bits_h = D_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
#else
double l = l_temp - C;
double h = h_temp - D;
return (l + h);
#endif
}
}
@@ -389,6 +420,8 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
// does not work here
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__
x.hi = __dmul_rn(x.hi, normalization);
@@ -398,7 +431,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
x.lo *= normalization;
#endif
// TODO has to be round
x = f128::add_estimate(x, f128(0.5, 0.0));
x = f128::f128_floor(x);
__uint128_t x0 = f64_to_u128(x.hi);

View File

@@ -12,8 +12,9 @@
using Index = unsigned;
#define NEG_TWID(i) \
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
f128x2( \
f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])), \
f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))
#define F64x4_TO_F128x2(f128x2_reg, ind) \
f128x2_reg.re.hi = dt_re_hi[ind]; \
@@ -75,7 +76,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
tid = tid + STRIDE;
}
__syncthreads();
@@ -86,8 +91,11 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
w = NEG_TWID(tid / lane_mask + twiddle_shift);
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -151,7 +159,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
// keep one of the register for next iteration and store another one in sm
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
tid = tid + STRIDE;
}
@@ -165,8 +177,11 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
tid = tid + STRIDE;
}
@@ -218,7 +233,7 @@ __device__ void convert_u128_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re, const __uint128_t *in_im) {
const double normalization = pow(2., -128.);
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
@@ -241,7 +256,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = pow(2., -128.);
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {

View File

@@ -12,7 +12,7 @@
#include "integer/subtraction.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -222,7 +222,7 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones

View File

@@ -10,7 +10,122 @@
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
/*
* =============================================================================
* GPU Compression/Decompression Algorithm: Overview
* =============================================================================
*
* The compression algorithm transforms standard LWE ciphertexts into a compact
* packed format. Decompression reverses this process.
*
* -----------------------------------------------------------------------------
* COMPRESSION INPUT (lwe_array_in)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | lwe_array_in (GPU memory) |
* +-------------------------------------------------------------------------+
* +---------------------------+---------------------------+-----------------+
* | LWE 0 | LWE 1 | ... |
* | [mask, body] | [mask, body] | |
* +---------------------------+---------------------------+-----------------+
* |<-- lwe_dimension + 1 -->|
*
* Total LWEs: total_lwe_bodies_count (num_radix_blocks)
*
* -----------------------------------------------------------------------------
* COMPRESSION PROCESS
* -----------------------------------------------------------------------------
*
* 1. Message Shift (64-bit only):
* Each LWE is multiplied by message_modulus to shift the message to MSB
*
* 2. Packing Keyswitch (LWE -> GLWE):
* Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
*
* +--------------------------------------------------------------+
* | lwe_per_glwe LWEs (input batch) |
* | LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1] |
* +--------------------------------------------------------------+
* |
* Packing Keyswitch
* v
* +--------------------------------------------------------------+
* | Single GLWE Ciphertext |
* | [A_0, A_1, ..., A_{k-1}, B] |
* | |<-- k * polynomial_size -->| |<-- polynomial_size -->| |
* +--------------------------------------------------------------+
*
* Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
* lwe_per_glwe)
*
* 3. Modulus Switch:
* Reduce precision from 64-bit torus to storage_log_modulus bits
*
* 4. Bit Packing:
* Pack multiple reduced-precision elements into dense bit representation
*
* -----------------------------------------------------------------------------
* COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | tmp_glwe_array_out (intermediate buffer) |
* +-------------------------------------------------------------------------+
* +----------------------------+----------------------------+---------------+
* | GLWE 0 | GLWE 1 | ... |
* | [A_0..A_{k-1}, B_0..B_N] | [A_0..A_{k-1}, B_0..B_N] | |
* +----------------------------+----------------------------+---------------+
* |<-- glwe_accumulator_size = (k+1)*N -->|
*
* Total size needed: num_glwes * glwe_accumulator_size elements
* Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
*
* -----------------------------------------------------------------------------
* PACKED OUTPUT (glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Packed GLWE Ciphertext List (bit-packed) |
* +-------------------------------------------------------------------------+
* +-------------------------------------------------------------------------+
* | Elements packed with storage_log_modulus bits per original element |
* | Total packed size: ceil(in_len * storage_log_modulus / 64) elements |
* +-------------------------------------------------------------------------+
*
* =============================================================================
* DECOMPRESSION (Extract) Algorithm
* =============================================================================
*
* Decompression receives an array of LWE indexes. For each index, it identifies
* the corresponding GLWE, extracts that GLWE from the packed representation,
* and then sample-extracts the requested LWE from the GLWE.
*
* -----------------------------------------------------------------------------
* EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Extracted GLWE Ciphertext |
* +-------------------------------------------------------------------------+
* +---------------------------------------+-----------------+---------------+
* | Mask (A polynomials) | Body (B) | Tail |
* | [A_0, ..., A_{k-1}] | (body_count) | (zeroed) |
* | k * polynomial_size elements | elements | elements |
* +---------------------------------------+-----------------+---------------+
* |<------------------- initial_out_len ------------------->| |
* |<------------------------ glwe_ciphertext_size ------------------------->|
*
* For the last GLWE, body_count may be less than polynomial_size (partial).
* The tail region must be zeroed to ensure defined behavior.
*
* tail_size = glwe_ciphertext_size - initial_out_len
* tail_offset = initial_out_len (NOT 0!)
*
* =============================================================================
*/
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -66,7 +181,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto nbits = sizeof(Torus) * 8;
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
auto out_len = CEIL_DIV(number_bits_to_pack, nbits);
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -108,6 +223,8 @@ host_integer_compress(CudaStreams streams,
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
glwe_array_out->lwe_per_glwe - 1) /
glwe_array_out->lwe_per_glwe;
PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
"Invalid number of GLWEs");
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -200,8 +317,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
uint32_t num_glwes =
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);
// Compressed length of the compressed GLWE we want to extract
uint32_t body_count = 0;
@@ -218,19 +334,21 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// Calculates how many bits a full-packed GLWE shall use
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Calculate how many bits a full-packed GLWE uses, to determine
// the stride between consecutive packed GLWEs in the input buffer
auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = CEIL_DIV(number_bits_to_unpack, nbits);
// Uses that length to set the input pointer
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
// The extract kernel writes initial_out_len elements starting at offset 0.
// We must zero the tail region (from initial_out_len to
// glwe_ciphertext_size)
if (initial_out_len < glwe_ciphertext_size) {
cuda_memset_async(glwe_array_out, 0,
cuda_memset_async(glwe_array_out + initial_out_len, 0,
(glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
stream, gpu_index);
}
@@ -368,7 +486,7 @@ host_integer_decompress(CudaStreams streams,
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
active_streams, lwe_array_in_vec, extracted_lwe, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
lut->active_streams.count(), num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
@@ -388,7 +506,7 @@ host_integer_decompress(CudaStreams streams,
multi_gpu_gather_lwe_async<Torus>(
active_streams, (Torus *)d_lwe_array_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_decompress,
lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs

View File

@@ -14,7 +14,6 @@
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <functional>
@@ -273,8 +272,7 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
}
}
if ((blockIdx.x / group_size + 1) <
(blocks_count + group_size - 1) / group_size) {
if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -363,7 +361,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
num_radix_blocks > src1->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks should have more "
"blocks than the number used in sum in groups")
auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
if (src2->num_radix_blocks < num_groups)
PANIC("Cuda error: second input in sum in groups should have at least "
"num_groups blocks")
@@ -570,8 +568,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
multi_gpu_scatter_lwe_async<Torus>(
active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
big_lwe_dimension + 1);
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(
@@ -594,7 +592,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
multi_gpu_gather_lwe_async<Torus>(
active_streams, (Torus *)lwe_array_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
active_streams);
@@ -674,8 +673,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
multi_gpu_scatter_lwe_async<Torus>(
active_streams, lwe_array_in_vec, (Torus *)lwe_array_in->ptr,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
big_lwe_dimension + 1);
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(
@@ -791,8 +790,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
multi_gpu_scatter_lwe_async<Torus>(
active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_streams.count(), num_radix_blocks,
big_lwe_dimension + 1);
lut->lwe_aligned_vec, lut->event_pool, lut->active_streams.count(),
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(
@@ -815,7 +814,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
multi_gpu_gather_lwe_async<Torus>(
active_streams, (Torus *)(lwe_array_out->ptr), lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
lut->lwe_aligned_vec, lut->event_pool, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
active_streams);
@@ -2342,7 +2342,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_streams = streams.active_gpu_subset(
auto active_streams = streams.active_gpu_subset_u128(
lwe_array_out->num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<InputTorus>(
@@ -2373,8 +2373,9 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
multi_gpu_scatter_lwe_async<InputTorus>(
active_streams, lwe_array_in_vec, (InputTorus *)lwe_array_pbs_in->ptr,
lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_scatter_vec, lut->active_streams.count(),
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
lut->lwe_aligned_scatter_vec, lut->event_pool,
lut->active_streams.count(), lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
execute_keyswitch_async<InputTorus>(
active_streams, lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -2397,7 +2398,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
multi_gpu_gather_lwe_async<__uint128_t>(
active_streams, (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
nullptr, lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
lut->event_pool, lwe_array_out->num_radix_blocks,
big_lwe_dimension + 1);
/// Synchronize all GPUs
streams.synchronize();

View File

@@ -362,7 +362,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
radix_columns current_columns(current_blocks->degrees, num_radix_blocks,
num_radix_in_vec, chunk_size, needs_processing);
int number_of_threads = std::min(256, (int)mem_ptr->params.polynomial_size);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
int part_count = CEIL_DIV(big_lwe_size, number_of_threads);
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
mem_ptr->setup_lookup_tables(streams, num_radix_in_vec,

View File

@@ -10,7 +10,7 @@
#include "device.h"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <iostream>
#include <sstream>
#include <string>

View File

@@ -54,7 +54,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec, lut->event_pool,
active_streams.count(), num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
POP_RANGE()
@@ -72,7 +72,7 @@ void host_integer_grouped_oprf(CudaStreams streams,
multi_gpu_gather_lwe_async<Torus>(
active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
lut->lwe_aligned_vec, lut->event_pool, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
POP_RANGE()
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(

View File

@@ -4,8 +4,8 @@
#include "device.h"
#include "integer/integer.h"
#include "integer/radix_ciphertext.h"
#include "utils/helper.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
inline CudaLweCiphertextListFFI
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {

View File

@@ -5,8 +5,8 @@
#include "integer/radix_ciphertext.h"
#include "integer/rerand.h"
#include "integer/rerand_utilities.h"
#include "utils/helper.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include "zk/zk_utilities.h"
template <typename Torus, class params>

View File

@@ -8,7 +8,7 @@
#include "device.h"
#include "helper_multi_gpu.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <stdio.h>
template <typename Torus>

View File

@@ -12,7 +12,7 @@
#include "integer/scalar_mul.h"
#include "multiplication.cuh"
#include "scalar_shifts.cuh"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <stdio.h>
template <typename T>

View File

@@ -116,7 +116,7 @@ __host__ void host_integer_overflowing_sub(
radix_params.message_modulus * radix_params.carry_modulus;
uint32_t num_bits_in_block = log2_int(block_modulus);
uint32_t grouping_size = num_bits_in_block;
uint32_t num_groups = (num_blocks + grouping_size - 1) / grouping_size;
uint32_t num_groups = CEIL_DIV(num_blocks, grouping_size);
host_unchecked_sub_with_correcting_term<Torus>(
streams.stream(0), streams.gpu_index(0), output, input_left, input_right,

View File

@@ -175,14 +175,17 @@ __host__ void host_aggregate_one_hot_vector(
Torus *const *ksks) {
int_radix_params params = mem_ptr->params;
if (params.message_modulus > 4 && params.carry_modulus > 4) {
PANIC("Cuda error: aggregate one hot vector is only implemented for 1_1 "
"and 2_2 params");
}
uint32_t chunk_size = mem_ptr->chunk_size;
uint32_t num_streams = mem_ptr->num_streams;
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t inputs_per_stream =
(num_input_ciphertexts + num_streams - 1) / num_streams;
uint32_t inputs_per_stream = CEIL_DIV(num_input_ciphertexts, num_streams);
for (uint32_t s = 0; s < num_streams; s++) {
@@ -210,7 +213,7 @@ __host__ void host_aggregate_one_hot_vector(
if (count_in_stream == 0)
continue;
uint32_t num_chunks = (count_in_stream + chunk_size - 1) / chunk_size;
uint32_t num_chunks = CEIL_DIV(count_in_stream, chunk_size);
//
// Process chunks of input ciphertexts for the current stream
@@ -255,7 +258,10 @@ __host__ void host_aggregate_one_hot_vector(
//
// Aggregate partial results from all streams into the final aggregated vector
// num_streams has to be less than the max noise level otherwise we accumulate
// too much and the noise limit is exceeded
//
CHECK_NOISE_LEVEL(num_streams, params.message_modulus, params.carry_modulus);
for (uint32_t s = 1; s < num_streams; s++) {
uint32_t start_idx = s * inputs_per_stream;
if (start_idx >= num_input_ciphertexts)

View File

@@ -8,7 +8,7 @@
#include "helper_multi_gpu.h"
#include "integer/integer.h"
#include "integer/integer_utilities.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/helper.cuh"
#include <stdio.h>
template <typename T>

View File

@@ -6,7 +6,7 @@
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "../utils/helper.cuh"
#include "device.h"
#include "linear_algebra.h"
#include <fstream>

View File

@@ -6,7 +6,7 @@
#include <cuda_runtime.h>
#endif
#include "../utils/kernel_dimensions.cuh"
#include "../utils/helper.cuh"
#include "device.h"
#include "linear_algebra.h"

View File

@@ -211,6 +211,8 @@ __global__ void device_programmable_bootstrap_amortized(
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
@@ -303,7 +305,9 @@ __host__ void host_programmable_bootstrap_amortized(
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: Programmable bootstrap amortized only supports 64-bit "
"Torus type.");
uint64_t SM_FULL =
get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);

View File

@@ -173,6 +173,7 @@ __global__ void device_programmable_bootstrap_cg(
}
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -184,7 +185,8 @@ __global__ void device_programmable_bootstrap_cg(
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
@@ -247,7 +249,9 @@ __host__ void host_programmable_bootstrap_cg(
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t num_many_lut, uint32_t lut_stride) {
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: Programmable bootstrap cg only supports 64-bit "
"Torus type.");
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm =

View File

@@ -151,9 +151,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -165,6 +164,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
@@ -302,7 +303,10 @@ __host__ void execute_cg_external_product_loop(
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
uint32_t lut_stride) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(
sizeof(Torus) == 8,
"Error: Programmable bootstrap multi-bit cg only supports 64-bit "
"Torus type.");
uint64_t full_sm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);

View File

@@ -239,6 +239,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -251,6 +252,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
@@ -409,7 +412,9 @@ __host__ void execute_step_one(
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
PBS_MS_REDUCTION_T noise_reduction_type) {
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: Programmable bootstrap step one only supports 64-bit "
"Torus type.");
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;
@@ -451,7 +456,9 @@ __host__ void execute_step_two(
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm,
uint32_t num_many_lut, uint32_t lut_stride) {
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: Programmable bootstrap step two only supports 64-bit "
"Torus type.");
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
cuda_set_device(gpu_index);
int thds = polynomial_size / params::opt;

View File

@@ -44,30 +44,31 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
host_programmable_bootstrap_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
host_programmable_bootstrap_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
host_programmable_bootstrap_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
host_programmable_bootstrap_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
@@ -91,30 +92,31 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
host_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 512:
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
host_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 1024:
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
host_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 2048:
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
host_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,

View File

@@ -240,10 +240,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the updated accumulator
// We don't sync here because we use same indexes to read from `accumulator`
// as it was used in `add_to_torus_128` to write inside it Persist the
// updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
@@ -395,6 +398,7 @@ __global__ void device_programmable_bootstrap_cg_128(
accumulator);
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
0);
}
@@ -571,38 +575,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<256>>(
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 512:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<512>>(
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 1024:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 2048:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<2048>>(
return scratch_programmable_bootstrap_cg_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -618,38 +619,35 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
return scratch_programmable_bootstrap_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 512:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
return scratch_programmable_bootstrap_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 1024:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 2048:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
return scratch_programmable_bootstrap_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory,
noise_reduction_type);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
@@ -956,22 +954,19 @@ __host__ bool supports_cooperative_groups_on_programmable_bootstrap_128(
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
return verify_cuda_programmable_bootstrap_128_cg_grid_size<Degree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return verify_cuda_programmable_bootstrap_128_cg_grid_size<
AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);

View File

@@ -461,6 +461,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
} else if (blockIdx.y == glwe_dimension) {
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -473,6 +474,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(next_block_lwe_array_out,
global_slice, 0, i * lut_stride);
}
@@ -663,7 +665,9 @@ __host__ void execute_compute_keybundle(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t level_count, uint32_t lwe_offset) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: PBS keybundle only supports 64-bit "
"Torus type.");
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
@@ -735,7 +739,10 @@ __host__ void execute_step_one(
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(
sizeof(Torus) == 8,
"Error: Programmable bootstrap multi-bit step one only supports 64-bit "
"Torus type.");
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
polynomial_size);
@@ -789,7 +796,10 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
uint32_t level_count, uint32_t j, uint32_t num_many_lut,
uint32_t lut_stride) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(
sizeof(Torus) == 8,
"Error: Programmable bootstrap multi-bit step two only supports 64-bit "
"Torus type.");
uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(

View File

@@ -11,29 +11,30 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128(
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return scratch_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -55,30 +56,31 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap_128(
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<256>>(
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<512>>(
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<1024>>(
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<2048>>(
return scratch_cg_multi_bit_programmable_bootstrap_128<InputTorus,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return scratch_cg_multi_bit_programmable_bootstrap_128<
InputTorus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
@@ -97,7 +99,7 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
supports_cooperative_groups_on_multibit_programmable_bootstrap_128<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
@@ -129,7 +131,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -137,7 +139,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -145,8 +147,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -154,8 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
host_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -163,6 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
@@ -191,8 +192,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -200,8 +200,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -209,8 +208,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -218,8 +216,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
host_cg_multi_bit_programmable_bootstrap_128<InputTorus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
@@ -227,6 +224,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
num_many_lut, lut_stride);
break;
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,

View File

@@ -333,6 +333,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<__uint128_t, params>(block_lwe_array_out,
global_slice, 0);
if (num_many_lut > 1) {
@@ -346,6 +347,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<__uint128_t, params>(
next_block_lwe_array_out, global_slice, 0, i * lut_stride);
}
@@ -505,10 +508,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<__uint128_t, params>(block_lwe_array_out,
accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -519,7 +521,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<__uint128_t, params>(
next_block_lwe_array_out, accumulator, 0, i * lut_stride);
}
@@ -1090,4 +1093,109 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap_128(
return size_tracker;
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm_cg_accumulate =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
params::degree);
uint64_t partial_sm_cg_accumulate =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm_cg_accumulate) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
PARTIALSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, PARTIALSM>,
thds, partial_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate_128<Torus, params,
FULLSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
Torus, params, FULLSM>,
thds, full_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus>
__host__ bool
supports_cooperative_groups_on_multibit_programmable_bootstrap_128(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
Torus, Degree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
Torus, Degree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
Torus, Degree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
Torus, Degree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
case 4096:
// We use AmortizedDegree for 4096 to avoid register exhaustion
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size_128<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
default:
PANIC(
"Cuda error (multi-bit PBS128): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
#endif // PROGRAMMABLE_BOOTSTRAP_MULTIBIT_128_CUH

View File

@@ -179,6 +179,7 @@ __global__ void device_programmable_bootstrap_tbc(
}
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
@@ -191,13 +192,17 @@ __global__ void device_programmable_bootstrap_tbc(
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
}
// Before exiting the kernel we need to sync the cluster to ensure that
// other blocks can still access the dsm in the mul ggsw glwe
cluster.sync();
}
template <typename Torus, class params, sharedMemDegree SMD>
@@ -207,8 +212,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
const double2 *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t num_many_lut, uint32_t lut_stride,
PBS_MS_REDUCTION_T noise_reduction_type) {
constexpr uint32_t level_count = 1;
@@ -254,9 +259,6 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
(glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
params::degree / 2];
// Since the space is L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
@@ -279,13 +281,14 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
Torus temp_a_hat = 0;
for (int i = 0; i < lwe_dimension; i++) {
constexpr int WARP_SIZE = 32;
// We calculate the modulus switch of a warp size of elements
if (i % 32 == 0 && (i + threadIdx.x % 32) < lwe_dimension) {
modulus_switch(block_lwe_array_in[i + threadIdx.x % 32], temp_a_hat,
log_modulus);
if (i % WARP_SIZE == 0 && (i + threadIdx.x % WARP_SIZE) < lwe_dimension) {
modulus_switch(block_lwe_array_in[i + threadIdx.x % WARP_SIZE],
temp_a_hat, log_modulus);
}
// each iteration we broadcast the corresponding ms previously calculated
Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % 32);
Torus a_hat = __shfl_sync(0xFFFFFFFF, temp_a_hat, i % WARP_SIZE);
__syncthreads();
Torus reg_acc_rotated[params::opt];
@@ -360,6 +363,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
}
}
} else if (blockIdx.y == glwe_dimension) {
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
@@ -373,12 +377,16 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
}
// Before exiting the kernel we need to sync the cluster to ensure that
// other blocks can still access the dsm in the mul ggsw glwe or the
// ping pong buffers
cluster.sync();
}
@@ -458,6 +466,9 @@ __host__ void host_programmable_bootstrap_tbc(
uint32_t num_many_lut, uint32_t lut_stride) {
cuda_set_device(gpu_index);
PANIC_IF_FALSE(sizeof(Torus) == 8,
"Error: Programmable bootstrap tbc only supports 64-bit "
"Torus type.");
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto supports_dsm =
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
@@ -543,8 +554,8 @@ __host__ void host_programmable_bootstrap_tbc(
&config,
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
lwe_array_in, lwe_input_indexes, bootstrapping_key, lwe_dimension,
num_many_lut, lut_stride, noise_reduction_type));
} else {
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;

View File

@@ -156,6 +156,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
} else if (blockIdx.y == glwe_dimension) {
__syncthreads();
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -167,7 +168,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync, it is already synchronized before the first
// sample_extract_body call
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
@@ -179,6 +181,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator, global_accumulator_slice);
}
}
// Before exiting the kernel we need to sync the cluster to ensure that
// other blocks can still access the dsm in the mul ggsw glwe
cluster.sync();
}
// Specialized version for the multi-bit bootstrap using 2_2 params:
@@ -356,6 +361,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
}
} else if (blockIdx.y == glwe_dimension) {
// No need to sync here, it is already synchronized after add_to_torus
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (num_many_lut > 1) {
for (int i = 1; i < num_many_lut; i++) {
@@ -367,7 +373,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
&next_lwe_array_out[lwe_output_indexes[blockIdx.x] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
// No need to sync here, it is already synchronized after
// add_to_torus
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
@@ -379,7 +386,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
reg_acc_rotated, global_accumulator_slice);
}
// Before exiting the kernel we need to sync the cluster to ensure that
// that other blocks can still access the dsm in the ping pong buffer
// other blocks can still access the dsm in the ping pong buffer
cluster.sync();
}
@@ -523,6 +530,11 @@ __host__ void execute_tbc_external_product_loop(
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t lwe_offset, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(
sizeof(Torus) == 8,
"Error: Programmable bootstrap multi-bit tbc only supports 64-bit "
"Torus type.");
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;

View File

@@ -3,8 +3,6 @@
#include "crypto/torus.cuh"
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
#define CIRCULANT_BLOCKTILE 32
// Make a circulant matrix that serves to multiply a polynomial
// with another one. Each thread loads a part of the original

View File

@@ -339,8 +339,10 @@ template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus const *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
glwe[glwe_dimension * params::degree + nth];
if (threadIdx.x == 0) {
lwe_array_out[glwe_dimension * params::degree] =
glwe[glwe_dimension * params::degree + nth];
}
}
// Extracts the mask from the nth-LWE in a GLWE.

View File

@@ -8,8 +8,6 @@
#include "parameters.cuh"
#include "types/complex/operations.cuh"
#define CEIL_DIV(M, N) ((M) + (N)-1) / (N)
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;

View File

@@ -1,135 +1,34 @@
#ifndef HELPER_CUH
#define HELPER_CUH
#include <cstdint>
#include <sstream>
#include <stdio.h>
#include <type_traits>
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
template <typename T> __device__ inline const char *get_format();
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<unsigned int>() {
return "%u, ";
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
template <> __device__ inline const char *get_format<uint64_t>() {
return "%lu, ";
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = CEIL_DIV(n, threads);
}
template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
for (int i = 0; i < N; i++) {
printf(get_format<T>(), src[i]);
}
// Determines blocks and threads in x for a given blockDim.y using the same
// logic than above
inline void getNumBlocksAndThreads2D(const int n, const int maxBlockSize,
const int block_dim_y, int &blocks,
int &threads_x) {
const int max_block_dim_x = maxBlockSize / block_dim_y;
threads_x = (n < max_block_dim_x * 2) ? max(128, nextPow2((n + 1) / 2))
: max_block_dim_x;
blocks = CEIL_DIV(n, threads_x);
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {
printf("(%lf, %lf), ", src[i].x, src[i].y);
}
}
template <typename T> void print_debug(const char *name, const T *src, int N) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_debug_kernel<<<1, 1>>>(src, N);
cudaDeviceSynchronize();
printf("\n");
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
for (int i = 0; i < N; i++) {
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
cudaDeviceSynchronize();
printf("\n");
}
template <typename Torus>
void print_2d_csv_to_file(const std::vector<Torus> &v, int col_size,
const char *fname) {
FILE *fp = fopen(fname, "wt");
for (int i = 0; i < v.size() / col_size; ++i) {
for (int j = 0; j < col_size; ++j) {
fprintf(fp, "%lu%c", v[i * col_size + j],
(j == col_size - 1) ? '\n' : ',');
}
}
fclose(fp);
}
template <typename Torus>
__host__ void dump_2d_gpu_to_file(const Torus *ptr, int row_size, int col_size,
const char *fname_prefix, int rand_prefix,
cudaStream_t stream, uint32_t gpu_index) {
// #ifndef NDEBUG
std::vector<Torus> buf_cpu(row_size * col_size);
char fname[4096];
snprintf(fname, 4096, "%s_%d_%d_%d.csv", fname_prefix, row_size, col_size,
rand_prefix);
cuda_memcpy_async_to_cpu((void *)&buf_cpu[0], ptr,
buf_cpu.size() * sizeof(Torus), stream, gpu_index);
cuda_synchronize_device(gpu_index);
print_2d_csv_to_file(buf_cpu, col_size, fname);
// #endif
}
template <typename Torus>
__host__ void compare_2d_arrays(const Torus *ptr1, const Torus *ptr2,
int row_size, int col_size, cudaStream_t stream,
uint32_t gpu_index) {
// #ifndef NDEBUG
std::vector<Torus> buf_cpu1(row_size * col_size),
buf_cpu2(row_size * col_size);
;
cuda_memcpy_async_to_cpu((void *)&buf_cpu1[0], ptr1,
buf_cpu1.size() * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_cpu((void *)&buf_cpu2[0], ptr2,
buf_cpu2.size() * sizeof(Torus), stream, gpu_index);
cuda_synchronize_device(gpu_index);
std::vector<uint32_t> non_matching_indexes;
for (int i = 0; i < buf_cpu1.size(); ++i) {
if (buf_cpu1[i] != buf_cpu2[i]) {
non_matching_indexes.push_back(i);
}
}
if (!non_matching_indexes.empty()) {
std::stringstream ss;
for (int i = 0; i < std::min(non_matching_indexes.size(), (size_t)10);
++i) {
ss << " difference at " << non_matching_indexes[i] << ": "
<< buf_cpu1[non_matching_indexes[i]] << " vs "
<< buf_cpu2[non_matching_indexes[i]] << " at index "
<< non_matching_indexes[i] << "\n";
}
GPU_ASSERT(non_matching_indexes.empty(),
"Correctness error for matrices %d x %d: \n%s", row_size,
col_size, ss.str().c_str());
}
}
#endif
#endif // KERNEL_DIMENSIONS_H

View File

@@ -1,6 +1,12 @@
#ifndef HELPER_DEBUG_CUH
#define HELPER_DEBUG_CUH
#include "cuComplex.h"
#include "thrust/complex.h"
#include <cstdint>
#include <iostream>
#include <sstream>
#include <stdio.h>
#include <string>
#include <type_traits>
@@ -98,3 +104,130 @@ __device__
}
__syncthreads();
}
template <typename T> __device__ inline const char *get_format();
template <> __device__ inline const char *get_format<int>() { return "%d, "; }
template <> __device__ inline const char *get_format<unsigned int>() {
return "%u, ";
}
template <> __device__ inline const char *get_format<uint64_t>() {
return "%lu, ";
}
template <typename T> __global__ void print_debug_kernel(const T *src, int N) {
for (int i = 0; i < N; i++) {
printf(get_format<T>(), src[i]);
}
}
template <>
__global__ inline void print_debug_kernel(const __uint128_t *src, int N) {
for (int i = 0; i < N; i++) {
uint64_t low = static_cast<uint64_t>(src[i]);
uint64_t high = static_cast<uint64_t>(src[i] >> 64);
printf("(%llu, %llu), ", high, low);
}
}
template <>
__global__ inline void print_debug_kernel(const double2 *src, int N) {
for (int i = 0; i < N; i++) {
printf("(%lf, %lf), ", src[i].x, src[i].y);
}
}
template <typename T> void print_debug(const char *name, const T *src, int N) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_debug_kernel<<<1, 1>>>(src, N);
cudaDeviceSynchronize();
printf("\n");
}
template <typename T>
__global__ void print_body_kernel(T *src, int N, int lwe_dimension, T delta) {
for (int i = 0; i < N; i++) {
T body = src[i * (lwe_dimension + 1) + lwe_dimension];
T clear = body / delta;
printf("(%lu, %lu), ", body, clear);
}
}
template <typename T>
void print_body(const char *name, T *src, int n, int lwe_dimension, T delta) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_body_kernel<<<1, 1>>>(src, n, lwe_dimension, delta);
cudaDeviceSynchronize();
printf("\n");
}
template <typename Torus>
void print_2d_csv_to_file(const std::vector<Torus> &v, int col_size,
const char *fname) {
FILE *fp = fopen(fname, "wt");
for (int i = 0; i < v.size() / col_size; ++i) {
for (int j = 0; j < col_size; ++j) {
fprintf(fp, "%lu%c", v[i * col_size + j],
(j == col_size - 1) ? '\n' : ',');
}
}
fclose(fp);
}
template <typename Torus>
__host__ void dump_2d_gpu_to_file(const Torus *ptr, int row_size, int col_size,
const char *fname_prefix, int rand_prefix,
cudaStream_t stream, uint32_t gpu_index) {
// #ifndef NDEBUG
std::vector<Torus> buf_cpu(row_size * col_size);
char fname[4096];
snprintf(fname, 4096, "%s_%d_%d_%d.csv", fname_prefix, row_size, col_size,
rand_prefix);
cuda_memcpy_async_to_cpu((void *)&buf_cpu[0], ptr,
buf_cpu.size() * sizeof(Torus), stream, gpu_index);
cuda_synchronize_device(gpu_index);
print_2d_csv_to_file(buf_cpu, col_size, fname);
// #endif
}
template <typename Torus>
__host__ void compare_2d_arrays(const Torus *ptr1, const Torus *ptr2,
int row_size, int col_size, cudaStream_t stream,
uint32_t gpu_index) {
// #ifndef NDEBUG
std::vector<Torus> buf_cpu1(row_size * col_size),
buf_cpu2(row_size * col_size);
;
cuda_memcpy_async_to_cpu((void *)&buf_cpu1[0], ptr1,
buf_cpu1.size() * sizeof(Torus), stream, gpu_index);
cuda_memcpy_async_to_cpu((void *)&buf_cpu2[0], ptr2,
buf_cpu2.size() * sizeof(Torus), stream, gpu_index);
cuda_synchronize_device(gpu_index);
std::vector<uint32_t> non_matching_indexes;
for (int i = 0; i < buf_cpu1.size(); ++i) {
if (buf_cpu1[i] != buf_cpu2[i]) {
non_matching_indexes.push_back(i);
}
}
if (!non_matching_indexes.empty()) {
std::stringstream ss;
for (int i = 0; i < std::min(non_matching_indexes.size(), (size_t)10);
++i) {
ss << " difference at " << non_matching_indexes[i] << ": "
<< buf_cpu1[non_matching_indexes[i]] << " vs "
<< buf_cpu2[non_matching_indexes[i]] << " at index "
<< non_matching_indexes[i] << "\n";
}
GPU_ASSERT(non_matching_indexes.empty(),
"Correctness error for matrices %d x %d: \n%s", row_size,
col_size, ss.str().c_str());
}
}
#endif

View File

@@ -1,4 +1,5 @@
#include "device.h"
#include "helper.cuh"
#include "helper_multi_gpu.cuh"
#include <mutex>
#include <omp.h>
@@ -7,38 +8,7 @@ std::mutex m;
bool p2p_enabled = false;
const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
// Enable bidirectional p2p access between all available GPUs and device_0_id
int32_t cuda_setup_multi_gpu(int device_0_id) {
int num_gpus = cuda_get_number_of_gpus();
if (num_gpus == 0)
PANIC("GPU error: the number of GPUs should be > 0.")
int num_used_gpus = 1;
if (num_gpus > 1) {
m.lock();
if (!p2p_enabled) {
p2p_enabled = true;
omp_set_nested(1);
int has_peer_access_to_device_0;
for (int i = 1; i < num_gpus; i++) {
check_cuda_error(cudaDeviceCanAccessPeer(&has_peer_access_to_device_0,
i, device_0_id));
if (has_peer_access_to_device_0) {
cuda_set_device(i);
check_cuda_error(cudaDeviceEnablePeerAccess(device_0_id, 0));
cuda_set_device(device_0_id);
check_cuda_error(cudaDeviceEnablePeerAccess(i, 0));
}
num_used_gpus += 1;
}
} else {
for (int i = 1; i < num_gpus; i++)
num_used_gpus += 1;
}
m.unlock();
}
return (int32_t)(num_used_gpus);
}
const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128 = 12;
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type) {
@@ -46,7 +16,21 @@ uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
uint32_t ceil_div_inputs =
std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
std::max((uint32_t)1, CEIL_DIV(num_inputs, (uint32_t)threshold));
uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
return active_gpu_count;
}
// For pbs 128 we need to use the smaller threshold in both multi bit and
// classical
uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type) {
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
uint32_t ceil_div_inputs =
std::max((uint32_t)1, CEIL_DIV(num_inputs, (uint32_t)threshold));
uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
return active_gpu_count;
}

View File

@@ -30,6 +30,9 @@ void multi_gpu_copy_array_async(CudaStreams streams,
const std::vector<Torus *> &dest,
Torus const *src, uint32_t elements_per_gpu,
bool gpu_memory_allocated) {
PANIC_IF_FALSE(
dest.size() >= streams.count(),
"Cuda error: destination vector was not allocated for enough GPUs");
for (uint i = 0; i < streams.count(); i++) {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], src, elements_per_gpu * sizeof(Torus), streams.stream(i),
@@ -62,10 +65,12 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int classical_threshold = sizeof(Torus) == 16
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
: classical_threshold;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
@@ -100,10 +105,12 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int classical_threshold = sizeof(Torus) == 16
? THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
: classical_threshold;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
@@ -158,13 +165,11 @@ __global__ void realign_with_indexes(Torus *d_vector,
/// The output indexing is always the trivial one
/// num_inputs: total num of lwe in src
template <typename Torus>
void multi_gpu_scatter_lwe_async(CudaStreams streams,
const std::vector<Torus *> &dest,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
void multi_gpu_scatter_lwe_async(
CudaStreams streams, const std::vector<Torus *> &dest, Torus const *src,
Torus const *d_src_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec, CudaEventPool &event_pool,
uint32_t max_active_gpu_count, uint32_t num_inputs, uint32_t lwe_size) {
PANIC_IF_FALSE(
max_active_gpu_count >= streams.count(),
@@ -187,13 +192,16 @@ void multi_gpu_scatter_lwe_async(CudaStreams streams,
streams.stream(i), streams.gpu_index(i), true);
} else {
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
PANIC_IF_FALSE(aligned_vec.size() > 0,
"Cuda error: auxiliary arrays should be setup!");
PANIC_IF_FALSE(
aligned_vec.size() >= streams.count(),
"Cuda error: aligned vec was not allocated for enough GPUs");
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
cudaEvent_t temp_event2 = event_pool.request_event(streams.gpu_index(0));
cuda_set_device(streams.gpu_index(0));
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams.stream(0)>>>(
aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
@@ -207,7 +215,7 @@ void multi_gpu_scatter_lwe_async(CudaStreams streams,
dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
cudaEvent_t temp_event = cuda_create_event(streams.gpu_index(i));
cudaEvent_t temp_event = event_pool.request_event(streams.gpu_index(i));
cuda_event_record(temp_event, streams.stream(i), streams.gpu_index(i));
cuda_stream_wait_event(streams.stream(0), temp_event,
streams.gpu_index(0));
@@ -223,7 +231,8 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
const std::vector<Torus *> &src,
Torus *d_dest_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t num_inputs, uint32_t lwe_size) {
CudaEventPool &event_pool, uint32_t num_inputs,
uint32_t lwe_size) {
PANIC_IF_FALSE(src.size() >= streams.count(),
"Cuda error: src vector was not allocated for enough GPUs");
@@ -242,12 +251,15 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
} else {
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
PANIC_IF_FALSE(aligned_vec.size() > 0,
"Cuda error: auxiliary arrays should be setup!");
PANIC_IF_FALSE(
aligned_vec.size() >= streams.count(),
"Cuda error: aligned vec was not allocated for enough GPUs");
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
cudaEvent_t temp_event2 = cuda_create_event(streams.gpu_index(0));
cudaEvent_t temp_event2 = event_pool.request_event(streams.gpu_index(0));
cuda_event_record(temp_event2, streams.stream(0), streams.gpu_index(0));
cuda_stream_wait_event(streams.stream(i), temp_event2,
@@ -257,7 +269,7 @@ void multi_gpu_gather_lwe_async(CudaStreams streams, Torus *dest,
aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
cudaEvent_t temp_event3 = cuda_create_event(streams.gpu_index(i));
cudaEvent_t temp_event3 = event_pool.request_event(streams.gpu_index(i));
cuda_event_record(temp_event3, streams.stream(i), streams.gpu_index(i));
cuda_stream_wait_event(streams.stream(0), temp_event3,
streams.gpu_index(0));
@@ -302,6 +314,8 @@ void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus),
streams.stream(i), streams.gpu_index(i), true);
} else {
if (h_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
auto dest_indexes = h_dest_indexes + gpu_offset;
for (uint j = 0; j < inputs_on_gpu; j++) {
@@ -322,6 +336,8 @@ void multi_gpu_gather_many_lut_lwe_async(CudaStreams streams, Torus *dest,
template <typename Torus>
void multi_gpu_release_async(CudaStreams streams, std::vector<Torus *> &vec) {
PANIC_IF_FALSE(vec.size() >= streams.count(),
"Cuda error: vec was not allocated for enough GPUs");
for (uint i = 0; i < vec.size(); i++)
cuda_drop_async(vec[i], streams.stream(i), streams.gpu_index(i));
}

View File

@@ -1,32 +0,0 @@
#ifndef KERNEL_DIMENSIONS_CUH
#define KERNEL_DIMENSIONS_CUH
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
// Determines blocks and threads in x for a given blockDim.y using the same
// logic than above
inline void getNumBlocksAndThreads2D(const int n, const int maxBlockSize,
const int block_dim_y, int &blocks,
int &threads_x) {
const int max_block_dim_x = maxBlockSize / block_dim_y;
threads_x = (n < max_block_dim_x * 2) ? max(128, nextPow2((n + 1) / 2))
: max_block_dim_x;
blocks = (n + threads_x - 1) / threads_x;
}
#endif // KERNEL_DIMENSIONS_H

View File

@@ -8,10 +8,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
const bool *is_boolean_array, const uint32_t is_boolean_array_len,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
// Since CUDA backend works with the concept of "big" and "small" key, instead
// of "input" and "output", we need to do this or otherwise our PBS will throw
@@ -36,8 +36,9 @@ uint64_t scratch_cuda_expand_without_verification_64(
return scratch_cuda_expand_without_verification<uint64_t>(
CudaStreams(streams),
reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr),
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
computing_params, casting_params, casting_key_type, allocate_gpu_memory);
num_lwes_per_compact_list, is_boolean_array, is_boolean_array_len,
num_compact_lists, computing_params, casting_params, casting_key_type,
allocate_gpu_memory);
}
void cuda_expand_without_verification_64(

View File

@@ -11,10 +11,73 @@
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/kernel_dimensions.cuh"
#include "zk/zk_utilities.h"
#include <functional>
/*
* =============================================================================
* GPU Expand Algorithm: Overview
* =============================================================================
*
* The expand algorithm transforms compact ciphertexts into standard LWE
* ciphertexts. Compact ciphertexts save space by sharing a single mask across
* multiple encrypted messages.
*
* -----------------------------------------------------------------------------
* INPUT STRUCTURE (lwe_flattened_compact_array_in)
* -----------------------------------------------------------------------------
*
* A contiguous array of concatenated compact ciphertext lists:
*
* ┌─────────────────────────────────────────────────────────────────────────┐
* │ lwe_flattened_compact_array_in (GPU memory) │
* └─────────────────────────────────────────────────────────────────────────┘
* ┌────────────────────────────────┬────────────────────────────────────────┐
* │ Compact List 0 │ Compact List 1 │...
* └────────────────────────────────┴────────────────────────────────────────┘
*
* Each compact list structure:
* ┌─────────────────────────────────────────────┬────────┬────────┬─────────┐
* │ Shared Mask (lwe_dimension coefficients) │ Body 0 │ Body 1 │ Body 2
* │... │ [a_0, a_1, ..., a_{n-1}] │ b_0 │ b_1 │ b_2 │
* └─────────────────────────────────────────────┴────────┴────────┴─────────┘
* │<────────── lwe_dimension ──────────────────>│<── num_lwes_in_list ─────>│
*
* -----------------------------------------------------------------------------
* EXPAND PROCESS
* -----------------------------------------------------------------------------
*
* 1. LWE Expansion (lwe_expand kernel):
* Each (mask, body_i) pair becomes a standard LWE by rotating the mask
* by i positions: LWE_i = (rotate(mask, i), body_i)
*
* 2. Message/Carry Extraction (PBS with LUTs):
* Each expanded LWE contains packed data. PBS extracts both parts:
*
* Input LWE_i ──PBS──> Output[2i] (message extraction LUT)
* └───> Output[2i+1] (carry extraction LUT)
*
* For boolean values, sanitization LUTs clamp output to {0, 1}.
*
* -----------------------------------------------------------------------------
* OUTPUT STRUCTURE (lwe_array_out)
* -----------------------------------------------------------------------------
*
* ┌─────────────────────────────────────────────────────────────────────────┐
* │ lwe_array_out (2 * num_lwes standard LWEs) │
* └─────────────────────────────────────────────────────────────────────────┘
* ┌──────────────┬──────────────┬──────────────┬──────────────┬─────────────┐
* │ LWE 0 (msg) │ LWE 0 (carry)│ LWE 1 (msg) │ LWE 1 (carry)│ ... │
* └──────────────┴──────────────┴──────────────┴──────────────┴─────────────┘
*
* Each output LWE: [mask (lwe_dimension), body (1)] = lwe_dimension + 1
* elements
*
* See zk_utilities.h for detailed documentation on the is_boolean array and
* LUT indexing logic.
* =============================================================================
*/
template <typename Torus, class params>
__host__ void host_expand_without_verification(
CudaStreams streams, Torus *lwe_array_out,
@@ -107,15 +170,15 @@ template <typename Torus>
__host__ uint64_t scratch_cuda_expand_without_verification(
CudaStreams streams, zk_expand_mem<Torus> **mem_ptr,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
bool allocate_gpu_memory) {
const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
int_radix_params computing_params, int_radix_params casting_params,
KS_TYPE casting_key_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new zk_expand_mem<Torus>(
streams, computing_params, casting_params, casting_key_type,
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
allocate_gpu_memory, size_tracker);
num_lwes_per_compact_list, is_boolean_array, is_boolean_array_len,
num_compact_lists, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -67,7 +67,7 @@ public:
number_of_inputs = (int)GetParam().number_of_inputs;
// Enable Multi-GPU logic
gpu_count = cuda_setup_multi_gpu(0);
gpu_count = cuda_get_number_of_gpus();
active_gpu_count = std::min((uint)number_of_inputs, gpu_count);
for (uint gpu_i = 0; gpu_i < active_gpu_count; gpu_i++) {
streams.push_back(cuda_create_stream(gpu_i));

View File

@@ -2533,6 +2533,7 @@ unsafe extern "C" {
grouping_factor: u32,
num_lwes_per_compact_list: *const u32,
is_boolean_array: *const bool,
is_boolean_array_len: u32,
num_compact_lists: u32,
message_modulus: u32,
carry_modulus: u32,
@@ -2711,7 +2712,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_closest_representable_64(
pub fn cuda_closest_representable_64_async(
stream: *mut ffi::c_void,
gpu_index: u32,
input: *const ffi::c_void,

View File

@@ -13,27 +13,10 @@ extern "C" {
pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
pub fn cuda_malloc_with_size_tracking_async(
size: u64,
stream: *mut c_void,
gpu_index: u32,
size_tracker: *mut u64,
allocate_gpu_memory: bool,
) -> *mut c_void;
pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32) -> bool;
pub fn cuda_device_total_memory(gpu_index: u32) -> u64;
pub fn cuda_memcpy_with_size_tracking_async_to_gpu(
dest: *mut c_void,
src: *const c_void,
size: u64,
stream: *mut c_void,
gpu_index: u32,
gpu_memory_allocated: bool,
);
pub fn cuda_memcpy_async_to_gpu(
dest: *mut c_void,
src: *const c_void,
@@ -44,15 +27,6 @@ extern "C" {
pub fn cuda_memcpy_gpu_to_gpu(dest: *mut c_void, src: *const c_void, size: u64, gpu_index: u32);
pub fn cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest: *mut c_void,
src: *const c_void,
size: u64,
stream: *mut c_void,
gpu_index: u32,
gpu_memory_allocated: bool,
);
pub fn cuda_memcpy_async_gpu_to_gpu(
dest: *mut c_void,
src: *const c_void,
@@ -69,15 +43,6 @@ extern "C" {
gpu_index: u32,
);
pub fn cuda_memset_with_size_tracking_async(
dest: *mut c_void,
val: u64,
size: u64,
stream: *mut c_void,
gpu_index: u32,
gpu_memory_allocated: bool,
);
pub fn cuda_memset_async(
dest: *mut c_void,
val: u64,
@@ -94,16 +59,4 @@ extern "C" {
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
pub fn cuda_drop_with_size_tracking_async(
ptr: *mut c_void,
stream: *mut c_void,
gpu_index: u32,
size_tracker: *mut u64,
allocate_gpu_memory: bool,
);
pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
pub fn cuda_setup_multi_gpu(gpu_index: u32) -> i32;
} // extern "C"

View File

@@ -100,3 +100,9 @@ environment_name = "canada"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-L40x4"
user = "ubuntu"
[backend.hyperstack.4-l40_fallback]
environment_name = "norway"
image_name = "Ubuntu Server 22.04 LTS R570 CUDA 12.8"
flavor_name = "n3-RTX-A4000x4"
user = "ubuntu"

View File

@@ -28,6 +28,9 @@ if [[ "${RUN_VALGRIND}" == "0" && "${RUN_COMPUTE_SANITIZER}" == "0" ]]; then
exit 1
fi
# Array to collect error messages for final summary
ERROR_MESSAGES=()
# List the tests into a temporary file
RUSTFLAGS="$RUSTFLAGS" cargo nextest list --cargo-profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe &> /tmp/test_list.txt
@@ -46,14 +49,28 @@ if [[ "${RUN_VALGRIND}" == "1" ]]; then
# Find the test executable -> last one to have been modified
EXECUTABLE=target/release/deps/$(find target/release/deps/ -type f -executable -name "tfhe-*" -printf "%T@ %f\n" |sort -nr|sed 's/^.* //; q;')
# shellcheck disable=SC2181
RESULT=0
while read -r t; do
[ -z "$t" ] && continue
echo "Running valgrind on: $t"
VALGRIND_EXIT=0
valgrind --leak-check=full --show-leak-kinds=definite "$EXECUTABLE" -- "$t" || VALGRIND_EXIT=$?
valgrind --leak-check=full \
--show-leak-kinds=definite,indirect \
--errors-for-leak-kinds=definite,indirect \
--error-exitcode=1 \
"$EXECUTABLE" -- "$t" 2>&1 | tee /tmp/valgrind_output.log || VALGRIND_EXIT=$?
# Fail if the test crashed (non-zero exit code from valgrind)
if [[ $VALGRIND_EXIT -ne 0 ]]; then
ERROR_MESSAGES+=("Test crashed or valgrind returned error for test: $t")
RESULT=1
fi
# Also fail if memory errors reference tfhe/cuda code (not system libraries)
if grep -E "definitely lost|indirectly lost|Invalid read|Invalid write|Invalid free|Mismatched free" /tmp/valgrind_output.log | \
grep -q "tfhe\|cuda"; then
ERROR_MESSAGES+=("Memory error detected in tfhe/cuda code for test: $t")
RESULT=1
fi
done <<< "$TESTS_TO_RUN"
@@ -81,9 +98,22 @@ if [[ "${RUN_COMPUTE_SANITIZER}" == "1" ]]; then
--error-exitcode=1 --target-processes=all \
"$EXECUTABLE" -- "$t" || CS_EXIT=$?
if [[ $CS_EXIT -ne 0 ]]; then
ERROR_MESSAGES+=("Compute-sanitizer detected error for test: $t")
RESULT=1
fi
done <<< "$TESTS_TO_RUN"
fi
# Print summary of errors if any were encountered
if [[ ${#ERROR_MESSAGES[@]} -gt 0 ]]; then
echo ""
echo "========================================"
echo "MEMORY ERROR SUMMARY"
echo "========================================"
for msg in "${ERROR_MESSAGES[@]}"; do
echo " - $msg"
done
echo "========================================"
fi
exit $RESULT

View File

@@ -3,9 +3,7 @@ use crate::{load_and_unversionize, TestedModule};
use std::path::Path;
#[cfg(feature = "zk-pok")]
use tfhe::integer::parameters::DynamicDistribution;
use tfhe::prelude::{
CiphertextList, FheDecrypt, FheEncrypt, ParameterSetConformant, ReRandomize, SquashNoise,
};
use tfhe::prelude::*;
#[cfg(feature = "zk-pok")]
use tfhe::shortint::parameters::{
CompactCiphertextListExpansionKind, CompactPublicKeyEncryptionParameters,
@@ -13,27 +11,29 @@ use tfhe::shortint::parameters::{
#[cfg(feature = "zk-pok")]
use tfhe::shortint::prelude::LweDimension;
use tfhe::shortint::{CarryModulus, CiphertextModulus, MessageModulus};
use tfhe::xof_key_set::CompressedXofKeySet;
#[cfg(feature = "zk-pok")]
use tfhe::zk::{CompactPkeCrs, CompactPkeCrsConformanceParams};
#[cfg(feature = "zk-pok")]
use tfhe::ProvenCompactCiphertextList;
use tfhe::{
set_server_key, ClientKey, CompactCiphertextList, CompressedCiphertextList,
set_server_key, ClientKey, CompactCiphertextList, CompactCiphertextListBuilder,
CompactPublicKey, CompressedCiphertextList, CompressedCiphertextListBuilder,
CompressedCompactPublicKey, CompressedFheBool, CompressedFheInt8, CompressedFheUint8,
CompressedKVStore, CompressedPublicKey, CompressedServerKey,
CompressedSquashedNoiseCiphertextList, FheBool, FheInt8, FheUint64, FheUint8,
ReRandomizationContext, ServerKey, SquashedNoiseFheBool, SquashedNoiseFheInt,
SquashedNoiseFheUint,
CompressedSquashedNoiseCiphertextList, CompressedSquashedNoiseCiphertextListBuilder, FheBool,
FheInt8, FheUint32, FheUint64, FheUint8, ReRandomizationContext, ServerKey,
SquashedNoiseFheBool, SquashedNoiseFheInt, SquashedNoiseFheUint,
};
#[cfg(feature = "zk-pok")]
use tfhe::{CompactPublicKey, ProvenCompactCiphertextList};
use tfhe_backward_compat_data::load::{
load_versioned_auxiliary, DataFormat, TestFailure, TestResult, TestSuccess,
};
use tfhe_backward_compat_data::{
DataKind, HlBoolCiphertextTest, HlCiphertextTest, HlClientKeyTest, HlCompressedKVStoreTest,
HlCompressedSquashedNoiseCiphertextListTest, HlHeterogeneousCiphertextListTest,
HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest, HlSquashedNoiseBoolCiphertextTest,
HlSquashedNoiseSignedCiphertextTest, HlSquashedNoiseUnsignedCiphertextTest, TestMetadata,
TestType, Testcase, ZkPkePublicParamsTest,
HlCompressedSquashedNoiseCiphertextListTest, HlCompressedXofKeySetTest,
HlHeterogeneousCiphertextListTest, HlPublicKeyTest, HlServerKeyTest, HlSignedCiphertextTest,
HlSquashedNoiseBoolCiphertextTest, HlSquashedNoiseSignedCiphertextTest,
HlSquashedNoiseUnsignedCiphertextTest, TestMetadata, TestType, Testcase, ZkPkePublicParamsTest,
};
use tfhe_versionable::Unversionize;
@@ -360,6 +360,155 @@ pub fn test_hl_pubkey(
}
}
/// Shared feature-testing logic for server keys: computation, re-randomization, noise squashing,
/// compression, and compressed noise-squashed lists.
fn test_hl_key_features(
client_key: &ClientKey,
server_key: ServerKey,
compact_public_key: Option<&CompactPublicKey>,
test: &impl TestType,
format: DataFormat,
) -> Result<(), TestFailure> {
set_server_key(server_key.clone());
let clear_a = 278120u32;
let clear_b = 839412u32;
let (mut a, mut b) = match compact_public_key {
Some(pk) => {
let compact_list = CompactCiphertextListBuilder::new(pk)
.push(clear_a)
.push(clear_b)
.build_packed();
let expanded = compact_list
.expand()
.map_err(|e| test.failure(format!("Failed to expand: {e}"), format))?;
let a: FheUint32 = expanded.get(0).unwrap().unwrap();
let b: FheUint32 = expanded.get(1).unwrap().unwrap();
(a, b)
}
None => {
let a = FheUint32::encrypt(clear_a, client_key);
let b = FheUint32::encrypt(clear_b, client_key);
(a, b)
}
};
// Re-randomization
if let (Some(pk), true) = (
compact_public_key,
server_key.supports_ciphertext_re_randomization(),
) {
let nonce: [u8; 256 / 8] = core::array::from_fn(|i| i as u8);
let mut re_rand_context = ReRandomizationContext::new(
*b"TFHE_Rrd",
[b"FheUint32 bin ops".as_slice(), nonce.as_slice()],
*b"TFHE_Enc",
);
re_rand_context.add_ciphertext(&a);
re_rand_context.add_ciphertext(&b);
let mut seed_gen = re_rand_context.finalize();
a.re_randomize(pk, seed_gen.next_seed().unwrap())
.map_err(|e| test.failure(format!("Failed to re-randomize a: {e}"), format))?;
b.re_randomize(pk, seed_gen.next_seed().unwrap())
.map_err(|e| test.failure(format!("Failed to re-randomize b: {e}"), format))?;
}
// Computation
let c = &a + &b;
let d = &a & &b;
let expected_c = clear_a.wrapping_add(clear_b);
let expected_d = clear_a & clear_b;
for (val, expected) in [&c, &d].iter().zip([expected_c, expected_d]) {
let dec: u32 = val.decrypt(client_key);
if dec != expected {
return Err(test.failure(
format!("Invalid decryption: expected {expected}, got {dec}"),
format,
));
}
}
// Noise squashing
if server_key.supports_noise_squashing() {
let ns_c = c
.squash_noise()
.map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
let ns_d = d
.squash_noise()
.map_err(|e| test.failure(format!("Failed to squash noise: {e}"), format))?;
for (ns_val, expected) in [&ns_c, &ns_d].iter().zip([expected_c, expected_d]) {
let dec: u32 = ns_val.decrypt(client_key);
if dec != expected {
return Err(test.failure(
format!("Invalid noise-squashed decryption: expected {expected}, got {dec}"),
format,
));
}
}
if server_key.supports_noise_squashing_compression() {
// Compressed noise-squashed ciphertext list
let ns_compressed_list = CompressedSquashedNoiseCiphertextListBuilder::new()
.push(ns_c)
.push(ns_d)
.build()
.map_err(|e| {
test.failure(
format!("Failed to build compressed squashed noise list: {e}"),
format,
)
})?;
for (i, expected) in [expected_c, expected_d].iter().enumerate() {
let val: SquashedNoiseFheUint = ns_compressed_list.get(i).unwrap().unwrap();
let dec: u32 = val.decrypt(client_key);
if dec != *expected {
return Err(test.failure(
format!(
"Invalid compressed noise-squashed[{i}]: \
expected {expected}, got {dec}"
),
format,
));
}
}
}
}
// Compression / decompression
if server_key.supports_compression() {
let compressed_list = CompressedCiphertextListBuilder::new()
.push(a)
.push(b)
.push(c)
.push(d)
.build()
.map_err(|e| test.failure(format!("Failed to build compressed list: {e}"), format))?;
let expected_values = [clear_a, clear_b, expected_c, expected_d];
for (i, expected) in expected_values.iter().enumerate() {
let val: FheUint32 = compressed_list.get(i).unwrap().unwrap();
let dec: u32 = val.decrypt(client_key);
if dec != *expected {
return Err(test.failure(
format!("Invalid decompressed[{i}]: expected {expected}, got {dec}"),
format,
));
}
}
}
Ok(())
}
/// Test HL server key: encrypt two values with a client key, add them using the server key and
/// check that the decrypted sum is valid.
pub fn test_hl_serverkey(
@@ -373,11 +522,6 @@ pub fn test_hl_serverkey(
)
.map_err(|e| test.failure(e, format))?;
let v1 = 73u8;
let mut ct1 = FheUint8::encrypt(v1, &client_key);
let v2 = 102u8;
let ct2 = FheUint8::encrypt(v2, &client_key);
let key = if test.compressed {
let compressed: CompressedServerKey = load_and_unversionize(dir, test, format)?;
compressed.decompress()
@@ -385,77 +529,20 @@ pub fn test_hl_serverkey(
load_and_unversionize(dir, test, format)?
};
let has_noise_squashing = key.supports_noise_squashing();
let has_rerand = key.supports_ciphertext_re_randomization();
set_server_key(key);
if has_noise_squashing {
let ns = ct1.squash_noise().unwrap();
let res: u8 = ns.decrypt(&client_key);
if res != v1 {
return Err(test.failure(
format!(
"Invalid result for noise squashing using loaded server key, expected {v1} got {res}",
),
format,
));
}
}
if let Some(rerand_cpk_filename) = test.rerand_cpk_filename.as_ref() {
if has_rerand {
let rerand_cpk_file = dir.join(rerand_cpk_filename.to_string());
let public_key = CompressedCompactPublicKey::unversionize(
load_versioned_auxiliary(rerand_cpk_file).map_err(|e| test.failure(e, format))?,
let compact_public_key = test
.rerand_cpk_filename
.as_ref()
.map(|filename| {
let cpk_file = dir.join(filename.to_string());
CompressedCompactPublicKey::unversionize(
load_versioned_auxiliary(cpk_file).map_err(|e| test.failure(e, format))?,
)
.map_err(|e| test.failure(e, format))?
.decompress();
.map_err(|e| test.failure(e, format))
.map(|cpk| cpk.decompress())
})
.transpose()?;
let nonce: [u8; 256 / 8] = rand::random();
let mut re_rand_context = ReRandomizationContext::new(
*b"TFHE_Rrd",
[b"FheUint8".as_slice(), nonce.as_slice()],
*b"TFHE_Enc",
);
re_rand_context.add_ciphertext(&ct1);
let mut seed_gen = re_rand_context.finalize();
ct1.re_randomize(&public_key, seed_gen.next_seed().unwrap())
.unwrap();
#[allow(clippy::eq_op)]
let rrd = &ct1 & &ct1;
let res: u8 = rrd.decrypt(&client_key);
if res != v1 {
return Err(test.failure(
format!(
"Invalid result for rerand using loaded server key, expected {v1} got {res}",
),
format,
));
}
} else {
return Err(test.failure(
"Test requires rerand key but server key does not have it".to_string(),
format,
));
}
}
let ct_sum = ct1 + ct2;
let sum: u8 = ct_sum.decrypt(&client_key);
if sum != v1 + v2 {
return Err(test.failure(
format!(
"Invalid result for addition using loaded server key, expected {} got {}",
v1 + v2,
sum,
),
format,
));
}
test_hl_key_features(&client_key, key, compact_public_key.as_ref(), test, format)?;
Ok(test.success(format))
}
@@ -659,6 +746,39 @@ fn test_hl_compressed_kv_store_test(
Ok(test.success(format))
}
fn test_hl_compressed_xof_key_set_test(
dir: &Path,
test: &HlCompressedXofKeySetTest,
format: DataFormat,
) -> Result<TestSuccess, TestFailure> {
let client_key_file = dir.join(&*test.client_key_file_name);
let client_key = ClientKey::unversionize(
load_versioned_auxiliary(client_key_file).map_err(|e| test.failure(e, format))?,
)
.map_err(|e| test.failure(format!("Failed to load client key file: {e}"), format))?;
let compressed_xof_key_set_file = dir.join(&*test.compressed_xof_key_set_file_name);
let compressed_xof_key_set = CompressedXofKeySet::unversionize(
load_versioned_auxiliary(compressed_xof_key_set_file)
.map_err(|e| test.failure(e, format))?,
)
.map_err(|e| {
test.failure(
format!("Failed to load compressed xof key set file: {e}"),
format,
)
})?;
let xof_key_set = compressed_xof_key_set
.decompress()
.map_err(|e| test.failure(format!("Failed to decompress the xof key set: {e}"), format))?;
let (pk, server_key) = xof_key_set.into_raw_parts();
test_hl_key_features(&client_key, server_key, Some(&pk), test, format)?;
Ok(test.success(format))
}
pub struct Hl;
impl TestedModule for Hl {
@@ -711,6 +831,9 @@ impl TestedModule for Hl {
TestMetadata::HlCompressedKVStoreTest(test) => {
test_hl_compressed_kv_store_test(test_dir.as_ref(), test, format).into()
}
TestMetadata::HlCompressedXofKeySet(test) => {
test_hl_compressed_xof_key_set_test(test_dir.as_ref(), test, format).into()
}
_ => {
println!("WARNING: missing test: {:?}", testcase.metadata);
TestResult::Skipped(testcase.skip())

View File

@@ -197,10 +197,10 @@ mod cuda {
use tfhe::integer::ciphertext::ReRandomizationContext;
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use tfhe::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use tfhe::integer::gpu::key_switching_key::CudaKeySwitchingKeyMaterial;
use tfhe::integer::key_switching_key::KeySwitchingKey;
use tfhe::integer::{gen_keys_radix, CompactPrivateKey, CompactPublicKey};
use tfhe::keycache::NamedParam;
use tfhe::shortint::key_switching_key::CudaKeySwitchingKeyMaterial;
fn execute_gpu_re_randomize(c: &mut Criterion, bit_size: usize) {
let bench_name = "integer::cuda::re_randomize";

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-csprng"
version = "0.8.0"
version = "0.8.1"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "Cryptographically Secure PRNG used in the TFHE-rs library."
@@ -25,6 +25,8 @@ libc = "0.2.133"
rand = { workspace = true }
criterion = "0.5.1"
clap = "=4.5.30"
ctr = "0.9.2"
[features]
parallel = ["rayon"]

View File

@@ -9,7 +9,7 @@ use crate::generators::aes_ctr::{AES_CALLS_PER_BATCH, BYTES_PER_AES_CALL, BYTES_
/// needs to be loaded with [u128::from_le] (to keep consistency of the loaded bytes across systems
/// endianness), the rest of the code should use the [`AesKey`] with native endian ordering such
/// that the internal u128 is equivalent to [u8; 16].
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Debug)]
pub(crate) struct AesKey(pub(crate) u128);
/// A trait for AES block ciphers.

Some files were not shown because too many files have changed in this diff Show More