Compare commits

...

37 Commits

Author SHA1 Message Date
Agnes Leroy
7dcbd85a83 chore(gpu): stop using optional arguments altogether 2024-09-02 15:51:45 +02:00
Agnes Leroy
1e453263af chore(gpu): remove device synchronization in drop for CudaVec 2024-09-02 15:09:08 +02:00
dependabot[bot]
c258d53625 chore(deps): bump actions/upload-artifact from 4.3.6 to 4.4.0
Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.3.6 to 4.4.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](834a144ee9...50769540e7)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-09-02 09:34:34 +02:00
tmontaigu
8ddee20a57 feat(tfhe): add get_kind_of to CompactCiphertextList
This adds the ability to query the length and the types
contained in a CompactCiphertextList and ProvenCompactCiphertextList
without having to expand the list or verify the proof(s)
2024-08-30 21:01:46 +02:00
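A hypothetical usage sketch of the new query API (the `get_kind_of` name comes from the commit title; the index parameter, the `Option<FheTypes>` return type, and `len()` are assumptions, not confirmed signatures):

```rust
// Sketch only: inspect what a CompactCiphertextList holds without
// expanding it. Assumes get_kind_of(i) -> Option<FheTypes> and len().
use tfhe::{CompactCiphertextList, FheTypes};

fn describe(list: &CompactCiphertextList) {
    for i in 0..list.len() {
        match list.get_kind_of(i) {
            Some(FheTypes::Uint64) => println!("slot {i}: a 64-bit unsigned integer"),
            Some(other) => println!("slot {i}: {other:?}"),
            None => println!("slot {i}: out of range"),
        }
    }
}
```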
tmontaigu
1d786b7202 feat(wasm): bind CompactListExpander::get_kind_of
And other things to allow wasm users to explore
what kind of data is stored in the compact list.
2024-08-30 21:01:46 +02:00
tmontaigu
7267d60e01 feat(integer): implement unsigned_overflowing_scalar_sub 2024-08-29 19:09:48 +02:00
Arthur Meyre
0148a6ffc8 chore(tfhe): update dependencies with breaking changes
- concrete-fft to 0.5 and concrete-ntt to 0.2.0 due to a Rust AVX512 breaking
change (a fix for bad args in a function)
- dyn-stack to 0.10 due to concrete-fft update
2024-08-29 17:36:19 +02:00
tmontaigu
63571a07ae feat(integer): add is_even/is_odd functions
These are pretty simple, so they are also implemented directly for the GPU
2024-08-29 14:24:40 +02:00
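Why these are cheap (a cleartext sketch of the reasoning, illustrative only): with radix blocks in base 4, every block but the least significant contributes a multiple of 4, so parity depends only on the low bit of block 0; in FHE that is a single LUT on one block, which is also why the GPU version is straightforward.

```rust
// Cleartext analogue: parity of a radix integer (little-endian base-4
// blocks) is decided by the low bit of the least-significant block.
fn is_even(blocks: &[u8]) -> bool {
    blocks[0] & 1 == 0
}

fn main() {
    assert!(!is_even(&[3, 2, 1])); // 3 + 2*4 + 1*16 = 27, odd
    assert!(is_even(&[2, 1, 1])); // 2 + 1*4 + 1*16 = 22, even
}
```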
Arthur Meyre
6e2908ad4e chore(bench): fix CRS size for integer ZK bench 2024-08-29 09:41:35 +02:00
sarah el kazdadi
d3d06c905f feat(tfhe): replace asm with rust intrinsics 2024-08-29 09:41:20 +02:00
Arthur Meyre
051f33f166 chore(hl): remove second server key generation
- a bad merge led to two server key generations in the HL API; fix that
2024-08-28 15:25:35 +02:00
Mayeul@Zama
11a8f97a1c chore(all): use destructuring in conformance 2024-08-26 17:28:05 +02:00
tmontaigu
35a9c323a7 chore(integer): make remaining non-parallel tests use defined test cases
This makes the remaining non-parallel ops implementations use the same
test cases that are used for the parallel implementations.

There are still some tests that do not share these test cases, but that is
either because they do not have a parallel impl (not interesting to have) or
because they test encryption/decryption

Closes https://github.com/zama-ai/tfhe-rs-internal/issues/265
2024-08-26 10:13:11 +02:00
dependabot[bot]
641f47b775 chore(deps): bump tj-actions/changed-files from 44.5.7 to 45.0.0
Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 44.5.7 to 45.0.0.
- [Release notes](https://github.com/tj-actions/changed-files/releases)
- [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md)
- [Commits](c65cd88342...40853de9f8)

---
updated-dependencies:
- dependency-name: tj-actions/changed-files
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-08-26 09:52:06 +02:00
tmontaigu
456d0ced1b chore(integer): addition test based on trivial inputs
This adds `overflowing_add` and `add` tests that
run on trivial inputs. As these are faster to run, they
can be more extensive than tests on true encryptions

This also extends the advanced_add_assign function tests
to include overflow computation

On a standard laptop with 1 test thread it takes ~7 minutes
to run these trivial tests
2024-08-23 16:28:40 +02:00
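A minimal sketch of the trivial-input testing idea (the integer-API names `gen_keys_radix`, `create_trivial_radix`, and `add_parallelized` are assumed from tfhe-rs; the parameter set is illustrative):

```rust
use tfhe::integer::gen_keys_radix;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

fn main() {
    let (cks, sks) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 4);
    // Trivial "encryptions" carry no noise and are nearly free to create,
    // so far more input pairs can be exercised per unit of time than with
    // true encryptions.
    let a = sks.create_trivial_radix(13u64, 4);
    let b = sks.create_trivial_radix(7u64, 4);
    let sum = sks.add_parallelized(&a, &b);
    let clear: u64 = cks.decrypt(&sum);
    assert_eq!(clear, 20);
}
```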
tmontaigu
358bcc9a22 feat(integer): implement sub_assign_with_borrow
This brings the same kind of speed-ups for unsigned_overflow
as we got in the previous commits that changed the carry propagation
algorithm
2024-08-21 09:56:40 +02:00
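The identity behind a subtraction-with-borrow (a plain-Rust cleartext sketch; the FHE version applies the same relation blockwise): x − y = x + !y + 1 in two's complement, and the borrow-out is the complement of the addition's carry-out.

```rust
// Cleartext illustration: subtraction via addition of the complement.
// borrow_out == !carry_out of (x + !y + 1).
fn sub_with_borrow(x: u8, y: u8) -> (u8, bool) {
    let (sum, c1) = x.overflowing_add(!y);
    let (res, c2) = sum.overflowing_add(1);
    (res, !(c1 || c2)) // borrow is the negated carry
}

fn main() {
    assert_eq!(sub_with_borrow(5, 7), (254, true)); // 5 - 7 wraps, borrows
    assert_eq!(sub_with_borrow(9, 4), (5, false));
}
```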
Pedro Alves
27a4564d83 fix(gpu): fix compression benchmarking 2024-08-20 17:46:20 -03:00
Arthur Meyre
296e419f6c chore(ci): update tfhe-lints to more recent toolchain 2024-08-20 13:02:12 +02:00
Arthur Meyre
e1a25a10ac chore(docs): fix README link to getting started 2024-08-19 15:35:52 +02:00
Arthur Meyre
d9349b3357 chore(ci): update nightly toolchain 2024-08-19 15:35:52 +02:00
Arthur Meyre
68e4ac4896 chore(ci): fix lints for new nightly toolchain 2024-08-19 15:35:52 +02:00
tmontaigu
3f318a2046 feat(wasm): add missing push_u{512,1024,2048}
This adds the missing push functions for some big
uint types that the fhEVM needs
2024-08-19 10:12:53 +02:00
tmontaigu
d1380794ed chore(tfhe): bump version to 0.8.0-alpha.3 2024-08-19 10:12:53 +02:00
Pedro Alves
fe5641ef6d feat(gpu): implement CUDA-based Radix Integer compression and public functional packing keyswitch 2024-08-16 15:44:34 -03:00
Arthur Meyre
3397aa81d2 chore(ci): update node to 22.6 2024-08-14 13:42:14 +02:00
Arthur Meyre
8f10f8f8db chore(ci): reduce bench loops for WASM compressed server key
- excessive loops seemed to trigger a crash, likely due to memory
exhaustion/fragmentation
2024-08-14 13:42:14 +02:00
Arthur Meyre
92be95c6b8 chore(ci): fix parsing for integer benchmarks 2024-08-14 13:42:14 +02:00
Arthur Meyre
990c4d0380 chore(ci): do not run all steps on slow runners 2024-08-14 13:42:14 +02:00
Arthur Meyre
1d5abfd5ea chore(ci): do not run tests nightly, on push only if relevant files changed 2024-08-14 13:42:14 +02:00
Arthur Meyre
dfd1beeb47 chore(ci): avoid concurrency lock for PKE ZK benchmarks
- sharing a concurrency group on merges to main means that of two sequential
merges, the second is locked while it waits for the first to complete
2024-08-14 13:42:14 +02:00
Arthur Meyre
43a007a2fa chore(ci): make sure the newline linter runs 2024-08-14 13:42:14 +02:00
Arthur Meyre
54faf64ecd chore(tfhe): bump tfhe-versionable version to 0.2.1 2024-08-14 13:17:21 +02:00
Arthur Meyre
8fe7f9c3cb chore(ci): add workflow to publish tfhe-versionable 2024-08-14 13:17:21 +02:00
Arthur Meyre
9ed65db03d chore(ci): csprng release workflow misc fixes 2024-08-14 13:17:21 +02:00
tmontaigu
9413d3e722 feat(integer): improve {overflowing_}scalar_add/sub 2024-08-14 12:30:53 +02:00
Ben
2000feb87e chore(CI): update LE commit 2024-08-13 14:56:27 +01:00
tmontaigu
594a5cee25 fix(integer): remove double carry prop in sub
The subtraction is done via addition of the negation, and the
negation is done via unchecked_neg, which makes the
first block have a carry.
We then called add_assign_with_carry_parallelized, which did
a carry propagation on the rhs (here, the negated value),
meaning the subtraction would do two carry propagations.

To fix that, we directly call the lower-level function.
2024-08-13 14:45:57 +02:00
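A cleartext model of why the fix helps (all names illustrative): negation leaves a pending carry in block 0, so the subtraction should do the blockwise add first and propagate carries exactly once at the end, instead of letting the high-level add propagate the negated rhs and then the sum.

```rust
// Radix blocks in base 4 (message modulus 4), little-endian.
const MSG_MOD: u8 = 4;

fn propagate(blocks: &mut [u8]) {
    let mut carry = 0;
    for b in blocks.iter_mut() {
        let v = *b + carry;
        *b = v % MSG_MOD;
        carry = v / MSG_MOD;
    }
}

// Cleartext analogue of unchecked_neg: per-block complement, with the
// +1 left as an unpropagated carry in block 0's carry space.
fn unchecked_neg(blocks: &[u8]) -> Vec<u8> {
    let mut out: Vec<u8> = blocks.iter().map(|b| (MSG_MOD - 1) - b).collect();
    out[0] += 1;
    out
}

fn main() {
    let lhs = [3, 2, 1]; // 3 + 2*4 + 1*16 = 27
    let rhs = [1, 1, 0]; // 1 + 1*4      = 5
    let neg = unchecked_neg(&rhs);
    // Blockwise add, then a SINGLE carry propagation at the end.
    let mut res: Vec<u8> = lhs.iter().zip(&neg).map(|(a, b)| a + b).collect();
    propagate(&mut res);
    let value = res.iter().rev().fold(0u32, |acc, &b| acc * 4 + b as u32);
    assert_eq!(value, 22); // 27 - 5, mod 4^3
}
```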
162 changed files with 6626 additions and 2217 deletions

View File

@@ -56,7 +56,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -19,21 +19,48 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
push:
branches:
- main
schedule:
# Nightly tests @ 3AM after each work day
- cron: "0 3 * * MON-FRI"
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (unsigned-integer-tests)
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +87,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
persist-credentials: "false"
- name: Set up home
run: |
@@ -103,7 +130,7 @@ jobs:
teardown-instance:
name: Teardown instance (unsigned-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, unsigned-integer-tests ]
needs: [setup-instance, unsigned-integer-tests]
runs-on: ubuntu-latest
steps:
- name: Stop instance

View File

@@ -19,21 +19,48 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
types: [labeled]
push:
branches:
- main
schedule:
# Nightly tests @ 3AM after each work day
- cron: "0 3 * * MON-FRI"
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: write
outputs:
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
persist-credentials: "false"
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
integer:
- tfhe/Cargo.toml
- concrete-csprng/**
- tfhe-zk-pok/**
- tfhe/src/core_crypto/**
- tfhe/src/shortint/**
- tfhe/src/integer/**
setup-instance:
name: Setup instance (signed-integer-tests)
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
name: Setup instance (unsigned-integer-tests)
needs: should-run
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
@@ -60,7 +87,7 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
persist-credentials: 'false'
persist-credentials: "false"
- name: Set up home
run: |
@@ -107,7 +134,7 @@ jobs:
teardown-instance:
name: Teardown instance (signed-integer-tests)
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, signed-integer-tests ]
needs: [setup-instance, signed-integer-tests]
runs-on: ubuntu-latest
steps:
- name: Stop instance

View File

@@ -63,7 +63,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -98,7 +98,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_boolean
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -19,14 +19,21 @@ jobs:
strategy:
matrix:
os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
# even with a few PRs
os: [large_ubuntu_16, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
- name: Install latest stable
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
with:
toolchain: stable
- name: Install and run newline linter checks
if: matrix.os == 'ubuntu-latest'
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
@@ -36,27 +43,33 @@ jobs:
make check_newline
- name: Run pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc
- name: Build concrete-csprng
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_concrete_csprng
- name: Build Release core
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_core AVX512_SUPPORT=ON
make build_core_experimental AVX512_SUPPORT=ON
- name: Build Release boolean
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_boolean
- name: Build Release shortint
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_shortint
- name: Build Release integer
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_integer
@@ -65,10 +78,12 @@ jobs:
make build_tfhe_full
- name: Build Release c_api
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_c_api
- name: Build coverage tests
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make build_tfhe_coverage

View File

@@ -57,7 +57,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
files_yaml: |
tfhe:

View File

@@ -86,7 +86,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -128,7 +128,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -82,7 +82,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer_multi_bit_gpu_default
path: ${{ env.RESULTS_FILENAME }}
@@ -164,7 +164,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -33,7 +33,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -42,7 +42,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -34,7 +34,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -41,7 +41,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |

View File

@@ -139,7 +139,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -124,7 +124,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -147,7 +147,7 @@ jobs:
parse_integer_benches
- name: Upload csv results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_csv_integer
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
@@ -167,7 +167,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -164,7 +164,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -144,7 +144,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Prepare package
run: |
cargo package -p tfhe
- uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: crate
path: target/package/*.crate

View File

@@ -1,4 +1,3 @@
# Publish new release of tfhe-rs on various platform.
name: Publish concrete-csprng release
on:
@@ -37,6 +36,6 @@ jobs:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -0,0 +1,36 @@
name: Publish tfhe-versionable release
on:
workflow_dispatch:
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
jobs:
publish_release:
name: Publish tfhe-versionable Release
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
fetch-depth: 0
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
run: |
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
env:
SLACK_COLOR: ${{ job.status }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

View File

@@ -14,7 +14,7 @@ on:
jobs:
params-curves-security-check:
runs-on: ubuntu-latest
runs-on: large_ubuntu_16
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
@@ -24,7 +24,7 @@ jobs:
with:
repository: malb/lattice-estimator
path: lattice_estimator
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
- name: Install Sage
run: |

View File

@@ -141,7 +141,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -139,7 +139,7 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
@@ -130,7 +130,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_wasm
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -36,7 +36,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
with:
since_last_remote_commit: true
files_yaml: |
@@ -79,7 +79,7 @@ jobs:
if: needs.setup-instance.result != 'skipped'
needs: setup-instance
concurrency:
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
@@ -138,7 +138,7 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
with:
name: ${{ github.sha }}_integer_zk
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -18,7 +18,7 @@ FAST_TESTS?=FALSE
FAST_BENCH?=FALSE
NIGHTLY_TESTS?=FALSE
BENCH_OP_FLAVOR?=DEFAULT
NODE_VERSION=22.4
NODE_VERSION=22.6
FORWARD_COMPAT?=OFF
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
BACKWARD_COMPAT_DATA_BRANCH?=v0.1

View File

@@ -159,7 +159,7 @@ To run this code, use the following command:
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
<p align="right">
<a href="#about" > ↑ Back to top </a>

View File

@@ -148,10 +148,9 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
let [o, a, b, c] = self.get_output_and_values(0);
@@ -226,18 +225,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let mut values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -237,18 +237,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -120,10 +120,9 @@ where
/// Computes one turn of the stream, updating registers and outputting the new bit.
pub fn next_bool(&mut self) -> T {
match &self.fhe_key {
Some(sk) => set_server_key(sk.clone()),
None => (),
};
if let Some(sk) = &self.fhe_key {
set_server_key(sk.clone());
}
let [o, a, b, c] = self.get_output_and_values(0);
@@ -196,18 +195,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let mut values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut ret = Vec::<T>::with_capacity(64);

View File

@@ -187,18 +187,12 @@ where
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
/// Vec (first value is oldest, last is newest)
pub fn next_64(&mut self) -> Vec<T> {
match &self.fhe_key {
Some(sk) => {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
None => (),
if let Some(sk) = &self.fhe_key {
rayon::broadcast(|_| set_server_key(sk.clone()));
}
let values = self.get_64_output_and_values();
match &self.fhe_key {
Some(_) => {
rayon::broadcast(|_| unset_server_key());
}
None => (),
if self.fhe_key.is_some() {
rayon::broadcast(|_| unset_server_key());
}
let mut bytes = Vec::<T>::with_capacity(8);

View File

@@ -0,0 +1,156 @@
#ifndef CUDA_INTEGER_COMPRESSION_H
#define CUDA_INTEGER_COMPRESSION_H
#include "integer.h"
extern "C" {
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
}
template <typename Torus> struct int_compression {
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
uint32_t body_count;
// Compression
int8_t *fp_ks_buffer;
Torus *tmp_lwe;
Torus *tmp_glwe_array_out;
int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
tmp_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
scratch_packing_keyswitch_lwe_list_to_glwe_64(
streams[0], gpu_indexes[0], &fp_ks_buffer,
compression_params.glwe_dimension, compression_params.polynomial_size,
num_radix_blocks, true);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
&fp_ks_buffer);
}
};
template <typename Torus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t body_count;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
int_radix_lut<Torus> *carry_extract_lut;
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
carry_extract_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_radix_blocks, allocate_gpu_memory);
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
num_radix_blocks *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
1) *
sizeof(Torus),
streams[0], gpu_indexes[0]);
// Decompression
// Carry extract LUT
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
return x / encryption_params.message_modulus;
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0],
carry_extract_lut->get_lut(gpu_indexes[0], 0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
encryption_params.message_modulus, encryption_params.carry_modulus,
carry_extract_f);
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
}
}
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
delete (carry_extract_lut);
}
};
#endif

View File

@@ -1,6 +1,7 @@
#ifndef CUDA_INTEGER_H
#define CUDA_INTEGER_H
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "programmable_bootstrap.h"
#include "programmable_bootstrap_multibit.h"
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
LEFT_ROTATE = 2,
RIGHT_ROTATE = 3
};
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
enum BITOP_TYPE {
BITAND = 0,
BITOR = 1,
@@ -475,7 +475,8 @@ struct int_radix_params {
message_modulus(message_modulus), carry_modulus(carry_modulus){};
void print() {
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
printf("pbs_type: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
"big_lwe_dimension: %u, "
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
"%u, pbs_base_log: "
@@ -812,7 +813,6 @@ template <typename Torus> struct int_radix_lut {
}
}
};
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;

View File

@@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory);
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer);
}
#endif // CNCRT_KS_H_

View File

@@ -1,17 +1,3 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
file(GLOB_RECURSE SOURCES "*.cu")
add_library(tfhe_cuda_backend STATIC ${SOURCES})
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
// We assume each GLWE will store the first polynomial_size inputs
uint32_t nth_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
uint32_t lwe_per_glwe = params::degree;
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
auto nth = nth_array[input_id];
@@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
template <typename Torus, class params>
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *glwe_array_in,
uint32_t *nth_array, uint32_t num_glwes,
uint32_t *nth_array, uint32_t num_nths,
uint32_t glwe_dimension) {
cudaSetDevice(gpu_index);
dim3 grid(num_glwes);
dim3 grid(num_nths);
dim3 thds(params::degree / params::opt);
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);

View File

@@ -27,7 +27,7 @@ private:
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly = 1)
uint32_t num_poly)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {

View File

@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_output_indexes),
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
host_keyswitch_lwe_ciphertext_vector(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_output_indexes),
@@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
}
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*/
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
host_packing_keyswitch_lwe_list_to_glwe(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
num_lwes);
}
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer) {
cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
}

View File

@@ -7,6 +7,7 @@
#include "polynomial/functions.cuh"
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
}
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
__host__ void host_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
// Compute Keyswitch
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
host_keyswitch_lwe_ciphertext_vector<Torus>(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, current_lwe_array_in,
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
@@ -154,4 +155,154 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
}
}
template <typename Torus>
__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
if (allocate_gpu_memory)
*fp_ks_buffer = (int8_t *)cuda_malloc_async(
2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
gpu_index);
}
// public functional packing keyswitch for a single LWE ciphertext
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
size_t glwe_size = (glwe_dimension + 1);
if (tid < glwe_size * polynomial_size) {
const int local_index = threadIdx.x;
// the output_glwe is split in polynomials and each x-block takes one of
// them
size_t poly_id = blockIdx.x;
size_t coef_per_block = blockDim.x;
// number of coefficients inside fp-ksk block for each lwe_input coefficient
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
// initialize accumulator to 0
glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
tid == glwe_dimension * polynomial_size);
// Iterate through all lwe elements
for (int i = 0; i < lwe_dimension_in; i++) {
// Round and prepare decomposition
Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mod_b_mask = (1ll << base_log) - 1ll;
// block of key for current lwe coefficient (cur_input_lwe[i])
auto ksk_block = &fp_ksk[i * ksk_block_size];
for (int j = 0; j < level_count; j++) {
auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
// Iterate through each level and multiply by the ksk piece
auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
}
}
}
}
// public functional packing keyswitch for a batch of LWE ciphertexts
//
// Selects the input each thread is working on using the y-block index.
//
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
// different thread blocks at the x-axis to work on that input.
template <typename Torus>
__global__ void
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
Torus *fp_ksk, uint32_t lwe_dimension_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, Torus *d_mem) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
const int lwe_size = (lwe_dimension_in + 1);
const int input_id = blockIdx.y;
const int degree = input_id;
// Select an input
auto lwe_in = lwe_array_in + input_id * lwe_size;
auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
// KS LWE to GLWE
packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
polynomial_size, base_log, level_count);
// P * x ^degree
auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
tid % polynomial_size,
polynomial_size, 1, true);
}
/// To-do: Rewrite this kernel for efficiency
template <typename Torus>
__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t num_lwes) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < (glwe_dimension + 1) * polynomial_size) {
glwe_out[tid] = glwe_array_in[tid];
// Accumulate
for (int i = 1; i < num_lwes; i++) {
auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
glwe_out[tid] += glwe_in[tid];
}
}
}
template <typename Torus>
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_lwes) {
cudaSetDevice(gpu_index);
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
dim3 grid(num_blocks, num_lwes);
dim3 threads(num_threads);
auto d_mem = (Torus *)fp_ks_buffer;
auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
// individually keyswitch each lwe
packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
glwe_dimension, polynomial_size, base_log, level_count, d_mem);
check_cuda_error(cudaGetLastError());
// accumulate to a single glwe
accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
num_lwes);
check_cuda_error(cudaGetLastError());
}
#endif

View File

@@ -2,6 +2,7 @@
#define CNCRT_TORUS_CUH
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
template <typename T>
@@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
const T shift = non_rep_bit_count - 1;
T res = x >> shift;
res += b;
res <<= shift;
return res;
res += 1;
res &= (T)(-2);
return res << shift;
}
template <typename T>
__device__ __forceinline__ void modulus_switch(T input, T &output,
uint32_t log_modulus) {
constexpr uint32_t BITS = sizeof(T) * 8;
output = input + (((T)1) << (BITS - log_modulus - 1));
output >>= (BITS - log_modulus);
}
@@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
return output;
}
template <typename Torus>
__global__ void modulus_switch_inplace(Torus *array, int size,
uint32_t log_modulus) {
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < size) {
array[tid] = modulus_switch(array[tid], log_modulus);
}
}
template <typename Torus>
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
uint32_t gpu_index, Torus *array,
int size, uint32_t log_modulus) {
cudaSetDevice(gpu_index);
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
log_modulus);
check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_TORUS_H
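A quick cleartext check of the rewritten round_to_closest_multiple above (plain Rust, illustrative only): keeping one guard bit below the cut, adding one, and clearing the low bit rounds x to the closest multiple of 2^sigma with ties rounding up, where sigma = 8*sizeof(T) - level_count*base_log.

```rust
// New branchless sequence, transcribed for u64.
fn round_bit_trick(x: u64, sigma: u32) -> u64 {
    let shift = sigma - 1; // keep one guard (half) bit below the cut
    let mut res = x >> shift;
    res += 1;              // add half
    res &= !1u64;          // clear the guard position (round half up)
    res << shift           // high bits shifted out wrap, as on the GPU
}

// Naive reference: nearest multiple of 2^sigma, ties up, wrapping.
fn round_reference(x: u64, sigma: u32) -> u64 {
    let step = 1u64 << sigma;
    let (q, r) = (x / step, x % step);
    if r >= step / 2 { (q + 1).wrapping_mul(step) } else { q * step }
}

fn main() {
    let sigma = 53; // e.g. 64 - level_count * base_log
    for x in [0u64, 1 << 52, (1 << 53) - 1, 0xDEAD_BEEF_CAFE_F00D, u64::MAX] {
        assert_eq!(round_bit_trick(x, sigma), round_reference(x, sigma));
    }
}
```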

View File

@@ -0,0 +1,87 @@
#include "compression.cuh"
void scratch_cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus);
scratch_cuda_compress_integer_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}
void scratch_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void *glwe_in, void *indexes_array,
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
(int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_compression<uint64_t> *mem_ptr =
(int_compression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

View File

@@ -0,0 +1,238 @@
#ifndef CUDA_INTEGER_COMPRESSION_CUH
#define CUDA_INTEGER_COMPRESSION_CUH
#include "ciphertext.h"
#include "compression.h"
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
uint32_t in_len, uint32_t len) {
auto nbits = sizeof(Torus) * 8;
auto i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < len) {
auto k = nbits * i / log_modulus;
auto j = k;
auto start_shift = i * nbits - j * log_modulus;
auto value = array_in[j] >> start_shift;
j++;
while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
auto shift = j * log_modulus - i * nbits;
value |= array_in[j] << shift;
j++;
}
array_out[i] = value;
}
}
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_inputs,
uint32_t body_count, int_compression<Torus> *mem_ptr) {
cudaSetDevice(gpu_index);
auto params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus, in_len,
len);
}
template <typename Torus>
__host__ void host_integer_compress(cudaStream_t *streams,
uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *glwe_array_out, Torus *lwe_array_in,
Torus **fp_ksk, uint32_t num_lwes,
int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
// Shift
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
lwe_array_in,
(uint64_t)compression_params.message_modulus,
input_lwe_dimension, num_lwes);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
for (int i = 0; i < num_glwes; i++) {
auto lwe_subset = lwe_shifted + i * lwe_in_size;
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
host_packing_keyswitch_lwe_list_to_glwe(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
}
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
(compression_params.glwe_dimension *
compression_params.polynomial_size +
body_count),
mem_ptr->storage_log_modulus);
check_cuda_error(cudaGetLastError());
host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out,
num_glwes, body_count, mem_ptr);
}
template <typename Torus>
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
                        uint32_t log_modulus, uint32_t initial_out_len) {
  auto nbits = sizeof(Torus) * 8;

  auto i = threadIdx.x + blockIdx.x * blockDim.x;

  if (i < initial_out_len) {
    // Unpack
    Torus mask = ((Torus)1 << log_modulus) - 1;
    auto start = i * log_modulus;
    auto end = (i + 1) * log_modulus;

    auto start_block = start / nbits;
    auto start_remainder = start % nbits;

    auto end_block_inclusive = (end - 1) / nbits;

    Torus unpacked_i;
    if (start_block == end_block_inclusive) {
      auto single_part = array_in[start_block] >> start_remainder;
      unpacked_i = single_part & mask;
    } else {
      auto first_part = array_in[start_block] >> start_remainder;
      auto second_part = array_in[start_block + 1] << (nbits - start_remainder);

      unpacked_i = (first_part | second_part) & mask;
    }

    // Extract
    glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
  }
}
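
// Only body_count bodies are stored per GLWE, so the body polynomial of the
// output is zeroed first; extract then fills the mask and the stored bodies,
// leaving the missing tail at zero.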
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
                           Torus *glwe_array_out, Torus *array_in,
                           uint32_t glwe_index,
                           int_decompression<Torus> *mem_ptr) {
  cudaSetDevice(gpu_index);
  auto params = mem_ptr->compression_params;

  auto log_modulus = mem_ptr->storage_log_modulus;

  uint32_t body_count = mem_ptr->body_count;
  auto initial_out_len =
      params.glwe_dimension * params.polynomial_size + body_count;

  // We ensure the tail of the GLWE is zeroed
  auto zeroed_slice =
      glwe_array_out + params.glwe_dimension * params.polynomial_size;
  cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
                    stream, gpu_index);

  int num_blocks = 0, num_threads = 0;
  getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
  dim3 grid(num_blocks);
  dim3 threads(num_threads);
  extract<<<grid, threads, 0, stream>>>(glwe_array_out, array_in, glwe_index,
                                        log_modulus, initial_out_len);
  check_cuda_error(cudaGetLastError());
}
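
// Decompression entry point: unpack the GLWE, sample-extract the requested
// coefficients as LWEs, then run a PBS through the carry-extract LUT to
// clean the noise and return to the big encryption LWE dimension.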
template <typename Torus>
__host__ void
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, Torus *lwe_array_out,
                        Torus *packed_glwe_in, uint32_t *indexes_array,
                        uint32_t indexes_array_size, void **bsks,
                        int_decompression<Torus> *mem_ptr) {

  auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
  auto compression_params = mem_ptr->compression_params;
  host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
               mem_ptr);

  auto num_lwes = mem_ptr->body_count;

  // Sample extract
  auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
  cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
                              extracted_glwe, indexes_array,
                              indexes_array_size,
                              compression_params.glwe_dimension,
                              compression_params.polynomial_size);

  /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
  /// dimension to a big LWE dimension
  auto encryption_params = mem_ptr->encryption_params;
  auto carry_extract_lut = mem_ptr->carry_extract_lut;
  execute_pbs_async<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out,
      carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
      carry_extract_lut->lut_indexes_vec, extracted_lwe,
      carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
      encryption_params.glwe_dimension,
      compression_params.glwe_dimension * compression_params.polynomial_size,
      encryption_params.polynomial_size, encryption_params.pbs_base_log,
      encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
      encryption_params.pbs_type);
}
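
// Scratch functions: allocate the int_compression / int_decompression
// buffers consumed by the host functions above. Callers are expected to pair
// each scratch call with the matching cleanup entry point once the
// (de)compression is done.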
template <typename Torus>
__host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_compression<Torus> **mem_ptr, uint32_t num_lwes,
    int_radix_params compression_params, uint32_t lwe_per_glwe,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {

  *mem_ptr = new int_compression<Torus>(
      streams, gpu_indexes, gpu_count, compression_params, num_lwes,
      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
}

template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
    int_radix_params encryption_params, int_radix_params compression_params,
    uint32_t storage_log_modulus, bool allocate_gpu_memory) {

  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_lwes, storage_log_modulus, allocate_gpu_memory);
}
#endif

View File

@@ -765,7 +765,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
}
if (num_radix_blocks % 2 == 1) {
// We couldn't pack the last block, so we just copy it
// We couldn't host_pack the last block, so we just copy it
Torus *lsb_block =
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
Torus *last_block =

View File

@@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(

View File

@@ -133,7 +133,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
PANIC("Unknown operation")
}
// pack bits into one block so that we have
// host_pack bits into one block so that we have
// control_bit|b|a
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
streams[0], gpu_indexes[0]); // Do we need this?

View File

@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in),
input_lwe_dimension,
input_lwe_ciphertext_count);
host_cleartext_vec_multiplication(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

View File

@@ -14,9 +14,10 @@
#include <vector>
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
}
}
template <typename T>
__host__ void
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_vec_multiplication<<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] * cleartext_input;
}
}
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T *lwe_input, T *cleartext_input,
T *output, T *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {

View File

@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
glwe_dimension, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
glwe_dimension, 0);
}
template <typename Torus>

View File

@@ -98,8 +98,8 @@ __global__ void device_programmable_bootstrap_cg(
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
1);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
@@ -111,13 +111,13 @@ __global__ void device_programmable_bootstrap_cg(
// Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -125,7 +125,7 @@ __global__ void device_programmable_bootstrap_cg(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
accumulator_rotated, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -150,9 +150,9 @@ __global__ void device_programmable_bootstrap_cg(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
}
}

View File

@@ -86,7 +86,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -98,12 +98,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -129,9 +130,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Load the accumulator calculated in previous iterations

View File

@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
// Persist
int tid = threadIdx.x;
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
accumulator, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -215,9 +215,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Persist the updated accumulator

View File

@@ -102,8 +102,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
synchronize_threads_in_block();
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
polynomial_accumulate_monic_monomial_mul<Torus>(
accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree,
params::opt, false);
}
synchronize_threads_in_block();
@@ -209,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
// Persist
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -224,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -323,9 +324,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
0);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
0);
}
}
}

View File

@@ -115,13 +115,13 @@ __global__ void device_programmable_bootstrap_tbc(
// Perform ACC * (X^{a_hat} - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -154,9 +154,9 @@ __global__ void device_programmable_bootstrap_tbc(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
}
}

View File

@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -106,12 +106,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -137,9 +138,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Load the accumulator calculated in previous iterations

View File

@@ -45,7 +45,7 @@ template <typename T, int elems_per_thread, int block_size>
__device__ void
divide_by_monomial_negacyclic_inplace(T *accumulator,
const T *__restrict__ input, uint32_t j,
bool zeroAcc, uint32_t num_poly = 1) {
bool zeroAcc, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -94,7 +94,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -133,7 +133,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly = 1) {
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,7 +192,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
uint32_t glwe_dimension, uint32_t nth) {
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
glwe[glwe_dimension * params::degree + nth];
@@ -201,8 +201,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
// Extracts the mask from the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension = 1,
uint32_t nth = 0) {
uint32_t glwe_dimension, uint32_t nth) {
for (int z = 0; z < glwe_dimension; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);

View File

@@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
// This method expects to work with polynomial_size / compression_params::opt
// threads in the x-block. If init_accumulator is set, assumes that result was
// not initialized and does that with the outcome of first * second
template <typename T>
__device__ void polynomial_accumulate_monic_monomial_mul(
T *result, const T *__restrict__ poly, uint64_t monomial_degree,
uint32_t tid, uint32_t polynomial_size, int coeff_per_thread,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * compression_params::degree)
int full_cycles_count = monomial_degree / polynomial_size;
int remainder_degrees = monomial_degree % polynomial_size;
int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
int pos = tid;
for (int i = 0; i < coeff_per_thread; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
int new_pos = (pos + monomial_degree) % polynomial_size;
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
@@ -78,7 +79,7 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
pos += polynomial_size / coeff_per_thread;
}
}
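// Worked example (illustrative numbers): polynomial_size = 1024 and
// monomial_degree = 1500 give full_cycles_count = 1 and remainder_degrees =
// 476. A coefficient at pos lands at new_pos = (pos + 1500) % 1024; inputs
// with pos < 548 cross the negacyclic boundary once (sign -1, and new_pos >=
// 476) while pos >= 548 cross it twice (sign +1, new_pos < 476), which is
// exactly what the two SEL lines compute.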

View File

@@ -311,6 +311,40 @@ extern "C" {
num_samples: u32,
);
/// This scratch function allocates the necessary amount of data on the GPU for
/// the public function packing keyswitch implementation on 64-bit
pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
stream: *mut c_void,
gpu_index: u32,
fp_ks_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
);
/// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts
pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
stream: *mut c_void,
gpu_index: u32,
glwe_array_out: *mut c_void,
lwe_array_in: *const c_void,
fp_ksk_array: *const c_void,
fp_ks_buffer: *mut i8,
input_lwe_dimension: u32,
output_glwe_dimension: u32,
polynomial_size: u32,
base_log: u32,
level_count: u32,
num_lwes: u32,
);
pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
stream: *mut c_void,
gpu_index: u32,
fp_ks_buffer: *mut *mut i8,
);
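// Illustrative lifecycle for these bindings (hypothetical buffer names,
// unsafe FFI boilerplate elided): call
// scratch_packing_keyswitch_lwe_list_to_glwe_64 once to size fp_ks_buffer,
// invoke cuda_packing_keyswitch_lwe_list_to_glwe_64 with that buffer as many
// times as needed, then free it with
// cleanup_packing_keyswitch_lwe_list_to_glwe.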
/// Perform the negation of a u64 input LWE ciphertext vector.
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
@@ -484,6 +518,80 @@ extern "C" {
mem_ptr: *mut *mut i8,
);
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
num_lwes: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
lwe_per_glwe: u32,
storage_log_modulus: u32,
allocate_gpu_memory: bool,
);
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
encryption_glwe_dimension: u32,
encryption_polynomial_size: u32,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_lwes: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
allocate_gpu_memory: bool,
);
pub fn cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut c_void,
lwe_array_in: *const c_void,
fp_ksk: *const *mut c_void,
num_lwes: u32,
mem_ptr: *mut i8,
);
pub fn cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_out: *mut c_void,
glwe_array_in: *const c_void,
indexes_array: *const c_void,
indexes_array_size: u32,
bsks: *const *mut c_void,
mem_ptr: *mut i8,
);
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
);
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
);
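// The compress / decompress entry points above follow the same pattern:
// scratch_* allocates mem_ptr, cuda_integer_*_radix_ciphertext_64 runs the
// actual operation (possibly many times), and cleanup_* releases the buffers.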
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
streams: *const *mut c_void,
gpu_indexes: *const u32,

View File

@@ -20,7 +20,7 @@ def main(args):
bench_function_id = bench_data["function_id"]
split = bench_function_id.split("::")
if split.len() == 5: # Signed integers
if len(split) == 5: # Signed integers
(_, _, function_name, parameter_set, bits) = split
else: # Unsigned integers
(_, function_name, parameter_set, bits) = split
@@ -53,7 +53,8 @@ def main(args):
estimate_upper_bound_ms,
)
)
except:
except Exception as e:
print(e)
pass
if len(data) == 0:

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe"
version = "0.8.0-alpha.2"
version = "0.8.0-alpha.3"
edition = "2021"
readme = "../README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
@@ -62,12 +62,12 @@ lazy_static = { version = "1.4.0", optional = true }
serde = { version = "1.0", features = ["derive"] }
rayon = { version = "1.5.0" }
bincode = "1.3.3"
concrete-fft = { version = "0.4.1", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.1.2" }
pulp = "0.18.8"
concrete-fft = { version = "0.5.0", features = ["serde", "fft128"] }
concrete-ntt = { version = "0.2.0" }
pulp = "0.18.22"
tfhe-cuda-backend = { version = "0.4.0-alpha.0", path = "../backends/tfhe-cuda-backend", optional = true }
aligned-vec = { version = "0.5", features = ["serde"] }
dyn-stack = { version = "0.9" }
dyn-stack = { version = "0.10" }
paste = "1.0.7"
fs2 = { version = "0.4.3", optional = true }
# Used for OPRF in shortint
@@ -76,7 +76,7 @@ sha3 = { version = "0.10", optional = true }
itertools = "0.11.0"
rand_core = { version = "0.6.4", features = ["std"] }
tfhe-zk-pok = { version = "0.3.0-alpha.0", path = "../tfhe-zk-pok", optional = true }
tfhe-versionable = { version = "0.2.0", path = "../utils/tfhe-versionable" }
tfhe-versionable = { version = "0.2.1", path = "../utils/tfhe-versionable" }
# wasm deps
wasm-bindgen = { version = "0.2.86", features = [

View File

@@ -4,7 +4,19 @@ use tfhe::integer::{ClientKey, RadixCiphertext};
use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
fn glwe_packing(c: &mut Criterion) {
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::CudaStreams;
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaUnsignedRadixCiphertext};
#[cfg(feature = "gpu")]
use tfhe::integer::gpu::gen_keys_radix_gpu;
fn cpu_glwe_packing(c: &mut Criterion) {
let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
@@ -12,6 +24,9 @@ fn glwe_packing(c: &mut Criterion) {
let bench_name = "integer_packing_compression";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let cks = ClientKey::new(param);
@@ -29,7 +44,6 @@ fn glwe_packing(c: &mut Criterion) {
64,
128,
256,
256,
comp_param.lwe_per_glwe.0 * log_message_modulus,
] {
assert_eq!(num_bits % log_message_modulus, 0);
@@ -73,9 +87,86 @@ fn glwe_packing(c: &mut Criterion) {
}
}
criterion_group!(glwe_packing2, glwe_packing);
#[cfg(feature = "gpu")]
fn gpu_glwe_packing(c: &mut Criterion) {
let bench_name = "integer_cuda_packing_compression";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let stream = CudaStreams::new_multi_gpu();
let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
for bit_size in [
8,
16,
32,
64,
128,
256,
comp_param.lwe_per_glwe.0 * log_message_modulus,
] {
assert_eq!(bit_size % log_message_modulus, 0);
let num_blocks = bit_size / log_message_modulus;
// Generate private compression key
let cks = ClientKey::new(param);
let private_compression_key = cks.new_compression_private_key(comp_param);
// Generate and convert compression keys
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
let (compressed_compression_key, compressed_decompression_key) =
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
let cuda_decompression_key =
compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &stream);
// Encrypt
let ct = cks.encrypt_radix(0_u32, num_blocks);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
// Benchmark
let mut builder = CudaCompressedCiphertextListBuilder::new();
builder.push(d_ct, &stream);
let bench_id = format!("pack_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let compressed = builder.build(&cuda_compression_key, &stream);
_ = black_box(compressed);
})
});
let compressed = builder.build(&cuda_compression_key, &stream);
let bench_id = format!("unpack_u{bit_size}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let unpacked: CudaRadixCiphertext =
compressed.get(0, &cuda_decompression_key, &stream);
_ = black_box(unpacked);
})
});
}
}
#[cfg(feature = "gpu")]
criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
fn main() {
glwe_packing2();
#[cfg(feature = "gpu")]
gpu_glwe_packing2();
#[cfg(not(feature = "gpu"))]
cpu_glwe_packing2();
Criterion::default().configure_from_args().final_summary();
}

View File

@@ -51,7 +51,10 @@ fn pke_zk_proof(c: &mut Criterion) {
for bits in [640usize, 1280, 4096] {
assert_eq!(bits % 64, 0);
let num_block = 64usize.div_ceil(param_pke.message_modulus.0.ilog2() as usize);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
);
use rand::Rng;
let mut rng = rand::thread_rng();
@@ -128,7 +131,10 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) {
for bits in [640usize, 1280, 4096] {
assert_eq!(bits % 64, 0);
let num_block = 64usize.div_ceil(param_pke.message_modulus.0.ilog2() as usize);
// Packing, so we take the message and carry modulus to compute our block count
let num_block = 64usize.div_ceil(
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
);
use rand::Rng;
let mut rng = rand::thread_rng();

View File

@@ -14,7 +14,7 @@ fn oprf(c: &mut Criterion) {
let keys = KEY_CACHE.get_from_param(param);
let sks = keys.server_key();
bench_group.bench_function(&format!("2-bits-oprf::{}", param.name()), |b| {
bench_group.bench_function(format!("2-bits-oprf::{}", param.name()), |b| {
b.iter(|| {
_ = black_box(sks.generate_oblivious_pseudo_random(Seed(0), 2));
})

View File

@@ -13,6 +13,7 @@ const {
CompressedFheInt8,
FheInt8,
FheInt32,
FheTypes,
CompressedFheInt128,
FheInt128,
CompressedFheInt256,
@@ -354,7 +355,6 @@ test('hlapi_public_key_encrypt_decrypt_int256_small', (t) => {
});
//////////////////////////////////////////////////////////////////////////////
/// 32 bits compact
//////////////////////////////////////////////////////////////////////////////
@@ -423,19 +423,37 @@ test('hlapi_compact_ciphertext_list', (t) => {
let clear_i32 = -3284;
let clear_bool = true;
let clear_u256 = generateRandomBigInt(256);
let clear_u2048 = generateRandomBigInt(2048);
let builder = CompactCiphertextList.builder(publicKey);
builder.push_u2(clear_u2);
builder.push_i32(clear_i32);
builder.push_boolean(clear_bool);
builder.push_u256(clear_u256);
builder.push_u2048(clear_u2048);
let list = builder.build();
let serialized = list.safe_serialize(BigInt(10000000));
let deserialized = CompactCiphertextList.safe_deserialize(serialized, BigInt(10000000));
assert.deepStrictEqual(deserialized.is_empty(), false);
assert.deepStrictEqual(deserialized.len(), 5);
assert.deepStrictEqual(deserialized.get_kind_of(0), FheTypes.Uint2);
assert.deepStrictEqual(deserialized.get_kind_of(1), FheTypes.Int32);
assert.deepStrictEqual(deserialized.get_kind_of(2), FheTypes.Bool);
assert.deepStrictEqual(deserialized.get_kind_of(3), FheTypes.Uint256);
assert.deepStrictEqual(deserialized.get_kind_of(4), FheTypes.Uint2048);
let expander = deserialized.expand();
assert.deepStrictEqual(expander.is_empty(), false);
assert.deepStrictEqual(expander.len(), 5);
assert.deepStrictEqual(expander.get_kind_of(0), FheTypes.Uint2);
assert.deepStrictEqual(expander.get_kind_of(1), FheTypes.Int32);
assert.deepStrictEqual(expander.get_kind_of(2), FheTypes.Bool);
assert.deepStrictEqual(expander.get_kind_of(3), FheTypes.Uint256);
assert.deepStrictEqual(expander.get_kind_of(4), FheTypes.Uint2048);
assert.deepStrictEqual(
expander.get_uint2(0).decrypt(clientKey),
clear_u2,
@@ -455,6 +473,12 @@ test('hlapi_compact_ciphertext_list', (t) => {
expander.get_uint256(3).decrypt(clientKey),
clear_u256,
);
assert.deepStrictEqual(
expander.get_uint2048(4).decrypt(clientKey),
clear_u2048,
);
});
test('hlapi_compact_ciphertext_list_with_proof', (t) => {
@@ -489,5 +513,12 @@ test('hlapi_compact_ciphertext_list_with_proof', (t) => {
let serialized = list.safe_serialize(BigInt(10000000));
let deserialized = ProvenCompactCiphertextList.safe_deserialize(serialized, BigInt(10000000));
assert.deepStrictEqual(deserialized.is_empty(), false);
assert.deepStrictEqual(deserialized.len(), 4);
assert.deepStrictEqual(deserialized.get_kind_of(0), FheTypes.Uint2);
assert.deepStrictEqual(deserialized.get_kind_of(1), FheTypes.Int32);
assert.deepStrictEqual(deserialized.get_kind_of(2), FheTypes.Bool);
assert.deepStrictEqual(deserialized.get_kind_of(3), FheTypes.Uint256);
// We cannot verify packed ZK in wasm
});

View File

@@ -36,6 +36,9 @@ pub enum FheTypes {
Type_FheUint128,
Type_FheUint160,
Type_FheUint256,
Type_FheUint512,
Type_FheUint1024,
Type_FheUint2048,
Type_FheInt2,
Type_FheInt4,
Type_FheInt6,
@@ -68,6 +71,9 @@ impl From<crate::FheTypes> for FheTypes {
crate::FheTypes::Uint128 => Self::Type_FheUint128,
crate::FheTypes::Uint160 => Self::Type_FheUint160,
crate::FheTypes::Uint256 => Self::Type_FheUint256,
crate::FheTypes::Uint512 => Self::Type_FheUint512,
crate::FheTypes::Uint1024 => Self::Type_FheUint1024,
crate::FheTypes::Uint2048 => Self::Type_FheUint2048,
crate::FheTypes::Int2 => Self::Type_FheInt2,
crate::FheTypes::Int4 => Self::Type_FheInt4,
crate::FheTypes::Int6 => Self::Type_FheInt6,

View File

@@ -250,13 +250,10 @@ pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
if *lwe_mask_element != 0u64 {
let stack = stack.rb_mut();
// We copy ct_0 to ct_1
let (mut ct1, stack) =
let (ct1, stack) =
stack.collect_aligned(CACHELINE_ALIGN, ct0.as_ref().iter().copied());
let mut ct1 = GlweCiphertextMutView::from_container(
&mut *ct1,
lut_poly_size,
ciphertext_modulus,
);
let mut ct1 =
GlweCiphertextMutView::from_container(ct1, lut_poly_size, ciphertext_modulus);
// We rotate ct_1 by performing ct_1 <- ct_1 * X^{a_hat}
for mut poly in ct1.as_mut_polynomial_list().iter_mut() {
@@ -503,10 +500,10 @@ pub fn programmable_bootstrap_ntt64_lwe_ciphertext_mem_optimized<
accumulator.ciphertext_modulus()
);
let (mut local_accumulator_data, stack) =
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
let mut local_accumulator = GlweCiphertextMutView::from_container(
&mut *local_accumulator_data,
local_accumulator_data,
accumulator.polynomial_size(),
accumulator.ciphertext_modulus(),
);
@@ -568,12 +565,11 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
out.ciphertext_modulus(),
);
let (mut output_fft_buffer, mut substack0) =
let (output_fft_buffer, mut substack0) =
stack.make_aligned_raw::<u64>(poly_size * ggsw.glwe_size().0, align);
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
// it has been fully initialized for the first time.
let output_fft_buffer = &mut *output_fft_buffer;
let mut is_output_uninit = true;
{
@@ -616,17 +612,16 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
glwe_decomp_term.as_polynomial_list().iter()
)
.for_each(|(ggsw_row, glwe_poly)| {
let (mut ntt_poly, _) =
substack2.rb_mut().make_aligned_raw::<u64>(poly_size, align);
let (ntt_poly, _) = substack2.rb_mut().make_aligned_raw::<u64>(poly_size, align);
// We perform the forward ntt transform for the glwe polynomial
ntt.forward(PolynomialMutView::from_container(&mut ntt_poly), glwe_poly);
ntt.forward(PolynomialMutView::from_container(ntt_poly), glwe_poly);
// Now we loop through the polynomials of the output, and add the
// corresponding product of polynomials.
update_with_fmadd_ntt64(
output_fft_buffer,
ggsw_row.as_ref(),
&ntt_poly,
ntt_poly,
is_output_uninit,
poly_size,
ntt,

View File

@@ -4,7 +4,7 @@ use crate::core_crypto::commons::math::decomposition::{
};
use crate::core_crypto::commons::numeric::UnsignedInteger;
use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
use dyn_stack::{DynArray, PodStack, ReborrowMut};
use dyn_stack::{PodStack, ReborrowMut};
/// An iterator that yields the terms of the signed decomposition of an integer.
///
@@ -288,9 +288,9 @@ pub struct TensorSignedDecompositionLendingIterNonNative<'buffers> {
// ...0001111
mod_b_mask: u64,
// The internal states of each decomposition
states: DynArray<'buffers, u64>,
states: &'buffers mut [u64],
// Corresponding input signs
input_signs: DynArray<'buffers, u8>,
input_signs: &'buffers mut [u8],
// A flag which stores whether the iterator is a fresh one (for the recompose method).
fresh: bool,
ciphertext_modulus: u64,
@@ -306,9 +306,9 @@ impl<'buffers> TensorSignedDecompositionLendingIterNonNative<'buffers> {
) -> (Self, PodStack<'buffers>) {
let shift = modulus.ceil_ilog2() as usize - decomposer.base_log * decomposer.level_count;
let input_size = input.len();
let (mut states, stack) =
let (states, stack) =
stack.make_aligned_raw::<u64>(input_size, aligned_vec::CACHELINE_ALIGN);
let (mut input_signs, stack) =
let (input_signs, stack) =
stack.make_aligned_raw::<u8>(input_size, aligned_vec::CACHELINE_ALIGN);
for ((i, state), sign) in input
@@ -393,11 +393,7 @@ impl<'buffers> TensorSignedDecompositionLendingIterNonNative<'buffers> {
&mut self,
substack1: &'a mut PodStack,
align: usize,
) -> (
DecompositionLevel,
dyn_stack::DynArray<'a, u64>,
PodStack<'a>,
) {
) -> (DecompositionLevel, &'a mut [u64], PodStack<'a>) {
let (glwe_level, _, glwe_decomp_term) = self.next_term().unwrap();
let (glwe_decomp_term, substack2) =
substack1.rb_mut().collect_aligned(align, glwe_decomp_term);

View File

@@ -200,18 +200,25 @@ impl<Scalar: UnsignedInteger> ParameterSetConformant
&self,
lwe_ct_parameters: &GlweCiphertextConformanceParameters<Scalar>,
) -> bool {
let log_modulus = self.packed_integers.log_modulus.0;
let Self {
packed_integers,
glwe_dimension,
polynomial_size,
bodies_count,
uncompressed_ciphertext_modulus,
} = self;
let log_modulus = packed_integers.log_modulus.0;
let number_bits_to_unpack =
(self.glwe_dimension.0 * self.polynomial_size.0 + self.bodies_count.0) * log_modulus;
(glwe_dimension.0 * polynomial_size.0 + bodies_count.0) * log_modulus;
let len = number_bits_to_unpack.div_ceil(Scalar::BITS);
self.packed_integers.packed_coeffs.len() == len
&& self.glwe_dimension == lwe_ct_parameters.glwe_dim
&& self.polynomial_size == lwe_ct_parameters.polynomial_size
packed_integers.packed_coeffs.len() == len
&& *glwe_dimension == lwe_ct_parameters.glwe_dim
&& *polynomial_size == lwe_ct_parameters.polynomial_size
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
&& self.uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
}
}

View File

@@ -147,16 +147,22 @@ impl<Scalar: UnsignedInteger> ParameterSetConformant
type ParameterSet = LweCiphertextParameters<Scalar>;
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<Scalar>) -> bool {
let lwe_size = self.lwe_dimension.to_lwe_size().0;
let Self {
packed_integers,
lwe_dimension,
uncompressed_ciphertext_modulus,
} = self;
let number_bits_to_pack = lwe_size * self.packed_integers.log_modulus.0;
let lwe_size = lwe_dimension.to_lwe_size().0;
let number_bits_to_pack = lwe_size * packed_integers.log_modulus.0;
let len = number_bits_to_pack.div_ceil(Scalar::BITS);
self.packed_integers.packed_coeffs.len() == len
&& self.lwe_dimension == lwe_ct_parameters.lwe_dim
packed_integers.packed_coeffs.len() == len
&& *lwe_dimension == lwe_ct_parameters.lwe_dim
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
&& self.uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
&& matches!(
lwe_ct_parameters.ms_decompression_method,
MsDecompressionType::ClassicPbs

View File

@@ -403,24 +403,33 @@ impl MultiBitModulusSwitchedCt for FromCompressionMultiBitModulusSwitchedCt {
impl<Scalar: UnsignedInteger + CastInto<usize> + CastFrom<usize>> ParameterSetConformant
for CompressedModulusSwitchedMultiBitLweCiphertext<Scalar>
{
type ParameterSet = LweCiphertextParameters<u64>;
type ParameterSet = LweCiphertextParameters<Scalar>;
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<u64>) -> bool {
let lwe_dim = self.lwe_dimension.0;
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<Scalar>) -> bool {
let Self {
body,
packed_mask,
packed_diffs,
lwe_dimension,
uncompressed_ciphertext_modulus,
grouping_factor,
} = self;
let number_mask_bits_to_pack = lwe_dim * self.packed_mask.log_modulus.0;
let lwe_dim = lwe_dimension.0;
let len = number_mask_bits_to_pack.div_ceil(Scalar::BITS);
self.body >> self.packed_mask.log_modulus.0 == 0
&& self.packed_mask.packed_coeffs.len() == len
&& self.lwe_dimension == lwe_ct_parameters.lwe_dim
body >> packed_mask.log_modulus.0 == 0
&& packed_mask.is_conformant(&lwe_dim)
&& packed_diffs
.as_ref()
.map_or(true, |packed_diffs| packed_diffs.is_conformant(&lwe_dim))
&& *lwe_dimension == lwe_ct_parameters.lwe_dim
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
&& match lwe_ct_parameters.ms_decompression_method {
MsDecompressionType::ClassicPbs => false,
MsDecompressionType::MultiBitPbs(expected_gouping_factor) => {
expected_gouping_factor.0 == self.grouping_factor.0
expected_gouping_factor.0 == grouping_factor.0
}
}
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
}
}

View File

@@ -643,9 +643,19 @@ where
&self,
glwe_ct_parameters: &GlweCiphertextConformanceParameters<C::Element>,
) -> bool {
let Self {
data,
polynomial_size,
ciphertext_modulus,
} = self;
check_encrypted_content_respects_mod(self, glwe_ct_parameters.ct_modulus)
&& self.glwe_size() == glwe_ct_parameters.glwe_dim.to_glwe_size()
&& self.polynomial_size() == glwe_ct_parameters.polynomial_size
&& self.ciphertext_modulus() == glwe_ct_parameters.ct_modulus
&& data.container_len()
== glwe_ciphertext_size(
glwe_ct_parameters.glwe_dim.to_glwe_size(),
glwe_ct_parameters.polynomial_size,
)
&& *polynomial_size == glwe_ct_parameters.polynomial_size
&& *ciphertext_modulus == glwe_ct_parameters.ct_modulus
}
}

View File

@@ -761,9 +761,14 @@ where
type ParameterSet = LweCiphertextParameters<C::Element>;
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<C::Element>) -> bool {
check_encrypted_content_respects_mod(self, lwe_ct_parameters.ct_modulus)
let Self {
data,
ciphertext_modulus,
} = self;
check_encrypted_content_respects_mod(data, lwe_ct_parameters.ct_modulus)
&& self.lwe_size() == lwe_ct_parameters.lwe_dim.to_lwe_size()
&& self.ciphertext_modulus() == lwe_ct_parameters.ct_modulus
&& *ciphertext_modulus == lwe_ct_parameters.ct_modulus
}
}

View File

@@ -343,17 +343,24 @@ impl<T: UnsignedInteger> ParameterSetConformant for LweCompactCiphertextListOwne
type ParameterSet = LweCiphertextListParameters<T>;
fn is_conformant(&self, param: &LweCiphertextListParameters<T>) -> bool {
let Self {
data,
lwe_size,
lwe_ciphertext_count,
ciphertext_modulus,
} = self;
param
.lwe_ciphertext_count_constraint
.is_valid(self.lwe_ciphertext_count.0)
&& self.data.len()
.is_valid(lwe_ciphertext_count.0)
&& data.len()
== lwe_compact_ciphertext_list_size(
self.lwe_size.to_lwe_dimension(),
self.lwe_ciphertext_count,
lwe_size.to_lwe_dimension(),
*lwe_ciphertext_count,
)
&& check_encrypted_content_respects_mod(self, param.ct_modulus)
&& self.lwe_size == param.lwe_dim.to_lwe_size()
&& self.ciphertext_modulus == param.ct_modulus
&& *lwe_size == param.lwe_dim.to_lwe_size()
&& *ciphertext_modulus == param.ct_modulus
}
}

View File

@@ -1,5 +1,6 @@
use tfhe_versionable::Versionize;
use crate::conformance::ParameterSetConformant;
use crate::core_crypto::backward_compatibility::entities::packed_integers::PackedIntegersVersions;
use crate::core_crypto::prelude::*;
@@ -166,3 +167,21 @@ impl<Scalar: UnsignedInteger> PackedIntegers<Scalar> {
})
}
}
impl<Scalar: UnsignedInteger> ParameterSetConformant for PackedIntegers<Scalar> {
type ParameterSet = usize;
fn is_conformant(&self, len: &usize) -> bool {
let Self {
packed_coeffs,
log_modulus,
initial_len,
} = self;
let number_packed_bits = *len * log_modulus.0;
let packed_len = number_packed_bits.div_ceil(Scalar::BITS);
*len == *initial_len && packed_coeffs.len() == packed_len
}
}

View File

@@ -25,11 +25,18 @@ impl<T: UnsignedInteger> ParameterSetConformant for SeededLweCiphertext<T> {
type ParameterSet = LweCiphertextParameters<T>;
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<T>) -> bool {
let Self {
data,
lwe_size,
compression_seed: _,
ciphertext_modulus,
} = self;
check_encrypted_content_respects_mod::<T, &[T]>(
&std::slice::from_ref(self.get_body().data),
&std::slice::from_ref(data),
lwe_ct_parameters.ct_modulus,
) && self.lwe_size == lwe_ct_parameters.lwe_dim.to_lwe_size()
&& self.ciphertext_modulus() == lwe_ct_parameters.ct_modulus
) && *lwe_size == lwe_ct_parameters.lwe_dim.to_lwe_size()
&& *ciphertext_modulus == lwe_ct_parameters.ct_modulus
}
}

View File

@@ -193,12 +193,11 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
ggsw.decomposition_base_log(),
ggsw.decomposition_level_count(),
);
let (mut output_fft_buffer, mut substack0) =
let (output_fft_buffer, mut substack0) =
stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size_out().0, align);
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
// it has been fully initialized for the first time.
let output_fft_buffer = &mut *output_fft_buffer;
let mut is_output_uninit = true;
{
@@ -244,14 +243,14 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
glwe_decomp_term.get_mask().as_polynomial_list().iter()
)
.for_each(|(ggsw_row, glwe_poly)| {
let (mut fourier, substack3) = substack2
let (fourier, substack3) = substack2
.rb_mut()
.make_aligned_raw::<c64>(fourier_poly_size, align);
// We perform the forward fft transform for the glwe polynomial
let fourier = fft
.forward_as_integer(
FourierPolynomialMutView { data: &mut fourier },
FourierPolynomialMutView { data: fourier },
glwe_poly,
substack3,
)

View File

@@ -289,10 +289,10 @@ where
if *lwe_mask_element != Scalar::ZERO {
let stack = stack.rb_mut();
// We copy ct_0 to ct_1
let (mut ct1, stack) =
let (ct1, stack) =
stack.collect_aligned(CACHELINE_ALIGN, ct0.as_ref().iter().copied());
let mut ct1 = GlweCiphertextMutView::from_container(
&mut *ct1,
ct1,
ct0.polynomial_size(),
ct0.ciphertext_modulus(),
);
@@ -361,10 +361,10 @@ where
return this.bootstrap_u128(&mut lwe_out, &lwe_in, &accumulator, fft, stack);
}
let (mut local_accumulator_data, stack) =
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
let mut local_accumulator = GlweCiphertextMutView::from_container(
&mut *local_accumulator_data,
local_accumulator_data,
accumulator.polynomial_size(),
accumulator.ciphertext_modulus(),
);

View File

@@ -397,13 +397,13 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
ggsw.decomposition_level_count(),
);
let (mut output_fft_buffer_re0, stack) =
let (output_fft_buffer_re0, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_re1, stack) =
let (output_fft_buffer_re1, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_im0, stack) =
let (output_fft_buffer_im0, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_im1, mut substack0) =
let (output_fft_buffer_im1, mut substack0) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
@@ -455,30 +455,30 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
) {
let len = fourier_poly_size;
let stack = substack2.rb_mut();
let (mut fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
// We perform the forward fft transform for the glwe polynomial
fft.forward_as_integer(
&mut fourier_re0,
&mut fourier_re1,
&mut fourier_im0,
&mut fourier_im1,
fourier_re0,
fourier_re1,
fourier_im0,
fourier_im1,
glwe_poly.as_ref(),
);
// Now we loop through the polynomials of the output, and add the
// corresponding product of polynomials.
update_with_fmadd(
&mut output_fft_buffer_re0,
&mut output_fft_buffer_re1,
&mut output_fft_buffer_im0,
&mut output_fft_buffer_im1,
output_fft_buffer_re0,
output_fft_buffer_re1,
output_fft_buffer_im0,
output_fft_buffer_im1,
ggsw_row,
&fourier_re0,
&fourier_re1,
&fourier_im0,
&fourier_im1,
fourier_re0,
fourier_re1,
fourier_im0,
fourier_im1,
is_output_uninit,
fourier_poly_size,
);
@@ -495,11 +495,6 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
let output_fft_buffer_re0 = output_fft_buffer_re0;
let output_fft_buffer_re1 = output_fft_buffer_re1;
let output_fft_buffer_im0 = output_fft_buffer_im0;
let output_fft_buffer_im1 = output_fft_buffer_im1;
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer_re0.into_chunks(fourier_poly_size),
@@ -532,11 +527,7 @@ fn collect_next_term<'a, Scalar: UnsignedTorus>(
decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
substack1: &'a mut PodStack,
align: usize,
) -> (
DecompositionLevel,
dyn_stack::DynArray<'a, Scalar>,
PodStack<'a>,
) {
) -> (DecompositionLevel, &'a mut [Scalar], PodStack<'a>) {
let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
(glwe_level, glwe_decomp_term, substack2)

View File

@@ -495,27 +495,19 @@ impl<'a> Fft128View<'a> {
debug_assert_eq!(n, 2 * fourier_im0.len());
debug_assert_eq!(n, 2 * fourier_im1.len());
let (mut tmp_re0, stack) =
let (tmp_re0, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re0.iter().copied());
let (mut tmp_re1, stack) =
let (tmp_re1, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re1.iter().copied());
let (mut tmp_im0, stack) =
let (tmp_im0, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im0.iter().copied());
let (mut tmp_im1, _) =
let (tmp_im1, _) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im1.iter().copied());
self.plan
.inv(&mut tmp_re0, &mut tmp_re1, &mut tmp_im0, &mut tmp_im1);
self.plan.inv(tmp_re0, tmp_re1, tmp_im0, tmp_im1);
let (standard_re, standard_im) = standard.split_at_mut(n / 2);
conv_fn(
standard_re,
standard_im,
&tmp_re0,
&tmp_re1,
&tmp_im0,
&tmp_im1,
);
conv_fn(standard_re, standard_im, tmp_re0, tmp_re1, tmp_im0, tmp_im1);
}
}

View File

@@ -105,9 +105,9 @@ where
if *lwe_mask_element != 0 {
let stack = stack.rb_mut();
// We copy ct_0 to ct_1
let (mut ct1_lo, stack) =
let (ct1_lo, stack) =
stack.collect_aligned(CACHELINE_ALIGN, ct0_lo.as_ref().iter().copied());
let (mut ct1_hi, stack) =
let (ct1_hi, stack) =
stack.collect_aligned(CACHELINE_ALIGN, ct0_hi.as_ref().iter().copied());
let mut ct1_lo = GlweCiphertextMutView::from_container(
&mut *ct1_lo,
@@ -177,9 +177,9 @@ where
let align = CACHELINE_ALIGN;
let ciphertext_modulus = accumulator.ciphertext_modulus();
let (mut local_accumulator_lo, stack) =
let (local_accumulator_lo, stack) =
stack.collect_aligned(align, accumulator.as_ref().iter().map(|i| *i as u64));
let (mut local_accumulator_hi, mut stack) = stack.collect_aligned(
let (local_accumulator_hi, mut stack) = stack.collect_aligned(
align,
accumulator.as_ref().iter().map(|i| (*i >> 64) as u64),
);
@@ -207,7 +207,7 @@ where
fft,
stack.rb_mut(),
);
let (mut local_accumulator, _) = stack.collect_aligned(
let (local_accumulator, _) = stack.collect_aligned(
align,
izip!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
.map(|(&lo, &hi)| lo as u128 | ((hi as u128) << 64)),

View File

@@ -63,32 +63,28 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
ggsw.decomposition_level_count(),
);
let (mut output_fft_buffer_re0, stack) =
let (output_fft_buffer_re0, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_re1, stack) =
let (output_fft_buffer_re1, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_im0, stack) =
let (output_fft_buffer_im0, stack) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
let (mut output_fft_buffer_im1, mut substack0) =
let (output_fft_buffer_im1, mut substack0) =
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
// it has been fully initialized for the first time.
let output_fft_buffer_re0 = &mut *output_fft_buffer_re0;
let output_fft_buffer_re1 = &mut *output_fft_buffer_re1;
let output_fft_buffer_im0 = &mut *output_fft_buffer_im0;
let output_fft_buffer_im1 = &mut *output_fft_buffer_im1;
let mut is_output_uninit = true;
{
// ------------------------------------------------------ EXTERNAL PRODUCT IN FOURIER
// DOMAIN In this section, we perform the external product in the fourier
// domain, and accumulate the result in the output_fft_buffer variable.
let (mut decomposition_states_lo, stack) = substack0
let (decomposition_states_lo, stack) = substack0
.rb_mut()
.make_aligned_raw::<u64>(poly_size * glwe_size, align);
let (mut decomposition_states_hi, mut substack1) =
let (decomposition_states_hi, mut substack1) =
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
let shift = 128 - decomposer.base_log * decomposer.level_count;
@@ -104,6 +100,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
*out_lo = value as u64;
*out_hi = (value >> 64) as u64;
}
// Reborrow so the mut slices are not moved
let decomposition_states_lo = &mut *decomposition_states_lo;
let decomposition_states_hi = &mut *decomposition_states_hi;
let mut current_level = decomposer.level_count;
@@ -118,17 +115,17 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
assert_ne!(current_level, 0);
let glwe_level = DecompositionLevel(current_level);
current_level -= 1;
let (mut glwe_decomp_term_lo, stack) = substack1
let (glwe_decomp_term_lo, stack) = substack1
.rb_mut()
.make_aligned_raw::<u64>(poly_size * glwe_size, align);
let (mut glwe_decomp_term_hi, mut substack2) =
let (glwe_decomp_term_hi, mut substack2) =
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
let base_log = decomposer.base_log;
collect_next_term_split(
&mut glwe_decomp_term_lo,
&mut glwe_decomp_term_hi,
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
decomposition_states_hi,
mod_b_mask_lo,
@@ -136,9 +133,6 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
base_log,
);
let glwe_decomp_term_lo = &mut *glwe_decomp_term_lo;
let glwe_decomp_term_hi = &mut *glwe_decomp_term_hi;
let glwe_decomp_term_lo = GlweCiphertextView::from_container(
&*glwe_decomp_term_lo,
ggsw.polynomial_size(),
@@ -170,16 +164,16 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
) {
let len = fourier_poly_size;
let stack = substack2.rb_mut();
let (mut fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (mut fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
// We perform the forward fft transform for the glwe polynomial
fft.forward_as_integer_split(
&mut fourier_re0,
&mut fourier_re1,
&mut fourier_im0,
&mut fourier_im1,
fourier_re0,
fourier_re1,
fourier_im0,
fourier_im1,
glwe_poly_lo.as_ref(),
glwe_poly_hi.as_ref(),
);
@@ -192,10 +186,10 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
output_fft_buffer_im0,
output_fft_buffer_im1,
ggsw_row,
&fourier_re0,
&fourier_re1,
&fourier_im0,
&fourier_im1,
fourier_re0,
fourier_re1,
fourier_im0,
fourier_im1,
is_output_uninit,
fourier_poly_size,
);
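
The `is_output_uninit` flag implements the lazy zeroing described in the buffer comment: the accumulator is never memset, the first contribution overwrites it, and later contributions add on top. A minimal, self-contained sketch of the idiom (simplified to plain `f64` slices; the real code accumulates Fourier coefficients):

```rust
fn accumulate(acc: &mut [f64], term: &[f64], is_output_uninit: bool) {
    if is_output_uninit {
        // First write: overwrite, so the buffer never needs zero-filling.
        acc.copy_from_slice(term);
    } else {
        // Later writes: accumulate on top of the initialized contents.
        for (a, t) in acc.iter_mut().zip(term) {
            *a += t;
        }
    }
}

fn main() {
    let mut acc = vec![0.0f64; 4]; // stands in for the uninitialized buffer
    let mut is_output_uninit = true;
    for term in [[1.0f64; 4], [2.0f64; 4]] {
        accumulate(&mut acc, &term, is_output_uninit);
        is_output_uninit = false;
    }
    assert_eq!(acc, vec![3.0f64; 4]);
}
```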


@@ -204,7 +204,7 @@ fn test_split_pbs() {
fft: Fft128View<'_>,
stack: PodStack<'_>,
) {
let (mut local_accumulator_data, stack) =
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
let mut local_accumulator = GlweCiphertextMutView::from_container(
&mut *local_accumulator_data,


@@ -1316,17 +1316,16 @@ impl<'a> Fft128View<'a> {
debug_assert_eq!(n, 2 * fourier_im0.len());
debug_assert_eq!(n, 2 * fourier_im1.len());
let (mut tmp_re0, stack) =
let (tmp_re0, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re0.iter().copied());
let (mut tmp_re1, stack) =
let (tmp_re1, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re1.iter().copied());
let (mut tmp_im0, stack) =
let (tmp_im0, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im0.iter().copied());
let (mut tmp_im1, _) =
let (tmp_im1, _) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im1.iter().copied());
self.plan
.inv(&mut tmp_re0, &mut tmp_re1, &mut tmp_im0, &mut tmp_im1);
self.plan.inv(tmp_re0, tmp_re1, tmp_im0, tmp_im1);
let (standard_re_lo, standard_im_lo) = standard_lo.split_at_mut(n / 2);
let (standard_re_hi, standard_im_hi) = standard_hi.split_at_mut(n / 2);
@@ -1335,10 +1334,10 @@ impl<'a> Fft128View<'a> {
standard_re_hi,
standard_im_lo,
standard_im_hi,
&tmp_re0,
&tmp_re1,
&tmp_im0,
&tmp_im1,
tmp_re0,
tmp_re1,
tmp_im0,
tmp_im1,
);
}
}


@@ -353,7 +353,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
lut.as_mut_polynomial_list()
.iter_mut()
.for_each(|mut poly| {
let (mut tmp_poly, _) = stack
let (tmp_poly, _) = stack
.rb_mut()
.make_aligned_raw(poly.as_ref().len(), CACHELINE_ALIGN);
@@ -364,7 +364,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
let (mut ct1, mut stack) = stack.make_aligned_raw(ct0.as_ref().len(), CACHELINE_ALIGN);
let (ct1, mut stack) = stack.make_aligned_raw(ct0.as_ref().len(), CACHELINE_ALIGN);
let mut ct1 =
GlweCiphertextMutView::from_container(&mut *ct1, lut_poly_size, ciphertext_modulus);
@@ -437,7 +437,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
accumulator.ciphertext_modulus()
);
let (mut local_accumulator_data, stack) =
let (local_accumulator_data, stack) =
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
let mut local_accumulator = GlweCiphertextMutView::from_container(
&mut *local_accumulator_data,


@@ -588,7 +588,7 @@ pub fn add_external_product_assign<Scalar>(
ggsw.decomposition_level_count(),
);
let (mut output_fft_buffer, mut substack0) =
let (output_fft_buffer, mut substack0) =
stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size().0, align);
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
@@ -638,13 +638,13 @@ pub fn add_external_product_assign<Scalar>(
glwe_decomp_term.as_polynomial_list().iter()
)
.for_each(|(ggsw_row, glwe_poly)| {
let (mut fourier, substack3) = substack2
let (fourier, substack3) = substack2
.rb_mut()
.make_aligned_raw::<c64>(fourier_poly_size, align);
// We perform the forward fft transform for the glwe polynomial
let fourier = fft
.forward_as_integer(
FourierPolynomialMutView { data: &mut fourier },
FourierPolynomialMutView { data: fourier },
glwe_poly,
substack3,
)
@@ -691,11 +691,7 @@ pub(crate) fn collect_next_term<'a, Scalar: UnsignedTorus>(
decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
substack1: &'a mut PodStack,
align: usize,
) -> (
DecompositionLevel,
dyn_stack::DynArray<'a, Scalar>,
PodStack<'a>,
) {
) -> (DecompositionLevel, &'a mut [Scalar], PodStack<'a>) {
let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
(glwe_level, glwe_decomp_term, substack2)


@@ -123,17 +123,16 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
let align = CACHELINE_ALIGN;
let (mut lwe_in_buffer_data, stack) =
stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
let (lwe_in_buffer_data, stack) = stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
let mut lwe_in_buffer =
LweCiphertext::from_container(&mut *lwe_in_buffer_data, lwe_in.ciphertext_modulus());
let (mut lwe_out_ks_buffer_data, stack) =
let (lwe_out_ks_buffer_data, stack) =
stack.make_aligned_with(ksk.output_lwe_size().0, align, |_| Scalar::ZERO);
let mut lwe_out_ks_buffer =
LweCiphertext::from_container(&mut *lwe_out_ks_buffer_data, ksk.ciphertext_modulus());
let (mut pbs_accumulator_data, stack) =
let (pbs_accumulator_data, stack) =
stack.make_aligned_with(glwe_size.0 * polynomial_size.0, align, |_| Scalar::ZERO);
let mut pbs_accumulator = GlweCiphertextMutView::from_container(
&mut *pbs_accumulator_data,
@@ -144,7 +143,7 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
let lwe_size = glwe_dimension
.to_equivalent_lwe_dimension(polynomial_size)
.to_lwe_size();
let (mut lwe_out_pbs_buffer_data, mut stack) =
let (lwe_out_pbs_buffer_data, mut stack) =
stack.make_aligned_with(lwe_size.0, align, |_| Scalar::ZERO);
let mut lwe_out_pbs_buffer = LweCiphertext::from_container(
&mut *lwe_out_pbs_buffer_data,
@@ -153,26 +152,27 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
// We iterate on the list in reverse as we want to store the extracted MSB at index 0
for (bit_idx, mut output_ct) in lwe_list_out.iter_mut().rev().enumerate() {
// Shift on padding bit
let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
align,
lwe_in_buffer
.as_ref()
.iter()
.map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
);
// Block to keep the lwe_bit_left_shift_buffer_data alive only as long as needed
{
// Shift on padding bit
let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
align,
lwe_in_buffer
.as_ref()
.iter()
.map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
);
// Key switch to input PBS key
keyswitch_lwe_ciphertext(
&ksk,
&LweCiphertext::from_container(
&*lwe_bit_left_shift_buffer_data,
lwe_in.ciphertext_modulus(),
),
&mut lwe_out_ks_buffer,
);
drop(lwe_bit_left_shift_buffer_data);
// Key switch to input PBS key
keyswitch_lwe_ciphertext(
&ksk,
&LweCiphertext::from_container(
lwe_bit_left_shift_buffer_data,
lwe_in.ciphertext_modulus(),
),
&mut lwe_out_ks_buffer,
);
}
// Store the keyswitch output unmodified in the output list (as we need to do other
// computations on the output of the keyswitch)
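
For a 64-bit torus with the message placed at `delta_log`, the shift in the block above moves bit `delta_log + bit_idx` of the input onto bit 63, the padding bit, which is what the subsequent keyswitch and PBS consume. A plain-integer illustration of that arithmetic (hypothetical helper, assuming `ciphertext_n_bits = 64`):

```rust
fn shift_onto_padding_bit(value: u64, delta_log: usize, bit_idx: usize) -> u64 {
    // Moves bit (delta_log + bit_idx) of `value` onto bit 63 (the MSB /
    // padding bit): (delta_log + bit_idx) + (64 - delta_log - bit_idx - 1) = 63.
    value << (64 - delta_log - bit_idx - 1)
}

fn main() {
    let delta_log = 60;
    let v = 1u64 << (delta_log + 2); // only the bit_idx = 2 bit is set
    assert_eq!(shift_onto_padding_bit(v, delta_log, 2), 1u64 << 63);
}
```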
@@ -306,7 +306,7 @@ pub fn circuit_bootstrap_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
);
// Output for every bootstrapping
let (mut lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
let (lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
fourier_bsk_output_lwe_dimension.to_lwe_size().0,
CACHELINE_ALIGN,
|_| Scalar::ZERO,
@@ -384,7 +384,7 @@ pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
let polynomial_size = fourier_bsk.polynomial_size();
let ciphertext_moudulus = lwe_out.ciphertext_modulus();
let (mut lwe_left_shift_buffer_data, stack) =
let (lwe_left_shift_buffer_data, stack) =
stack.make_aligned_with(lwe_in_size.0, CACHELINE_ALIGN, |_| Scalar::ZERO);
let mut lwe_left_shift_buffer = LweCiphertext::from_container(
&mut *lwe_left_shift_buffer_data,
@@ -403,7 +403,7 @@ pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
*shift_buffer_body.data =
(*shift_buffer_body.data).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
let (mut pbs_accumulator_data, stack) = stack.make_aligned_with(
let (pbs_accumulator_data, stack) = stack.make_aligned_with(
polynomial_size.0 * fourier_bsk.glwe_size().0,
CACHELINE_ALIGN,
|_| Scalar::ZERO,
@@ -486,31 +486,31 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
// At index 0 you have the lut that will be loaded, and then the result for each layer gets
// computed at the next index; the last layer's result gets stored in `result`.
// This allows using memory space of C * nb_layer instead of C' * 2^nb_layer
let (mut t_0_data, stack) = stack.make_aligned_with(
let (t_0_data, stack) = stack.make_aligned_with(
polynomial_size.0 * glwe_size.0 * nb_layer,
CACHELINE_ALIGN,
|_| Scalar::ZERO,
);
let (mut t_1_data, stack) = stack.make_aligned_with(
let (t_1_data, stack) = stack.make_aligned_with(
polynomial_size.0 * glwe_size.0 * nb_layer,
CACHELINE_ALIGN,
|_| Scalar::ZERO,
);
let mut t_0 = GlweCiphertextList::from_container(
t_0_data.as_mut(),
t_0_data,
glwe_size,
polynomial_size,
ciphertext_modulus,
);
let mut t_1 = GlweCiphertextList::from_container(
t_1_data.as_mut(),
t_1_data,
glwe_size,
polynomial_size,
ciphertext_modulus,
);
let (mut t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize);
let (t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize);
let mut lut_polynomial_iter = lut_per_layer.iter();
loop {
@@ -565,8 +565,6 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
t_fill[j + 1] += 1;
t_fill[j] = 0;
drop(diff_data);
(j_counter, t0_j, t1_j) = (j_counter_plus_1, t_0_j_plus_1, t_1_j_plus_1);
} else {
assert_eq!(j, nb_layer - 1);
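
The two ping-pong buffers `t_0` and `t_1` are what makes the footprint linear in the number of layers, as the comment above claims: each holds `nb_layer` GLWE ciphertexts, versus one ciphertext per node of a full binary tree. A back-of-the-envelope comparison (hypothetical helpers; `glwe_len` stands for `polynomial_size.0 * glwe_size.0`):

```rust
// Layout used above: two working buffers of nb_layer ciphertexts each.
fn linear_footprint(glwe_len: usize, nb_layer: usize) -> usize {
    2 * glwe_len * nb_layer
}

// Naive layout the comment contrasts with: one ciphertext per leaf of a
// depth-nb_layer tree.
fn naive_footprint(glwe_len: usize, nb_layer: usize) -> usize {
    glwe_len * (1usize << nb_layer)
}

fn main() {
    // e.g. 10 layers: 20 ciphertexts of scratch instead of 1024.
    assert_eq!(linear_footprint(1, 10), 20);
    assert_eq!(naive_footprint(1, 10), 1024);
}
```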
@@ -680,7 +678,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
);
let glwe_size = pfpksk_list.output_key_glwe_dimension().to_glwe_size();
let (mut ggsw_list_data, stack) = stack.make_aligned_with(
let (ggsw_list_data, stack) = stack.make_aligned_with(
lwe_list_in.lwe_ciphertext_count().0 * pfpksk_list.output_polynomial_size().0 / 2
* glwe_size.0
* glwe_size.0
@@ -688,14 +686,14 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
CACHELINE_ALIGN,
|_| c64::default(),
);
let (mut ggsw_res_data, mut stack) = stack.make_aligned_with(
let (ggsw_res_data, mut stack) = stack.make_aligned_with(
pfpksk_list.output_polynomial_size().0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
CACHELINE_ALIGN,
|_| Scalar::ZERO,
);
let mut ggsw_list = FourierGgswCiphertextListMutView::new(
&mut ggsw_list_data,
ggsw_list_data,
lwe_list_in.lwe_ciphertext_count().0,
glwe_size,
pfpksk_list.output_polynomial_size(),
@@ -704,7 +702,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
);
let mut ggsw_res = GgswCiphertext::from_container(
&mut *ggsw_res_data,
ggsw_res_data,
glwe_size,
pfpksk_list.output_polynomial_size(),
base_log_cbs,
@@ -817,15 +815,12 @@ pub fn vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
// the last blind rotation.
let (cmux_ggsw, br_ggsw) = ggsw_list.split_at(log_number_of_luts_for_cmux_tree);
let (mut cmux_tree_lut_res_data, mut stack) =
let (cmux_tree_lut_res_data, mut stack) =
stack.make_aligned_with(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN, |_| {
Scalar::ZERO
});
let mut cmux_tree_lut_res = GlweCiphertext::from_container(
&mut *cmux_tree_lut_res_data,
polynomial_size,
ciphertext_modulus,
);
let mut cmux_tree_lut_res =
GlweCiphertext::from_container(cmux_tree_lut_res_data, polynomial_size, ciphertext_modulus);
cmux_tree_memory_optimized(
cmux_tree_lut_res.as_mut_view(),
@@ -866,7 +861,7 @@ pub fn blind_rotate_assign<Scalar: UnsignedTorus + CastInto<usize>>(
for ggsw in ggsw_list.into_ggsw_iter().rev() {
let ct_0 = lut.as_mut_view();
let (mut ct1_data, stack) = stack
let (ct1_data, stack) = stack
.rb_mut()
.collect_aligned(CACHELINE_ALIGN, ct_0.as_ref().iter().copied());
let mut ct_1 = GlweCiphertext::from_container(

View File

@@ -2,7 +2,7 @@ use crate::core_crypto::commons::math::decomposition::decompose_one_level;
pub use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
use crate::core_crypto::commons::numeric::UnsignedInteger;
use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
use dyn_stack::{DynArray, PodStack};
use dyn_stack::PodStack;
use std::iter::Map;
use std::slice::IterMut;
@@ -18,7 +18,7 @@ pub struct TensorSignedDecompositionLendingIter<'buffers, Scalar: UnsignedIntege
// ...0001111
mod_b_mask: Scalar,
// The internal states of each decomposition
states: DynArray<'buffers, Scalar>,
states: &'buffers mut [Scalar],
// A flag which stores whether the iterator is a fresh one (for the recompose method).
fresh: bool,
}


@@ -532,12 +532,12 @@ impl<'a> FftView<'a> {
let standard = standard.as_mut();
let n = standard.len();
debug_assert_eq!(n, 2 * fourier.len());
let (mut tmp, stack) =
let (tmp, stack) =
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier.iter().copied());
self.plan.inv(&mut tmp, stack);
self.plan.inv(tmp, stack);
let (standard_re, standard_im) = standard.split_at_mut(n / 2);
conv_fn(standard_re, standard_im, &tmp, self.twisties);
conv_fn(standard_re, standard_im, tmp, self.twisties);
}
fn backward_with_conv_in_place<


@@ -89,52 +89,10 @@ pub fn mm256_cvtpd_epi64(simd: V3, x: __m256d) -> __m256i {
#[cfg(feature = "nightly-avx512")]
#[inline(always)]
pub fn mm512_cvtt_roundpd_epi64(simd: V4, x: __m512d) -> __m512i {
// This first one is required for the zmm_reg notation
#[inline]
#[target_feature(enable = "sse")]
#[target_feature(enable = "sse2")]
#[target_feature(enable = "fxsr")]
#[target_feature(enable = "sse3")]
#[target_feature(enable = "ssse3")]
#[target_feature(enable = "sse4.1")]
#[target_feature(enable = "sse4.2")]
#[target_feature(enable = "popcnt")]
#[target_feature(enable = "avx")]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "bmi1")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "fma")]
#[target_feature(enable = "lzcnt")]
#[target_feature(enable = "avx512f")]
#[target_feature(enable = "avx512dq")]
unsafe fn implementation(x: __m512d) -> __m512i {
let mut as_i64x8: __m512i;
// From Intel's documentation the syntax to use this intrinsic is
// Instruction: vcvttpd2qq zmm, zmm
// With Intel syntax, left operand is the destination, right operand is the source
// For the asm! macro
// in: indicates an input register
// out: indicates an output register
// zmm_reg: the avx512 register type
// options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
// pure: no side effect
// nomem: does not reference RAM (only registers)
// nostack: does not alter the state of the stack
core::arch::asm!(
"vcvttpd2qq {dst}, {src}",
src = in(zmm_reg) x,
dst = out(zmm_reg) as_i64x8,
options(pure, nomem, nostack)
);
as_i64x8
}
let _ = simd.avx512dq;
// SAFETY: simd contains an instance of avx512dq, which matches the target feature of
// `implementation`
unsafe { implementation(x) }
_ = simd;
unsafe { _mm512_cvttpd_epi64(x) }
}
/// Convert a vector of i64 values to a vector of f64 values. Not sure how it works.
@@ -174,52 +132,10 @@ pub fn mm256_cvtepi64_pd(simd: V3, x: __m256i) -> __m256d {
#[cfg(feature = "nightly-avx512")]
#[inline(always)]
pub fn mm512_cvtepi64_pd(simd: V4, x: __m512i) -> __m512d {
// This first one is required for the zmm_reg notation
#[inline]
#[target_feature(enable = "sse")]
#[target_feature(enable = "sse2")]
#[target_feature(enable = "fxsr")]
#[target_feature(enable = "sse3")]
#[target_feature(enable = "ssse3")]
#[target_feature(enable = "sse4.1")]
#[target_feature(enable = "sse4.2")]
#[target_feature(enable = "popcnt")]
#[target_feature(enable = "avx")]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "bmi1")]
#[target_feature(enable = "bmi2")]
#[target_feature(enable = "fma")]
#[target_feature(enable = "lzcnt")]
#[target_feature(enable = "avx512f")]
#[target_feature(enable = "avx512dq")]
unsafe fn implementation(x: __m512i) -> __m512d {
let mut as_f64x8: __m512d;
// From Intel's documentation the syntax to use this intrinsic is
// Instruction: vcvtqq2pd zmm, zmm
// With Intel syntax, left operand is the destination, right operand is the source
// For the asm! macro
// in: indicates an input register
// out: indicates an output register
// zmm_reg: the avx512 register type
// options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
// pure: no side effect
// nomem: does not reference RAM (only registers)
// nostack: does not alter the state of the stack
core::arch::asm!(
"vcvtqq2pd {dst}, {src}",
src = in(zmm_reg) x,
dst = out(zmm_reg) as_f64x8,
options(pure, nomem, nostack)
);
as_f64x8
}
let _ = simd.avx512dq;
// SAFETY: simd contains an instance of avx512dq, which matches the target feature of
// `implementation`
unsafe { implementation(x) }
_ = simd;
unsafe { _mm512_cvtepi64_pd(x) }
}
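
Both conversions now lower to the standard AVX512DQ intrinsics (`_mm512_cvttpd_epi64`, `_mm512_cvtepi64_pd`) instead of hand-written `asm!` blocks, with `_ = simd;` keeping the capability token in the signature. A sketch of the resulting shape, on a nightly toolchain with AVX512 stdarch support (the wrapper name is hypothetical; the real functions keep the `V4` token as proof that avx512dq was detected):

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m512d, __m512i, _mm512_cvttpd_epi64};

// The intrinsic replaces the former `vcvttpd2qq {dst}, {src}` inline asm
// one-for-one: truncating f64x8 -> i64x8 conversion.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512dq")]
unsafe fn cvtt_pd_to_i64x8(x: __m512d) -> __m512i {
    _mm512_cvttpd_epi64(x)
}
```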
#[cfg(feature = "nightly-avx512")]


@@ -5,16 +5,16 @@ use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_asyn
use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
use itertools::Itertools;
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
/// GPU-accelerated.
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_extract_lwe_samples_from_glwe_ciphertext_list_async<Scalar>(
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
vec_nth: &[MonomialDegree],
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus,
{
let in_lwe_dim = input_glwe_list
@@ -58,3 +58,25 @@ pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
);
}
}
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
/// GPU-accelerated.
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
vec_nth: &[MonomialDegree],
streams: &CudaStreams,
) where
Scalar: UnsignedTorus,
{
unsafe {
cuda_extract_lwe_samples_from_glwe_ciphertext_list_async(
input_glwe_list,
output_lwe_list,
vec_nth,
streams,
);
}
streams.synchronize();
}
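
This pair shows the convention used throughout the GPU backend: an `unsafe fn *_async` that only enqueues work on the stream, plus a safe wrapper that calls it and synchronizes before returning, so borrows can end safely afterwards. A minimal sketch of that pattern with placeholder types (nothing here is the real CUDA API):

```rust
struct Stream;

impl Stream {
    fn synchronize(&self) {
        // Placeholder: block until all work queued on this stream is done.
    }
}

/// # Safety
/// The stream must be synchronized before the borrows end.
unsafe fn op_async(_input: &[u64], _output: &mut [u64], _stream: &Stream) {
    // Placeholder: enqueue kernels; returns before the GPU finishes.
}

fn op(input: &[u64], output: &mut [u64], stream: &Stream) {
    // SAFETY: we synchronize below, and the borrows held by this function
    // keep the buffers alive until the queued work has completed.
    unsafe { op_async(input, output, stream) };
    stream.synchronize();
}
```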


@@ -0,0 +1,36 @@
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams};
use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until the stream is synchronized
pub unsafe fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async<Scalar>(
lwe_pksk: &CudaLwePackingKeyswitchKey<Scalar>,
input_lwe_ciphertext_list: &CudaLweCiphertextList<Scalar>,
output_glwe_ciphertext: &mut CudaGlweCiphertextList<Scalar>,
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension();
let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension();
let output_polynomial_size = output_glwe_ciphertext.polynomial_size();
packing_keyswitch_list_async(
streams,
&mut output_glwe_ciphertext.0.d_vec,
&input_lwe_ciphertext_list.0.d_vec,
input_lwe_dimension,
output_glwe_dimension,
output_polynomial_size,
&lwe_pksk.d_vec,
lwe_pksk.decomposition_base_log(),
lwe_pksk.decomposition_level_count(),
input_lwe_ciphertext_list.lwe_ciphertext_count(),
);
}


@@ -1,13 +1,15 @@
pub mod glwe_sample_extraction;
pub mod lwe_keyswitch;
pub mod lwe_linear_algebra;
pub mod lwe_multi_bit_programmable_bootstrapping;
pub mod lwe_packing_keyswitch;
pub mod lwe_programmable_bootstrapping;
pub mod glwe_sample_extraction;
mod lwe_keyswitch;
#[cfg(test)]
mod test;
pub use lwe_keyswitch::*;
pub use lwe_linear_algebra::*;
pub use lwe_multi_bit_programmable_bootstrapping::*;
pub use lwe_packing_keyswitch::*;
pub use lwe_programmable_bootstrapping::*;


@@ -0,0 +1,234 @@
use super::*;
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use serde::de::DeserializeOwned;
use serde::Serialize;
const NB_TESTS: usize = 10;
fn generate_keys<Scalar: UnsignedTorus + Sync + Send + Serialize + DeserializeOwned>(
params: PackingKeySwitchTestParams<Scalar>,
streams: &CudaStreams,
rsc: &mut TestResources,
) -> CudaPackingKeySwitchKeys<Scalar> {
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension,
&mut rsc.secret_random_generator,
);
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
&lwe_sk,
&glwe_sk,
params.pbs_base_log,
params.pbs_level,
params.glwe_noise_distribution,
params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&pksk,
params.ciphertext_modulus
));
let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams);
CudaPackingKeySwitchKeys {
lwe_sk,
glwe_sk,
pksk: cuda_pksk,
}
}
fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod<Scalar, P>(params: P)
where
Scalar: UnsignedTorus + CastInto<usize> + Serialize + DeserializeOwned,
P: Into<PackingKeySwitchTestParams<Scalar>>,
PackingKeySwitchTestParams<Scalar>: KeyCacheAccess<Keys = PackingKeySwitchKeys<Scalar>>,
{
let params = params.into();
let lwe_noise_distribution = params.lwe_noise_distribution;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let mut rsc = TestResources::new();
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let mut msg = msg_modulus;
let delta: Scalar = encoding_with_padding / msg_modulus;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(gpu_index);
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let keys = generate_keys(params, &stream, &mut rsc);
let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk);
let plaintext = Plaintext(msg * delta);
let input_lwe = allocate_and_encrypt_new_lwe_ciphertext(
&lwe_sk,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream);
assert!(check_encrypted_content_respects_mod(
&input_lwe,
ciphertext_modulus
));
let mut d_output_glwe = CudaGlweCiphertextList::new(
glwe_sk.glwe_dimension(),
glwe_sk.polynomial_size(),
GlweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
unsafe {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async(
&pksk,
&d_input_lwe,
&mut d_output_glwe,
&stream,
);
}
let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream);
let mut decrypted_plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(output_glwe_list.polynomial_size().0),
);
decrypt_glwe_ciphertext_list(
&glwe_sk,
&output_glwe_list,
&mut decrypted_plaintext_list,
);
let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus;
assert_eq!(msg, decoded);
}
// In coverage, we break after one while loop iteration, as changing message values does not
// yield higher coverage
#[cfg(tarpaulin)]
break;
}
}
fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod<Scalar, P>(params: P)
where
Scalar: UnsignedTorus + CastInto<usize> + Serialize + DeserializeOwned,
P: Into<PackingKeySwitchTestParams<Scalar>>,
PackingKeySwitchTestParams<Scalar>: KeyCacheAccess<Keys = PackingKeySwitchKeys<Scalar>>,
{
let params = params.into();
let lwe_noise_distribution = params.lwe_noise_distribution;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let mut rsc = TestResources::new();
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let mut msg = msg_modulus;
let delta: Scalar = encoding_with_padding / msg_modulus;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(gpu_index);
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let keys = generate_keys(params, &stream, &mut rsc);
let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk);
let mut input_lwe_list = LweCiphertextList::new(
Scalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(glwe_sk.polynomial_size().0),
ciphertext_modulus,
);
let mut input_plaintext_list =
PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0));
encrypt_lwe_ciphertext_list(
&lwe_sk,
&mut input_lwe_list,
&input_plaintext_list,
lwe_noise_distribution,
&mut rsc.encryption_random_generator,
);
let d_input_lwe_list =
CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream);
assert!(check_encrypted_content_respects_mod(
&input_lwe_list,
ciphertext_modulus
));
let mut d_output_glwe = CudaGlweCiphertextList::new(
glwe_sk.glwe_dimension(),
glwe_sk.polynomial_size(),
GlweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
unsafe {
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async(
&pksk,
&d_input_lwe_list,
&mut d_output_glwe,
&stream,
);
}
let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream);
let mut decrypted_plaintext_list = PlaintextList::new(
Scalar::ZERO,
PlaintextCount(output_glwe_list.polynomial_size().0),
);
decrypt_glwe_ciphertext_list(
&glwe_sk,
&output_glwe_list,
&mut decrypted_plaintext_list,
);
decrypted_plaintext_list
.iter_mut()
.for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus);
input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta);
assert_eq!(decrypted_plaintext_list, input_plaintext_list);
}
// In coverage, we break after one while loop iteration, as changing message values does not
// yield higher coverage
#[cfg(tarpaulin)]
break;
}
}
create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod);
create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod);
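
Both tests decode with `round_decode(pt, delta) % msg_modulus`. A simplified, self-contained model of that encode/round-decode round trip (plain `u64` arithmetic; the real helper also handles non-native ciphertext moduli):

```rust
fn encode(msg: u64, delta: u64) -> u64 {
    msg.wrapping_mul(delta)
}

fn round_decode(pt: u64, delta: u64) -> u64 {
    // Round to the nearest multiple of delta, then rescale. Noise below
    // delta / 2 is absorbed by the rounding.
    pt.wrapping_add(delta / 2) / delta
}

fn main() {
    let delta = 1u64 << 60;
    let noisy = encode(3, delta).wrapping_add(12_345); // small noise term
    assert_eq!(round_decode(noisy, delta), 3);
}
```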


@@ -4,8 +4,15 @@ mod glwe_sample_extraction;
mod lwe_keyswitch;
mod lwe_linear_algebra;
mod lwe_multi_bit_programmable_bootstrapping;
mod lwe_packing_keyswitch;
mod lwe_programmable_bootstrapping;
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
pub lwe_sk: LweSecretKey<Vec<Scalar>>,
pub glwe_sk: GlweSecretKey<Vec<Scalar>>,
pub pksk: CudaLwePackingKeyswitchKey<Scalar>,
}
// Macro to generate tests for all parameter sets
macro_rules! create_gpu_parametrized_test{
($name:ident { $($param:ident),* }) => {
@@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{
};
}
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test};


@@ -199,51 +199,6 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
LweCiphertext::from_container(container, self.ciphertext_modulus())
}
/// ```rust
/// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::prelude::{
/// CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize,
/// };
///
/// let mut streams = CudaStreams::new_single_gpu(0);
///
/// let lwe_size = LweSize(743);
/// let ciphertext_modulus = CiphertextModulus::new_native();
/// let lwe_ciphertext_count = LweCiphertextCount(2);
///
/// // Create a new LweCiphertextList
/// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus);
///
/// // Copy to GPU
/// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams);
/// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams);
///
/// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams);
///
/// assert_eq!(lwe_list, lwe_list_copied);
/// ```
pub fn duplicate(&self, streams: &CudaStreams) -> Self {
let lwe_dimension = self.lwe_dimension();
let lwe_ciphertext_count = self.lwe_ciphertext_count();
let ciphertext_modulus = self.ciphertext_modulus();
// Copy to the GPU
let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0);
unsafe {
d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0);
}
streams.synchronize();
let cuda_lwe_list = CudaLweList {
d_vec,
lwe_ciphertext_count,
lwe_dimension,
ciphertext_modulus,
};
Self(cuda_lwe_list)
}
pub(crate) fn lwe_dimension(&self) -> LweDimension {
self.0.lwe_dimension
}

Some files were not shown because too many files have changed in this diff Show More