mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
37 Commits
bc/ci/esti
...
al/fixes
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7dcbd85a83 | ||
|
|
1e453263af | ||
|
|
c258d53625 | ||
|
|
8ddee20a57 | ||
|
|
1d786b7202 | ||
|
|
7267d60e01 | ||
|
|
0148a6ffc8 | ||
|
|
63571a07ae | ||
|
|
6e2908ad4e | ||
|
|
d3d06c905f | ||
|
|
051f33f166 | ||
|
|
11a8f97a1c | ||
|
|
35a9c323a7 | ||
|
|
641f47b775 | ||
|
|
456d0ced1b | ||
|
|
358bcc9a22 | ||
|
|
27a4564d83 | ||
|
|
296e419f6c | ||
|
|
e1a25a10ac | ||
|
|
d9349b3357 | ||
|
|
68e4ac4896 | ||
|
|
3f318a2046 | ||
|
|
d1380794ed | ||
|
|
fe5641ef6d | ||
|
|
3397aa81d2 | ||
|
|
8f10f8f8db | ||
|
|
92be95c6b8 | ||
|
|
990c4d0380 | ||
|
|
1d5abfd5ea | ||
|
|
dfd1beeb47 | ||
|
|
43a007a2fa | ||
|
|
54faf64ecd | ||
|
|
8fe7f9c3cb | ||
|
|
9ed65db03d | ||
|
|
9413d3e722 | ||
|
|
2000feb87e | ||
|
|
594a5cee25 |
2
.github/workflows/aws_tfhe_fast_tests.yml
vendored
2
.github/workflows/aws_tfhe_fast_tests.yml
vendored
@@ -56,7 +56,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
47
.github/workflows/aws_tfhe_integer_tests.yml
vendored
47
.github/workflows/aws_tfhe_integer_tests.yml
vendored
@@ -19,21 +19,48 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
@@ -60,7 +87,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -103,7 +130,7 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (unsigned-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, unsigned-integer-tests ]
|
||||
needs: [setup-instance, unsigned-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
|
||||
@@ -19,21 +19,48 @@ on:
|
||||
# Allows you to run this workflow manually from the Actions tab as an alternative.
|
||||
workflow_dispatch:
|
||||
pull_request:
|
||||
types: [ labeled ]
|
||||
types: [labeled]
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
# Nightly tests @ 3AM after each work day
|
||||
- cron: "0 3 * * MON-FRI"
|
||||
|
||||
jobs:
|
||||
should-run:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pull-requests: write
|
||||
outputs:
|
||||
integer_test: ${{ github.event_name == 'workflow_dispatch' ||
|
||||
steps.changed-files.outputs.integer_any_changed }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
integer:
|
||||
- tfhe/Cargo.toml
|
||||
- concrete-csprng/**
|
||||
- tfhe-zk-pok/**
|
||||
- tfhe/src/core_crypto/**
|
||||
- tfhe/src/shortint/**
|
||||
- tfhe/src/integer/**
|
||||
|
||||
setup-instance:
|
||||
name: Setup instance (signed-integer-tests)
|
||||
if: (github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
name: Setup instance (unsigned-integer-tests)
|
||||
needs: should-run
|
||||
if:
|
||||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs' && needs.should-run.outputs.integer_test == 'true') ||
|
||||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
|
||||
(github.event_name == 'pull_request' && contains(github.event.label.name, 'approved')) ||
|
||||
github.event_name == 'workflow_dispatch'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
@@ -60,7 +87,7 @@ jobs:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
persist-credentials: "false"
|
||||
|
||||
- name: Set up home
|
||||
run: |
|
||||
@@ -107,7 +134,7 @@ jobs:
|
||||
teardown-instance:
|
||||
name: Teardown instance (signed-integer-tests)
|
||||
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
|
||||
needs: [ setup-instance, signed-integer-tests ]
|
||||
needs: [setup-instance, signed-integer-tests]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Stop instance
|
||||
|
||||
2
.github/workflows/aws_tfhe_tests.yml
vendored
2
.github/workflows/aws_tfhe_tests.yml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
2
.github/workflows/boolean_benchmark.yml
vendored
2
.github/workflows/boolean_benchmark.yml
vendored
@@ -98,7 +98,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_boolean
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
19
.github/workflows/cargo_build.yml
vendored
19
.github/workflows/cargo_build.yml
vendored
@@ -19,14 +19,21 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [large_ubuntu_16, macos-latest-large, large_windows_16_latest]
|
||||
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
|
||||
# even with a few PRs
|
||||
os: [large_ubuntu_16, macos-latest, windows-latest]
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@7b1c307e0dcbda6122208f10795a713336a9b35a
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
|
||||
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
|
||||
@@ -36,27 +43,33 @@ jobs:
|
||||
make check_newline
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build concrete-csprng
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_concrete_csprng
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_core AVX512_SUPPORT=ON
|
||||
make build_core_experimental AVX512_SUPPORT=ON
|
||||
|
||||
- name: Build Release boolean
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_boolean
|
||||
|
||||
- name: Build Release shortint
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_shortint
|
||||
|
||||
- name: Build Release integer
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_integer
|
||||
|
||||
@@ -65,10 +78,12 @@ jobs:
|
||||
make build_tfhe_full
|
||||
|
||||
- name: Build Release c_api
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
|
||||
2
.github/workflows/code_coverage.yml
vendored
2
.github/workflows/code_coverage.yml
vendored
@@ -57,7 +57,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
files_yaml: |
|
||||
tfhe:
|
||||
|
||||
2
.github/workflows/core_crypto_benchmark.yml
vendored
2
.github/workflows/core_crypto_benchmark.yml
vendored
@@ -86,7 +86,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -128,7 +128,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
4
.github/workflows/gpu_4090_benchmark.yml
vendored
4
.github/workflows/gpu_4090_benchmark.yml
vendored
@@ -82,7 +82,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_multi_bit_gpu_default
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
@@ -164,7 +164,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_core_crypto
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
2
.github/workflows/gpu_fast_h100_tests.yml
vendored
2
.github/workflows/gpu_fast_h100_tests.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
2
.github/workflows/gpu_fast_tests.yml
vendored
2
.github/workflows/gpu_fast_tests.yml
vendored
@@ -33,7 +33,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
@@ -42,7 +42,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
@@ -34,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
@@ -41,7 +41,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
|
||||
2
.github/workflows/integer_cpu_benchmark.yml
vendored
2
.github/workflows/integer_cpu_benchmark.yml
vendored
@@ -139,7 +139,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
4
.github/workflows/integer_gpu_benchmark.yml
vendored
4
.github/workflows/integer_gpu_benchmark.yml
vendored
@@ -124,7 +124,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -147,7 +147,7 @@ jobs:
|
||||
parse_integer_benches
|
||||
|
||||
- name: Upload csv results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_csv_integer
|
||||
path: ${{ env.PARSE_INTEGER_BENCH_CSV_FILE }}
|
||||
@@ -167,7 +167,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -164,7 +164,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -144,7 +144,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
2
.github/workflows/make_release.yml
vendored
2
.github/workflows/make_release.yml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe
|
||||
- uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
|
||||
- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
# Publish new release of tfhe-rs on various platform.
|
||||
name: Publish concrete-csprng release
|
||||
|
||||
on:
|
||||
@@ -37,6 +36,6 @@ jobs:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "concrete-csprng release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "concrete-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
36
.github/workflows/make_release_concrete_tfhe_versionable.yml
vendored
Normal file
@@ -0,0 +1,36 @@
|
||||
name: Publish tfhe-versionable release
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
|
||||
jobs:
|
||||
publish_release:
|
||||
name: Publish tfhe-versionable Release
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable-derive --token ${{ env.CRATES_TOKEN }}
|
||||
cargo publish -p tfhe-versionable --token ${{ env.CRATES_TOKEN }}
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@4e5fb42d249be6a45a298f3c9543b111b02f7907
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
4
.github/workflows/parameters_check.yml
vendored
4
.github/workflows/parameters_check.yml
vendored
@@ -14,7 +14,7 @@ on:
|
||||
|
||||
jobs:
|
||||
params-curves-security-check:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- name: Checkout tfhe-rs
|
||||
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
|
||||
@@ -24,7 +24,7 @@ jobs:
|
||||
with:
|
||||
repository: malb/lattice-estimator
|
||||
path: lattice_estimator
|
||||
ref: '53508253629d3b5d31a2ad110e85dc69391ccb95'
|
||||
ref: 'e80ec6bbbba212428b0e92d0467c18629cf9ed67'
|
||||
|
||||
- name: Install Sage
|
||||
run: |
|
||||
|
||||
2
.github/workflows/shortint_cpu_benchmark.yml
vendored
2
.github/workflows/shortint_cpu_benchmark.yml
vendored
@@ -141,7 +141,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_shortint_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
@@ -139,7 +139,7 @@ jobs:
|
||||
--throughput
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
4
.github/workflows/wasm_client_benchmark.yml
vendored
4
.github/workflows/wasm_client_benchmark.yml
vendored
@@ -39,7 +39,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -130,7 +130,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_wasm
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
6
.github/workflows/zk_pke_benchmark.yml
vendored
6
.github/workflows/zk_pke_benchmark.yml
vendored
@@ -36,7 +36,7 @@ jobs:
|
||||
|
||||
- name: Check for file changes
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
|
||||
uses: tj-actions/changed-files@40853de9f8ce2d6cfdc73c1b96f14e22ba44aec4
|
||||
with:
|
||||
since_last_remote_commit: true
|
||||
files_yaml: |
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
if: needs.setup-instance.result != 'skipped'
|
||||
needs: setup-instance
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}
|
||||
group: ${{ github.workflow }}_${{github.event_name}}_${{ github.ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
|
||||
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
|
||||
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
|
||||
steps:
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
--append-results
|
||||
|
||||
- name: Upload parsed results artifact
|
||||
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a
|
||||
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874
|
||||
with:
|
||||
name: ${{ github.sha }}_integer_zk
|
||||
path: ${{ env.RESULTS_FILENAME }}
|
||||
|
||||
2
Makefile
2
Makefile
@@ -18,7 +18,7 @@ FAST_TESTS?=FALSE
|
||||
FAST_BENCH?=FALSE
|
||||
NIGHTLY_TESTS?=FALSE
|
||||
BENCH_OP_FLAVOR?=DEFAULT
|
||||
NODE_VERSION=22.4
|
||||
NODE_VERSION=22.6
|
||||
FORWARD_COMPAT?=OFF
|
||||
BACKWARD_COMPAT_DATA_URL=https://github.com/zama-ai/tfhe-backward-compat-data.git
|
||||
BACKWARD_COMPAT_DATA_BRANCH?=v0.1
|
||||
|
||||
@@ -159,7 +159,7 @@ To run this code, use the following command:
|
||||
> Note that when running code that uses `TFHE-rs`, it is highly recommended
|
||||
to run in release mode with cargo's `--release` flag to have the best performances possible.
|
||||
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/getting-started/quick_start)*
|
||||
*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick_start)*
|
||||
|
||||
<p align="right">
|
||||
<a href="#about" > ↑ Back to top </a>
|
||||
|
||||
@@ -148,10 +148,9 @@ where
|
||||
|
||||
/// Computes one turn of the stream, updating registers and outputting the new bit.
|
||||
pub fn next_bool(&mut self) -> T {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => set_server_key(sk.clone()),
|
||||
None => (),
|
||||
};
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
set_server_key(sk.clone());
|
||||
}
|
||||
|
||||
let [o, a, b, c] = self.get_output_and_values(0);
|
||||
|
||||
@@ -226,18 +225,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let mut values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut ret = Vec::<T>::with_capacity(64);
|
||||
|
||||
@@ -237,18 +237,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut bytes = Vec::<T>::with_capacity(8);
|
||||
|
||||
@@ -120,10 +120,9 @@ where
|
||||
|
||||
/// Computes one turn of the stream, updating registers and outputting the new bit.
|
||||
pub fn next_bool(&mut self) -> T {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => set_server_key(sk.clone()),
|
||||
None => (),
|
||||
};
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
set_server_key(sk.clone());
|
||||
}
|
||||
|
||||
let [o, a, b, c] = self.get_output_and_values(0);
|
||||
|
||||
@@ -196,18 +195,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let mut values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut ret = Vec::<T>::with_capacity(64);
|
||||
|
||||
@@ -187,18 +187,12 @@ where
|
||||
/// Computes 64 turns of the stream, outputting the 64 bits (in 8 bytes) all at once in a
|
||||
/// Vec (first value is oldest, last is newest)
|
||||
pub fn next_64(&mut self) -> Vec<T> {
|
||||
match &self.fhe_key {
|
||||
Some(sk) => {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
None => (),
|
||||
if let Some(sk) = &self.fhe_key {
|
||||
rayon::broadcast(|_| set_server_key(sk.clone()));
|
||||
}
|
||||
let values = self.get_64_output_and_values();
|
||||
match &self.fhe_key {
|
||||
Some(_) => {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
None => (),
|
||||
if self.fhe_key.is_some() {
|
||||
rayon::broadcast(|_| unset_server_key());
|
||||
}
|
||||
|
||||
let mut bytes = Vec::<T>::with_capacity(8);
|
||||
|
||||
156
backends/tfhe-cuda-backend/cuda/include/compression.h
Normal file
156
backends/tfhe-cuda-backend/cuda/include/compression.h
Normal file
@@ -0,0 +1,156 @@
|
||||
#ifndef CUDA_INTEGER_COMPRESSION_H
|
||||
#define CUDA_INTEGER_COMPRESSION_H
|
||||
|
||||
#include "integer.h"
|
||||
|
||||
extern "C" {
|
||||
void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
|
||||
int8_t *mem_ptr);
|
||||
|
||||
void cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *glwe_in, void *indexes_array,
|
||||
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr);
|
||||
|
||||
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
|
||||
void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void);
|
||||
}
|
||||
|
||||
template <typename Torus> struct int_compression {
|
||||
int_radix_params compression_params;
|
||||
uint32_t storage_log_modulus;
|
||||
uint32_t lwe_per_glwe;
|
||||
|
||||
uint32_t body_count;
|
||||
|
||||
// Compression
|
||||
int8_t *fp_ks_buffer;
|
||||
Torus *tmp_lwe;
|
||||
Torus *tmp_glwe_array_out;
|
||||
|
||||
int_compression(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
this->compression_params = compression_params;
|
||||
this->lwe_per_glwe = lwe_per_glwe;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->body_count = num_radix_blocks;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
|
||||
tmp_lwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
tmp_glwe_array_out = (Torus *)cuda_malloc_async(
|
||||
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
|
||||
scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
streams[0], gpu_indexes[0], &fp_ks_buffer,
|
||||
compression_params.glwe_dimension, compression_params.polynomial_size,
|
||||
num_radix_blocks, true);
|
||||
}
|
||||
}
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]);
|
||||
cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0],
|
||||
&fp_ks_buffer);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_decompression {
|
||||
int_radix_params encryption_params;
|
||||
int_radix_params compression_params;
|
||||
|
||||
uint32_t storage_log_modulus;
|
||||
|
||||
uint32_t body_count;
|
||||
|
||||
Torus *tmp_extracted_glwe;
|
||||
Torus *tmp_extracted_lwe;
|
||||
|
||||
int_radix_lut<Torus> *carry_extract_lut;
|
||||
|
||||
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, int_radix_params encryption_params,
|
||||
int_radix_params compression_params,
|
||||
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory) {
|
||||
this->encryption_params = encryption_params;
|
||||
this->compression_params = compression_params;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->body_count = num_radix_blocks;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
|
||||
carry_extract_lut = new int_radix_lut<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory);
|
||||
|
||||
tmp_extracted_glwe = (Torus *)cuda_malloc_async(
|
||||
glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]);
|
||||
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks *
|
||||
(compression_params.glwe_dimension *
|
||||
compression_params.polynomial_size +
|
||||
1) *
|
||||
sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
// Decompression
|
||||
// Carry extract LUT
|
||||
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
|
||||
return x / encryption_params.message_modulus;
|
||||
};
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0],
|
||||
carry_extract_lut->get_lut(gpu_indexes[0], 0),
|
||||
encryption_params.glwe_dimension, encryption_params.polynomial_size,
|
||||
encryption_params.message_modulus, encryption_params.carry_modulus,
|
||||
carry_extract_f);
|
||||
|
||||
carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
|
||||
}
|
||||
}
|
||||
void release(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]);
|
||||
cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]);
|
||||
|
||||
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete (carry_extract_lut);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CUDA_INTEGER_H
|
||||
#define CUDA_INTEGER_H
|
||||
|
||||
#include "keyswitch.h"
|
||||
#include "pbs/programmable_bootstrap.cuh"
|
||||
#include "programmable_bootstrap.h"
|
||||
#include "programmable_bootstrap_multibit.h"
|
||||
@@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE {
|
||||
LEFT_ROTATE = 2,
|
||||
RIGHT_ROTATE = 3
|
||||
};
|
||||
enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 };
|
||||
enum BITOP_TYPE {
|
||||
BITAND = 0,
|
||||
BITOR = 1,
|
||||
@@ -475,7 +475,8 @@ struct int_radix_params {
|
||||
message_modulus(message_modulus), carry_modulus(carry_modulus){};
|
||||
|
||||
void print() {
|
||||
printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, "
|
||||
printf("pbs_type: %u, glwe_dimension: %u, "
|
||||
"polynomial_size: %u, "
|
||||
"big_lwe_dimension: %u, "
|
||||
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
|
||||
"%u, pbs_base_log: "
|
||||
@@ -812,7 +813,6 @@ template <typename Torus> struct int_radix_lut {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_bit_extract_luts_buffer {
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *lut;
|
||||
|
||||
@@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
|
||||
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_lwes);
|
||||
|
||||
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **fp_ks_buffer);
|
||||
}
|
||||
|
||||
#endif // CNCRT_KS_H_
|
||||
|
||||
@@ -1,17 +1,3 @@
|
||||
set(SOURCES
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h
|
||||
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h)
|
||||
file(GLOB_RECURSE SOURCES "*.cu")
|
||||
add_library(tfhe_cuda_backend STATIC ${SOURCES})
|
||||
set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
|
||||
|
||||
@@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
auto lwe_out = lwe_array_out + input_id * lwe_output_size;
|
||||
|
||||
// We assume each GLWE will store the first polynomial_size inputs
|
||||
uint32_t nth_per_glwe = params::degree;
|
||||
auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size;
|
||||
uint32_t lwe_per_glwe = params::degree;
|
||||
auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size;
|
||||
|
||||
auto nth = nth_array[input_id];
|
||||
|
||||
@@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *lwe_array_out, Torus *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
dim3 grid(num_glwes);
|
||||
dim3 grid(num_nths);
|
||||
dim3 thds(params::degree / params::opt);
|
||||
sample_extract<Torus, params><<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, glwe_array_in, nth_array, glwe_dimension);
|
||||
|
||||
@@ -27,7 +27,7 @@ private:
|
||||
|
||||
public:
|
||||
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
|
||||
uint32_t num_poly = 1)
|
||||
uint32_t num_poly)
|
||||
: base_log(base_log), level_count(level_count), num_poly(num_poly),
|
||||
state(state) {
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
host_keyswitch_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_output_indexes),
|
||||
@@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
|
||||
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cuda_keyswitch_lwe_ciphertext_vector(
|
||||
host_keyswitch_lwe_ciphertext_vector(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_output_indexes),
|
||||
@@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes), static_cast<uint64_t *>(ksk),
|
||||
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
void scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
|
||||
bool allocate_gpu_memory) {
|
||||
scratch_packing_keyswitch_lwe_list_to_glwe<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, fp_ks_buffer,
|
||||
glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
|
||||
}
|
||||
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
|
||||
* ciphertexts.
|
||||
*/
|
||||
void cuda_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in,
|
||||
void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension,
|
||||
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_lwes) {
|
||||
|
||||
host_packing_keyswitch_lwe_list_to_glwe(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(glwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(fp_ksk_array), fp_ks_buffer, input_lwe_dimension,
|
||||
output_glwe_dimension, output_polynomial_size, base_log, level_count,
|
||||
num_lwes);
|
||||
}
|
||||
|
||||
void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **fp_ks_buffer) {
|
||||
cuda_drop_async(*fp_ks_buffer, static_cast<cudaStream_t>(stream), gpu_index);
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "torus.cuh"
|
||||
#include "utils/helper.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
@@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
|
||||
__host__ void host_keyswitch_lwe_ciphertext_vector(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
|
||||
Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
@@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
|
||||
|
||||
// Compute Keyswitch
|
||||
cuda_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
host_keyswitch_lwe_ciphertext_vector<Torus>(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, current_lwe_array_in,
|
||||
current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out,
|
||||
@@ -154,4 +155,154 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
|
||||
cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
|
||||
|
||||
if (allocate_gpu_memory)
|
||||
*fp_ks_buffer = (int8_t *)cuda_malloc_async(
|
||||
2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream,
|
||||
gpu_index);
|
||||
}
|
||||
|
||||
// public functional packing keyswitch for a single LWE ciphertext
|
||||
//
|
||||
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
|
||||
// different thread blocks at the x-axis to work on that input.
|
||||
template <typename Torus>
|
||||
__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
|
||||
Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
size_t glwe_size = (glwe_dimension + 1);
|
||||
|
||||
if (tid < glwe_size * polynomial_size) {
|
||||
const int local_index = threadIdx.x;
|
||||
// the output_glwe is split in polynomials and each x-block takes one of
|
||||
// them
|
||||
size_t poly_id = blockIdx.x;
|
||||
size_t coef_per_block = blockDim.x;
|
||||
|
||||
// number of coefficients inside fp-ksk block for each lwe_input coefficient
|
||||
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
|
||||
|
||||
// initialize accumulator to 0
|
||||
glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in],
|
||||
tid == glwe_dimension * polynomial_size);
|
||||
|
||||
// Iterate through all lwe elements
|
||||
for (int i = 0; i < lwe_dimension_in; i++) {
|
||||
// Round and prepare decomposition
|
||||
Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count);
|
||||
|
||||
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
|
||||
Torus mod_b_mask = (1ll << base_log) - 1ll;
|
||||
|
||||
// block of key for current lwe coefficient (cur_input_lwe[i])
|
||||
auto ksk_block = &fp_ksk[i * ksk_block_size];
|
||||
for (int j = 0; j < level_count; j++) {
|
||||
auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size];
|
||||
// Iterate through each level and multiply by the ksk piece
|
||||
auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block];
|
||||
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
|
||||
glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// public functional packing keyswitch for a batch of LWE ciphertexts
|
||||
//
|
||||
// Selects the input each thread is working on using the y-block index.
|
||||
//
|
||||
// Assumes there are (glwe_dimension+1) * polynomial_size threads split through
|
||||
// different thread blocks at the x-axis to work on that input.
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in,
|
||||
Torus *fp_ksk, uint32_t lwe_dimension_in,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, Torus *d_mem) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
|
||||
const int lwe_size = (lwe_dimension_in + 1);
|
||||
|
||||
const int input_id = blockIdx.y;
|
||||
const int degree = input_id;
|
||||
|
||||
// Select an input
|
||||
auto lwe_in = lwe_array_in + input_id * lwe_size;
|
||||
auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size;
|
||||
auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size;
|
||||
// KS LWE to GLWE
|
||||
packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext(
|
||||
ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension,
|
||||
polynomial_size, base_log, level_count);
|
||||
|
||||
// P * x ^degree
|
||||
auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size;
|
||||
auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size;
|
||||
polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree,
|
||||
tid % polynomial_size,
|
||||
polynomial_size, 1, true);
|
||||
}
|
||||
|
||||
/// To-do: Rewrite this kernel for efficiency
|
||||
template <typename Torus>
|
||||
__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t num_lwes) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < (glwe_dimension + 1) * polynomial_size) {
|
||||
glwe_out[tid] = glwe_array_in[tid];
|
||||
|
||||
// Accumulate
|
||||
for (int i = 1; i < num_lwes; i++) {
|
||||
auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size;
|
||||
glwe_out[tid] += glwe_in[tid];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_packing_keyswitch_lwe_list_to_glwe(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out,
|
||||
Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer,
|
||||
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_lwes) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads);
|
||||
|
||||
dim3 grid(num_blocks, num_lwes);
|
||||
dim3 threads(num_threads);
|
||||
|
||||
auto d_mem = (Torus *)fp_ks_buffer;
|
||||
auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size;
|
||||
|
||||
// individually keyswitch each lwe
|
||||
packing_keyswitch_lwe_list_to_glwe<<<grid, threads, 0, stream>>>(
|
||||
d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// accumulate to a single glwe
|
||||
accumulate_glwes<<<num_blocks, threads, 0, stream>>>(
|
||||
glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size,
|
||||
num_lwes);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
#include <limits>
|
||||
|
||||
template <typename T>
|
||||
@@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus<uint64_t>(double x,
|
||||
template <typename T>
|
||||
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
T shift = sizeof(T) * 8 - level_count * base_log;
|
||||
T mask = 1ll << (shift - 1);
|
||||
T b = (x & mask) >> (shift - 1);
|
||||
const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log;
|
||||
const T shift = non_rep_bit_count - 1;
|
||||
T res = x >> shift;
|
||||
res += b;
|
||||
res <<= shift;
|
||||
return res;
|
||||
res += 1;
|
||||
res &= (T)(-2);
|
||||
return res << shift;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void modulus_switch(T input, T &output,
|
||||
uint32_t log_modulus) {
|
||||
constexpr uint32_t BITS = sizeof(T) * 8;
|
||||
|
||||
output = input + (((T)1) << (BITS - log_modulus - 1));
|
||||
output >>= (BITS - log_modulus);
|
||||
}
|
||||
@@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) {
|
||||
return output;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void modulus_switch_inplace(Torus *array, int size,
|
||||
uint32_t log_modulus) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (tid < size) {
|
||||
array[tid] = modulus_switch(array[tid], log_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_modulus_switch_inplace(cudaStream_t stream,
|
||||
uint32_t gpu_index, Torus *array,
|
||||
int size, uint32_t log_modulus) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
int num_threads = 0, num_blocks = 0;
|
||||
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
|
||||
|
||||
modulus_switch_inplace<<<num_blocks, num_threads, 0, stream>>>(array, size,
|
||||
log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
#endif // CNCRT_TORUS_H
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
#include "compression.cuh"
|
||||
|
||||
void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params compression_params(
|
||||
pbs_type, compression_glwe_dimension, compression_polynomial_size,
|
||||
(compression_glwe_dimension + 1) * compression_polynomial_size,
|
||||
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
scratch_cuda_compress_integer_radix_ciphertext_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
|
||||
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
|
||||
int_radix_params encryption_params(
|
||||
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
|
||||
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
|
||||
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
int_radix_params compression_params(
|
||||
pbs_type, compression_glwe_dimension, compression_polynomial_size,
|
||||
(compression_glwe_dimension + 1) * compression_polynomial_size,
|
||||
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
|
||||
carry_modulus);
|
||||
|
||||
scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
|
||||
compression_params, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
void cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths,
|
||||
int8_t *mem_ptr) {
|
||||
|
||||
host_integer_compress<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(glwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in), (uint64_t **)(fp_ksk), num_nths,
|
||||
(int_compression<uint64_t> *)mem_ptr);
|
||||
}
|
||||
void cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
void *lwe_array_out, void *glwe_in, void *indexes_array,
|
||||
uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) {
|
||||
|
||||
host_integer_decompress<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
static_cast<uint64_t *>(lwe_array_out), static_cast<uint64_t *>(glwe_in),
|
||||
static_cast<uint32_t *>(indexes_array), indexes_array_size, bsks,
|
||||
(int_decompression<uint64_t> *)mem_ptr);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams,
|
||||
uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_compression<uint64_t> *mem_ptr =
|
||||
(int_compression<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr_void) {
|
||||
|
||||
int_decompression<uint64_t> *mem_ptr =
|
||||
(int_decompression<uint64_t> *)(*mem_ptr_void);
|
||||
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
|
||||
}
|
||||
@@ -0,0 +1,238 @@
|
||||
#ifndef CUDA_INTEGER_COMPRESSION_CUH
|
||||
#define CUDA_INTEGER_COMPRESSION_CUH
|
||||
|
||||
#include "ciphertext.h"
|
||||
#include "compression.h"
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/integer.cuh"
|
||||
#include "linearalgebra/multiplication.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
|
||||
uint32_t in_len, uint32_t len) {
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
|
||||
auto i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i < len) {
|
||||
auto k = nbits * i / log_modulus;
|
||||
auto j = k;
|
||||
|
||||
auto start_shift = i * nbits - j * log_modulus;
|
||||
|
||||
auto value = array_in[j] >> start_shift;
|
||||
j++;
|
||||
|
||||
while (j * log_modulus < ((i + 1) * nbits) && j < in_len) {
|
||||
auto shift = j * log_modulus - i * nbits;
|
||||
value |= array_in[j] << shift;
|
||||
j++;
|
||||
}
|
||||
|
||||
array_out[i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *array_out, Torus *array_in, uint32_t num_inputs,
|
||||
uint32_t body_count, int_compression<Torus> *mem_ptr) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto params = mem_ptr->compression_params;
|
||||
|
||||
auto log_modulus = mem_ptr->storage_log_modulus;
|
||||
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
|
||||
auto number_bits_to_pack = in_len * log_modulus;
|
||||
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
// number_bits_to_pack.div_ceil(Scalar::BITS)
|
||||
auto len = (number_bits_to_pack + nbits - 1) / nbits;
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
getNumBlocksAndThreads(len, 128, num_blocks, num_threads);
|
||||
|
||||
dim3 grid(num_blocks);
|
||||
dim3 threads(num_threads);
|
||||
pack<<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus, in_len,
|
||||
len);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_compress(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *glwe_array_out, Torus *lwe_array_in,
|
||||
Torus **fp_ksk, uint32_t num_lwes,
|
||||
int_compression<Torus> *mem_ptr) {
|
||||
|
||||
auto compression_params = mem_ptr->compression_params;
|
||||
auto input_lwe_dimension = compression_params.small_lwe_dimension;
|
||||
|
||||
// Shift
|
||||
auto lwe_shifted = mem_ptr->tmp_lwe;
|
||||
host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted,
|
||||
lwe_array_in,
|
||||
(uint64_t)compression_params.message_modulus,
|
||||
input_lwe_dimension, num_lwes);
|
||||
|
||||
uint32_t lwe_in_size = input_lwe_dimension + 1;
|
||||
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
|
||||
|
||||
// Keyswitch LWEs to GLWE
|
||||
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
|
||||
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
|
||||
for (int i = 0; i < num_glwes; i++) {
|
||||
auto lwe_subset = lwe_shifted + i * lwe_in_size;
|
||||
auto glwe_out = tmp_glwe_array_out + i * glwe_out_size;
|
||||
|
||||
host_packing_keyswitch_lwe_list_to_glwe(
|
||||
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
|
||||
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
|
||||
compression_params.polynomial_size, compression_params.ks_base_log,
|
||||
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
|
||||
}
|
||||
|
||||
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);
|
||||
|
||||
// Modulus switch
|
||||
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
|
||||
num_glwes *
|
||||
(compression_params.glwe_dimension *
|
||||
compression_params.polynomial_size +
|
||||
body_count),
|
||||
mem_ptr->storage_log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out,
|
||||
num_glwes, body_count, mem_ptr);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
|
||||
uint32_t log_modulus, uint32_t initial_out_len) {
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
|
||||
auto i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
if (i < initial_out_len) {
|
||||
// Unpack
|
||||
Torus mask = ((Torus)1 << log_modulus) - 1;
|
||||
auto start = i * log_modulus;
|
||||
auto end = (i + 1) * log_modulus;
|
||||
|
||||
auto start_block = start / nbits;
|
||||
auto start_remainder = start % nbits;
|
||||
|
||||
auto end_block_inclusive = (end - 1) / nbits;
|
||||
|
||||
Torus unpacked_i;
|
||||
if (start_block == end_block_inclusive) {
|
||||
auto single_part = array_in[start_block] >> start_remainder;
|
||||
unpacked_i = single_part & mask;
|
||||
} else {
|
||||
auto first_part = array_in[start_block] >> start_remainder;
|
||||
auto second_part = array_in[start_block + 1] << (nbits - start_remainder);
|
||||
|
||||
unpacked_i = (first_part | second_part) & mask;
|
||||
}
|
||||
|
||||
// Extract
|
||||
glwe_array_out[i] = unpacked_i << (nbits - log_modulus);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *glwe_array_out, Torus *array_in,
|
||||
uint32_t glwe_index,
|
||||
int_decompression<Torus> *mem_ptr) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
auto params = mem_ptr->compression_params;
|
||||
|
||||
auto log_modulus = mem_ptr->storage_log_modulus;
|
||||
|
||||
uint32_t body_count = mem_ptr->body_count;
|
||||
auto initial_out_len =
|
||||
params.glwe_dimension * params.polynomial_size + body_count * body_count;
|
||||
|
||||
// We assure the tail of the glwe is zeroed
|
||||
auto zeroed_slice =
|
||||
glwe_array_out + params.glwe_dimension * params.polynomial_size;
|
||||
cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks);
|
||||
dim3 threads(num_threads);
|
||||
extract<<<grid, threads, 0, stream>>>(glwe_array_out, array_in, glwe_index,
|
||||
log_modulus, initial_out_len);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void
|
||||
host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
uint32_t gpu_count, Torus *lwe_array_out,
|
||||
Torus *packed_glwe_in, uint32_t *indexes_array,
|
||||
uint32_t indexes_array_size, void **bsks,
|
||||
int_decompression<Torus> *mem_ptr) {
|
||||
|
||||
auto extracted_glwe = mem_ptr->tmp_extracted_glwe;
|
||||
auto compression_params = mem_ptr->compression_params;
|
||||
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
|
||||
mem_ptr);
|
||||
|
||||
auto num_lwes = mem_ptr->body_count;
|
||||
|
||||
// Sample extract
|
||||
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
|
||||
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
|
||||
extracted_glwe, indexes_array, indexes_array_size,
|
||||
compression_params.glwe_dimension,
|
||||
compression_params.polynomial_size);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
auto encryption_params = mem_ptr->encryption_params;
|
||||
auto carry_extract_lut = mem_ptr->carry_extract_lut;
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, gpu_count, lwe_array_out,
|
||||
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
|
||||
carry_extract_lut->lut_indexes_vec, extracted_lwe,
|
||||
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
|
||||
encryption_params.glwe_dimension,
|
||||
compression_params.glwe_dimension * compression_params.polynomial_size,
|
||||
encryption_params.polynomial_size, encryption_params.pbs_base_log,
|
||||
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
|
||||
encryption_params.pbs_type);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_compress_integer_radix_ciphertext_64(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
|
||||
int_radix_params compression_params, uint32_t lwe_per_glwe,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_compression<Torus>(
|
||||
streams, gpu_indexes, gpu_count, compression_params, num_lwes,
|
||||
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
|
||||
int_radix_params encryption_params, int_radix_params compression_params,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_decompression<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
|
||||
num_lwes, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
#endif
|
||||
@@ -765,7 +765,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in,
|
||||
}
|
||||
|
||||
if (num_radix_blocks % 2 == 1) {
|
||||
// We couldn't pack the last block, so we just copy it
|
||||
// We couldn't host_pack the last block, so we just copy it
|
||||
Torus *lsb_block =
|
||||
lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1);
|
||||
Torus *last_block =
|
||||
|
||||
@@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_blocks, 1);
|
||||
size_t sm_size = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
|
||||
@@ -133,7 +133,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
|
||||
PANIC("Unknown operation")
|
||||
}
|
||||
|
||||
// pack bits into one block so that we have
|
||||
// host_pack bits into one block so that we have
|
||||
// control_bit|b|a
|
||||
cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes,
|
||||
streams[0], gpu_indexes[0]); // Do we need this?
|
||||
|
||||
@@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(cleartext_array_in),
|
||||
input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_cleartext_vec_multiplication(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<uint32_t *>(lwe_array_in),
|
||||
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
/*
|
||||
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
|
||||
@@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
|
||||
void *cleartext_array_in, uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
host_cleartext_multiplication(static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(cleartext_array_in),
|
||||
input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
host_cleartext_vec_multiplication(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t *>(lwe_array_in),
|
||||
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
|
||||
input_lwe_ciphertext_count);
|
||||
}
|
||||
|
||||
@@ -14,9 +14,10 @@
|
||||
#include <vector>
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
|
||||
uint32_t input_lwe_dimension, uint32_t num_entries) {
|
||||
__global__ void cleartext_vec_multiplication(T *output, T *lwe_input,
|
||||
T *cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
@@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void
|
||||
host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index,
|
||||
T *output, T *lwe_input, T *cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
// lwe_size includes the presence of the body
|
||||
// whereas lwe_dimension is the number of elements in the mask
|
||||
int lwe_size = input_lwe_dimension + 1;
|
||||
// Create a 1-dimensional grid of threads
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
int num_entries = input_lwe_ciphertext_count * lwe_size;
|
||||
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
|
||||
dim3 grid(num_blocks, 1, 1);
|
||||
dim3 thds(num_threads, 1, 1);
|
||||
|
||||
cleartext_vec_multiplication<<<grid, thds, 0, stream>>>(
|
||||
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
cleartext_multiplication(T *output, T *lwe_input, T cleartext_input,
|
||||
uint32_t input_lwe_dimension, uint32_t num_entries) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
int index = blockIdx.x * blockDim.x + tid;
|
||||
if (index < num_entries) {
|
||||
// Here we take advantage of the wrapping behaviour of uint
|
||||
output[index] = lwe_input[index] * cleartext_input;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ void
|
||||
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
|
||||
T *output, T *lwe_input, T *cleartext_input,
|
||||
T *output, T *lwe_input, T cleartext_input,
|
||||
uint32_t input_lwe_dimension,
|
||||
uint32_t input_lwe_ciphertext_count) {
|
||||
|
||||
|
||||
@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
// the resulting constant coefficient of the accumulator
|
||||
// For the mask it's more complicated
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
glwe_dimension, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
glwe_dimension, 0);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -98,8 +98,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
|
||||
1);
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
synchronize_threads_in_block();
|
||||
@@ -111,13 +111,13 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
accumulator, accumulator_rotated, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -125,7 +125,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
accumulator_rotated, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -150,9 +150,9 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -98,12 +98,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
|
||||
1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -129,9 +130,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
|
||||
@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
|
||||
// Persist
|
||||
int tid = threadIdx.x;
|
||||
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(global_slice,
|
||||
accumulator, a_hat);
|
||||
accumulator, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -215,9 +215,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Persist the updated accumulator
|
||||
|
||||
@@ -102,8 +102,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
|
||||
synchronize_threads_in_block();
|
||||
// Multiply by the bsk element
|
||||
polynomial_product_accumulate_by_monomial<Torus, params>(
|
||||
accumulator, bsk_poly, monomial_degree, false);
|
||||
polynomial_accumulate_monic_monomial_mul<Torus>(
|
||||
accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree,
|
||||
params::opt, false);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
@@ -209,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
|
||||
// Persist
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -224,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -323,9 +324,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
|
||||
0);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
|
||||
0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,13 +115,13 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
accumulator, accumulator_rotated, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -154,9 +154,9 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -106,12 +106,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
|
||||
1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -137,9 +138,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
|
||||
@@ -45,7 +45,7 @@ template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void
|
||||
divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
const T *__restrict__ input, uint32_t j,
|
||||
bool zeroAcc, uint32_t num_poly = 1) {
|
||||
bool zeroAcc, uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
|
||||
@@ -94,7 +94,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
|
||||
@@ -133,7 +133,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
|
||||
int level_count,
|
||||
uint32_t num_poly = 1) {
|
||||
uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
|
||||
@@ -192,7 +192,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
|
||||
// Extracts the body of the nth-LWE in a GLWE.
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
|
||||
uint32_t glwe_dimension, uint32_t nth = 0) {
|
||||
uint32_t glwe_dimension, uint32_t nth) {
|
||||
// Set first coefficient of the glwe as the body of the LWE sample
|
||||
lwe_array_out[glwe_dimension * params::degree] =
|
||||
glwe[glwe_dimension * params::degree + nth];
|
||||
@@ -201,8 +201,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
|
||||
// Extracts the mask from the nth-LWE in a GLWE.
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
|
||||
uint32_t glwe_dimension = 1,
|
||||
uint32_t nth = 0) {
|
||||
uint32_t glwe_dimension, uint32_t nth) {
|
||||
for (int z = 0; z < glwe_dimension; z++) {
|
||||
Torus *lwe_array_out_slice =
|
||||
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
|
||||
|
||||
@@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
}
|
||||
}
|
||||
|
||||
// If init_accumulator is set, assumes that result was not initialized and does
|
||||
// that with the outcome of first * second
|
||||
template <typename T, class params>
|
||||
__device__ void
|
||||
polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
|
||||
uint64_t monomial_degree,
|
||||
bool init_accumulator = false) {
|
||||
// monomial_degree \in [0, 2 * params::degree)
|
||||
int full_cycles_count = monomial_degree / params::degree;
|
||||
int remainder_degrees = monomial_degree % params::degree;
|
||||
// This method expects to work with polynomial_size / compression_params::opt
|
||||
// threads in the x-block If init_accumulator is set, assumes that result was
|
||||
// not initialized and does that with the outcome of first * second
|
||||
template <typename T>
|
||||
__device__ void polynomial_accumulate_monic_monomial_mul(
|
||||
T *result, const T *__restrict__ poly, uint64_t monomial_degree,
|
||||
uint32_t tid, uint32_t polynomial_size, int coeff_per_thread,
|
||||
bool init_accumulator = false) {
|
||||
// monomial_degree \in [0, 2 * compression_params::degree)
|
||||
int full_cycles_count = monomial_degree / polynomial_size;
|
||||
int remainder_degrees = monomial_degree % polynomial_size;
|
||||
|
||||
int pos = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
int pos = tid;
|
||||
for (int i = 0; i < coeff_per_thread; i++) {
|
||||
T element = poly[pos];
|
||||
int new_pos = (pos + monomial_degree) % params::degree;
|
||||
int new_pos = (pos + monomial_degree) % polynomial_size;
|
||||
|
||||
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
|
||||
x = SEL(-x, x, new_pos >= remainder_degrees);
|
||||
@@ -78,7 +79,7 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly,
|
||||
result[new_pos] = x;
|
||||
else
|
||||
result[new_pos] += x;
|
||||
pos += params::degree / params::opt;
|
||||
pos += polynomial_size / coeff_per_thread;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -311,6 +311,40 @@ extern "C" {
|
||||
num_samples: u32,
|
||||
);
|
||||
|
||||
/// This scratch function allocates the necessary amount of data on the GPU for
|
||||
/// the public function packing keyswitch implementation on 64-bit
|
||||
pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
stream: *mut c_void,
|
||||
gpu_index: u32,
|
||||
fp_ks_buffer: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
);
|
||||
|
||||
/// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts
|
||||
pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
|
||||
stream: *mut c_void,
|
||||
gpu_index: u32,
|
||||
glwe_array_out: *mut c_void,
|
||||
lwe_array_in: *const c_void,
|
||||
fp_ksk_array: *const c_void,
|
||||
fp_ks_buffer: *mut i8,
|
||||
input_lwe_dimension: u32,
|
||||
output_glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
base_log: u32,
|
||||
level_count: u32,
|
||||
num_lwes: u32,
|
||||
);
|
||||
|
||||
pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
|
||||
stream: *mut c_void,
|
||||
gpu_index: u32,
|
||||
fp_ks_buffer: *mut *mut i8,
|
||||
);
|
||||
|
||||
/// Perform the negation of a u64 input LWE ciphertext vector.
|
||||
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
|
||||
/// - `gpu_index` is the index of the GPU to be used in the kernel launch
|
||||
@@ -484,6 +518,80 @@ extern "C" {
|
||||
mem_ptr: *mut *mut i8,
|
||||
);
|
||||
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
num_lwes: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
lwe_per_glwe: u32,
|
||||
storage_log_modulus: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
);
|
||||
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
encryption_glwe_dimension: u32,
|
||||
encryption_polynomial_size: u32,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
num_lwes: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
storage_log_modulus: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
);
|
||||
|
||||
pub fn cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
glwe_array_out: *mut c_void,
|
||||
lwe_array_in: *const c_void,
|
||||
fp_ksk: *const *mut c_void,
|
||||
num_lwes: u32,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
|
||||
pub fn cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
lwe_out: *mut c_void,
|
||||
glwe_array_in: *const c_void,
|
||||
indexes_array: *const c_void,
|
||||
indexes_array_size: u32,
|
||||
bsks: *const *mut c_void,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
|
||||
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
);
|
||||
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
);
|
||||
|
||||
pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
|
||||
streams: *const *mut c_void,
|
||||
gpu_indexes: *const u32,
|
||||
|
||||
@@ -20,7 +20,7 @@ def main(args):
|
||||
bench_function_id = bench_data["function_id"]
|
||||
|
||||
split = bench_function_id.split("::")
|
||||
if split.len() == 5: # Signed integers
|
||||
if len(split) == 5: # Signed integers
|
||||
(_, _, function_name, parameter_set, bits) = split
|
||||
else: # Unsigned integers
|
||||
(_, function_name, parameter_set, bits) = split
|
||||
@@ -53,7 +53,8 @@ def main(args):
|
||||
estimate_upper_bound_ms,
|
||||
)
|
||||
)
|
||||
except:
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pass
|
||||
|
||||
if len(data) == 0:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tfhe"
|
||||
version = "0.8.0-alpha.2"
|
||||
version = "0.8.0-alpha.3"
|
||||
edition = "2021"
|
||||
readme = "../README.md"
|
||||
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography"]
|
||||
@@ -62,12 +62,12 @@ lazy_static = { version = "1.4.0", optional = true }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
rayon = { version = "1.5.0" }
|
||||
bincode = "1.3.3"
|
||||
concrete-fft = { version = "0.4.1", features = ["serde", "fft128"] }
|
||||
concrete-ntt = { version = "0.1.2" }
|
||||
pulp = "0.18.8"
|
||||
concrete-fft = { version = "0.5.0", features = ["serde", "fft128"] }
|
||||
concrete-ntt = { version = "0.2.0" }
|
||||
pulp = "0.18.22"
|
||||
tfhe-cuda-backend = { version = "0.4.0-alpha.0", path = "../backends/tfhe-cuda-backend", optional = true }
|
||||
aligned-vec = { version = "0.5", features = ["serde"] }
|
||||
dyn-stack = { version = "0.9" }
|
||||
dyn-stack = { version = "0.10" }
|
||||
paste = "1.0.7"
|
||||
fs2 = { version = "0.4.3", optional = true }
|
||||
# Used for OPRF in shortint
|
||||
@@ -76,7 +76,7 @@ sha3 = { version = "0.10", optional = true }
|
||||
itertools = "0.11.0"
|
||||
rand_core = { version = "0.6.4", features = ["std"] }
|
||||
tfhe-zk-pok = { version = "0.3.0-alpha.0", path = "../tfhe-zk-pok", optional = true }
|
||||
tfhe-versionable = { version = "0.2.0", path = "../utils/tfhe-versionable" }
|
||||
tfhe-versionable = { version = "0.2.1", path = "../utils/tfhe-versionable" }
|
||||
|
||||
# wasm deps
|
||||
wasm-bindgen = { version = "0.2.86", features = [
|
||||
|
||||
@@ -4,7 +4,19 @@ use tfhe::integer::{ClientKey, RadixCiphertext};
|
||||
use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
|
||||
fn glwe_packing(c: &mut Criterion) {
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::core_crypto::gpu::CudaStreams;
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::integer::gpu::gen_keys_radix_gpu;
|
||||
|
||||
fn cpu_glwe_packing(c: &mut Criterion) {
|
||||
let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
|
||||
let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
@@ -12,6 +24,9 @@ fn glwe_packing(c: &mut Criterion) {
|
||||
let bench_name = "integer_packing_compression";
|
||||
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
|
||||
let cks = ClientKey::new(param);
|
||||
|
||||
@@ -29,7 +44,6 @@ fn glwe_packing(c: &mut Criterion) {
|
||||
64,
|
||||
128,
|
||||
256,
|
||||
256,
|
||||
comp_param.lwe_per_glwe.0 * log_message_modulus,
|
||||
] {
|
||||
assert_eq!(num_bits % log_message_modulus, 0);
|
||||
@@ -73,9 +87,86 @@ fn glwe_packing(c: &mut Criterion) {
|
||||
}
|
||||
}
|
||||
|
||||
criterion_group!(glwe_packing2, glwe_packing);
|
||||
#[cfg(feature = "gpu")]
|
||||
fn gpu_glwe_packing(c: &mut Criterion) {
|
||||
let bench_name = "integer_cuda_packing_compression";
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
bench_group
|
||||
.sample_size(15)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
|
||||
let stream = CudaStreams::new_multi_gpu();
|
||||
|
||||
let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
|
||||
let log_message_modulus = param.message_modulus.0.ilog2() as usize;
|
||||
|
||||
for bit_size in [
|
||||
8,
|
||||
16,
|
||||
32,
|
||||
64,
|
||||
128,
|
||||
256,
|
||||
comp_param.lwe_per_glwe.0 * log_message_modulus,
|
||||
] {
|
||||
assert_eq!(bit_size % log_message_modulus, 0);
|
||||
let num_blocks = bit_size / log_message_modulus;
|
||||
|
||||
// Generate private compression key
|
||||
let cks = ClientKey::new(param);
|
||||
let private_compression_key = cks.new_compression_private_key(comp_param);
|
||||
|
||||
// Generate and convert compression keys
|
||||
let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream);
|
||||
let (compressed_compression_key, compressed_decompression_key) =
|
||||
radix_cks.new_compressed_compression_decompression_keys(&private_compression_key);
|
||||
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream);
|
||||
let cuda_decompression_key =
|
||||
compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &stream);
|
||||
|
||||
// Encrypt
|
||||
let ct = cks.encrypt_radix(0_u32, num_blocks);
|
||||
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream);
|
||||
|
||||
// Benchmark
|
||||
let mut builder = CudaCompressedCiphertextListBuilder::new();
|
||||
|
||||
builder.push(d_ct, &stream);
|
||||
|
||||
let bench_id = format!("pack_u{bit_size}");
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
b.iter(|| {
|
||||
let compressed = builder.build(&cuda_compression_key, &stream);
|
||||
|
||||
_ = black_box(compressed);
|
||||
})
|
||||
});
|
||||
|
||||
let compressed = builder.build(&cuda_compression_key, &stream);
|
||||
|
||||
let bench_id = format!("unpack_u{bit_size}");
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
b.iter(|| {
|
||||
let unpacked: CudaRadixCiphertext =
|
||||
compressed.get(0, &cuda_decompression_key, &stream);
|
||||
|
||||
_ = black_box(unpacked);
|
||||
})
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
criterion_group!(gpu_glwe_packing2, gpu_glwe_packing);
|
||||
criterion_group!(cpu_glwe_packing2, cpu_glwe_packing);
|
||||
|
||||
fn main() {
|
||||
glwe_packing2();
|
||||
#[cfg(feature = "gpu")]
|
||||
gpu_glwe_packing2();
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
cpu_glwe_packing2();
|
||||
|
||||
Criterion::default().configure_from_args().final_summary();
|
||||
}
|
||||
|
||||
@@ -51,7 +51,10 @@ fn pke_zk_proof(c: &mut Criterion) {
|
||||
|
||||
for bits in [640usize, 1280, 4096] {
|
||||
assert_eq!(bits % 64, 0);
|
||||
let num_block = 64usize.div_ceil(param_pke.message_modulus.0.ilog2() as usize);
|
||||
// Packing, so we take the message and carry modulus to compute our block count
|
||||
let num_block = 64usize.div_ceil(
|
||||
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
|
||||
);
|
||||
|
||||
use rand::Rng;
|
||||
let mut rng = rand::thread_rng();
|
||||
@@ -128,7 +131,10 @@ fn pke_zk_verify(c: &mut Criterion, results_file: &Path) {
|
||||
|
||||
for bits in [640usize, 1280, 4096] {
|
||||
assert_eq!(bits % 64, 0);
|
||||
let num_block = 64usize.div_ceil(param_pke.message_modulus.0.ilog2() as usize);
|
||||
// Packing, so we take the message and carry modulus to compute our block count
|
||||
let num_block = 64usize.div_ceil(
|
||||
(param_pke.message_modulus.0 * param_pke.carry_modulus.0).ilog2() as usize,
|
||||
);
|
||||
|
||||
use rand::Rng;
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
@@ -14,7 +14,7 @@ fn oprf(c: &mut Criterion) {
|
||||
let keys = KEY_CACHE.get_from_param(param);
|
||||
let sks = keys.server_key();
|
||||
|
||||
bench_group.bench_function(&format!("2-bits-oprf::{}", param.name()), |b| {
|
||||
bench_group.bench_function(format!("2-bits-oprf::{}", param.name()), |b| {
|
||||
b.iter(|| {
|
||||
_ = black_box(sks.generate_oblivious_pseudo_random(Seed(0), 2));
|
||||
})
|
||||
|
||||
@@ -13,6 +13,7 @@ const {
|
||||
CompressedFheInt8,
|
||||
FheInt8,
|
||||
FheInt32,
|
||||
FheTypes,
|
||||
CompressedFheInt128,
|
||||
FheInt128,
|
||||
CompressedFheInt256,
|
||||
@@ -354,7 +355,6 @@ test('hlapi_public_key_encrypt_decrypt_int256_small', (t) => {
|
||||
});
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
/// 32 bits compact
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
@@ -423,19 +423,37 @@ test('hlapi_compact_ciphertext_list', (t) => {
|
||||
let clear_i32 = -3284;
|
||||
let clear_bool = true;
|
||||
let clear_u256 = generateRandomBigInt(256);
|
||||
let clear_u2048 = generateRandomBigInt(2048);
|
||||
|
||||
let builder = CompactCiphertextList.builder(publicKey);
|
||||
builder.push_u2(clear_u2);
|
||||
builder.push_i32(clear_i32);
|
||||
builder.push_boolean(clear_bool);
|
||||
builder.push_u256(clear_u256);
|
||||
builder.push_u2048(clear_u2048);
|
||||
let list = builder.build();
|
||||
|
||||
let serialized = list.safe_serialize(BigInt(10000000));
|
||||
let deserialized = CompactCiphertextList.safe_deserialize(serialized, BigInt(10000000));
|
||||
|
||||
assert.deepStrictEqual(deserialized.is_empty(), false);
|
||||
assert.deepStrictEqual(deserialized.len(), 5);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(0), FheTypes.Uint2);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(1), FheTypes.Int32);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(2), FheTypes.Bool);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(3), FheTypes.Uint256);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(4), FheTypes.Uint2048);
|
||||
|
||||
let expander = deserialized.expand();
|
||||
|
||||
assert.deepStrictEqual(expander.is_empty(), false);
|
||||
assert.deepStrictEqual(expander.len(), 5);
|
||||
assert.deepStrictEqual(expander.get_kind_of(0), FheTypes.Uint2);
|
||||
assert.deepStrictEqual(expander.get_kind_of(1), FheTypes.Int32);
|
||||
assert.deepStrictEqual(expander.get_kind_of(2), FheTypes.Bool);
|
||||
assert.deepStrictEqual(expander.get_kind_of(3), FheTypes.Uint256);
|
||||
assert.deepStrictEqual(expander.get_kind_of(4), FheTypes.Uint2048);
|
||||
|
||||
assert.deepStrictEqual(
|
||||
expander.get_uint2(0).decrypt(clientKey),
|
||||
clear_u2,
|
||||
@@ -455,6 +473,12 @@ test('hlapi_compact_ciphertext_list', (t) => {
|
||||
expander.get_uint256(3).decrypt(clientKey),
|
||||
clear_u256,
|
||||
);
|
||||
|
||||
assert.deepStrictEqual(
|
||||
expander.get_uint2048(4).decrypt(clientKey),
|
||||
clear_u2048,
|
||||
);
|
||||
|
||||
});
|
||||
|
||||
test('hlapi_compact_ciphertext_list_with_proof', (t) => {
|
||||
@@ -489,5 +513,12 @@ test('hlapi_compact_ciphertext_list_with_proof', (t) => {
|
||||
let serialized = list.safe_serialize(BigInt(10000000));
|
||||
let deserialized = ProvenCompactCiphertextList.safe_deserialize(serialized, BigInt(10000000));
|
||||
|
||||
assert.deepStrictEqual(deserialized.is_empty(), false);
|
||||
assert.deepStrictEqual(deserialized.len(), 4);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(0), FheTypes.Uint2);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(1), FheTypes.Int32);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(2), FheTypes.Bool);
|
||||
assert.deepStrictEqual(deserialized.get_kind_of(3), FheTypes.Uint256);
|
||||
|
||||
// We cannot verify packed ZK in wasm
|
||||
});
|
||||
|
||||
@@ -36,6 +36,9 @@ pub enum FheTypes {
|
||||
Type_FheUint128,
|
||||
Type_FheUint160,
|
||||
Type_FheUint256,
|
||||
Type_FheUint512,
|
||||
Type_FheUint1024,
|
||||
Type_FheUint2048,
|
||||
Type_FheInt2,
|
||||
Type_FheInt4,
|
||||
Type_FheInt6,
|
||||
@@ -68,6 +71,9 @@ impl From<crate::FheTypes> for FheTypes {
|
||||
crate::FheTypes::Uint128 => Self::Type_FheUint128,
|
||||
crate::FheTypes::Uint160 => Self::Type_FheUint160,
|
||||
crate::FheTypes::Uint256 => Self::Type_FheUint256,
|
||||
crate::FheTypes::Uint512 => Self::Type_FheUint512,
|
||||
crate::FheTypes::Uint1024 => Self::Type_FheUint1024,
|
||||
crate::FheTypes::Uint2048 => Self::Type_FheUint2048,
|
||||
crate::FheTypes::Int2 => Self::Type_FheInt2,
|
||||
crate::FheTypes::Int4 => Self::Type_FheInt4,
|
||||
crate::FheTypes::Int6 => Self::Type_FheInt6,
|
||||
|
||||
@@ -250,13 +250,10 @@ pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
|
||||
if *lwe_mask_element != 0u64 {
|
||||
let stack = stack.rb_mut();
|
||||
// We copy ct_0 to ct_1
|
||||
let (mut ct1, stack) =
|
||||
let (ct1, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, ct0.as_ref().iter().copied());
|
||||
let mut ct1 = GlweCiphertextMutView::from_container(
|
||||
&mut *ct1,
|
||||
lut_poly_size,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
let mut ct1 =
|
||||
GlweCiphertextMutView::from_container(ct1, lut_poly_size, ciphertext_modulus);
|
||||
|
||||
// We rotate ct_1 by performing ct_1 <- ct_1 * X^{a_hat}
|
||||
for mut poly in ct1.as_mut_polynomial_list().iter_mut() {
|
||||
@@ -503,10 +500,10 @@ pub fn programmable_bootstrap_ntt64_lwe_ciphertext_mem_optimized<
|
||||
accumulator.ciphertext_modulus()
|
||||
);
|
||||
|
||||
let (mut local_accumulator_data, stack) =
|
||||
let (local_accumulator_data, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
|
||||
let mut local_accumulator = GlweCiphertextMutView::from_container(
|
||||
&mut *local_accumulator_data,
|
||||
local_accumulator_data,
|
||||
accumulator.polynomial_size(),
|
||||
accumulator.ciphertext_modulus(),
|
||||
);
|
||||
@@ -568,12 +565,11 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
|
||||
out.ciphertext_modulus(),
|
||||
);
|
||||
|
||||
let (mut output_fft_buffer, mut substack0) =
|
||||
let (output_fft_buffer, mut substack0) =
|
||||
stack.make_aligned_raw::<u64>(poly_size * ggsw.glwe_size().0, align);
|
||||
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
|
||||
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
|
||||
// it has been fully initialized for the first time.
|
||||
let output_fft_buffer = &mut *output_fft_buffer;
|
||||
let mut is_output_uninit = true;
|
||||
|
||||
{
|
||||
@@ -616,17 +612,16 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
|
||||
glwe_decomp_term.as_polynomial_list().iter()
|
||||
)
|
||||
.for_each(|(ggsw_row, glwe_poly)| {
|
||||
let (mut ntt_poly, _) =
|
||||
substack2.rb_mut().make_aligned_raw::<u64>(poly_size, align);
|
||||
let (ntt_poly, _) = substack2.rb_mut().make_aligned_raw::<u64>(poly_size, align);
|
||||
// We perform the forward ntt transform for the glwe polynomial
|
||||
ntt.forward(PolynomialMutView::from_container(&mut ntt_poly), glwe_poly);
|
||||
ntt.forward(PolynomialMutView::from_container(ntt_poly), glwe_poly);
|
||||
// Now we loop through the polynomials of the output, and add the
|
||||
// corresponding product of polynomials.
|
||||
|
||||
update_with_fmadd_ntt64(
|
||||
output_fft_buffer,
|
||||
ggsw_row.as_ref(),
|
||||
&ntt_poly,
|
||||
ntt_poly,
|
||||
is_output_uninit,
|
||||
poly_size,
|
||||
ntt,
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::core_crypto::commons::math::decomposition::{
|
||||
};
|
||||
use crate::core_crypto::commons::numeric::UnsignedInteger;
|
||||
use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
|
||||
use dyn_stack::{DynArray, PodStack, ReborrowMut};
|
||||
use dyn_stack::{PodStack, ReborrowMut};
|
||||
|
||||
/// An iterator that yields the terms of the signed decomposition of an integer.
|
||||
///
|
||||
@@ -288,9 +288,9 @@ pub struct TensorSignedDecompositionLendingIterNonNative<'buffers> {
|
||||
// ...0001111
|
||||
mod_b_mask: u64,
|
||||
// The internal states of each decomposition
|
||||
states: DynArray<'buffers, u64>,
|
||||
states: &'buffers mut [u64],
|
||||
// Corresponding input signs
|
||||
input_signs: DynArray<'buffers, u8>,
|
||||
input_signs: &'buffers mut [u8],
|
||||
// A flag which stores whether the iterator is a fresh one (for the recompose method).
|
||||
fresh: bool,
|
||||
ciphertext_modulus: u64,
|
||||
@@ -306,9 +306,9 @@ impl<'buffers> TensorSignedDecompositionLendingIterNonNative<'buffers> {
|
||||
) -> (Self, PodStack<'buffers>) {
|
||||
let shift = modulus.ceil_ilog2() as usize - decomposer.base_log * decomposer.level_count;
|
||||
let input_size = input.len();
|
||||
let (mut states, stack) =
|
||||
let (states, stack) =
|
||||
stack.make_aligned_raw::<u64>(input_size, aligned_vec::CACHELINE_ALIGN);
|
||||
let (mut input_signs, stack) =
|
||||
let (input_signs, stack) =
|
||||
stack.make_aligned_raw::<u8>(input_size, aligned_vec::CACHELINE_ALIGN);
|
||||
|
||||
for ((i, state), sign) in input
|
||||
@@ -393,11 +393,7 @@ impl<'buffers> TensorSignedDecompositionLendingIterNonNative<'buffers> {
|
||||
&mut self,
|
||||
substack1: &'a mut PodStack,
|
||||
align: usize,
|
||||
) -> (
|
||||
DecompositionLevel,
|
||||
dyn_stack::DynArray<'a, u64>,
|
||||
PodStack<'a>,
|
||||
) {
|
||||
) -> (DecompositionLevel, &'a mut [u64], PodStack<'a>) {
|
||||
let (glwe_level, _, glwe_decomp_term) = self.next_term().unwrap();
|
||||
let (glwe_decomp_term, substack2) =
|
||||
substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
|
||||
|
||||
@@ -200,18 +200,25 @@ impl<Scalar: UnsignedInteger> ParameterSetConformant
|
||||
&self,
|
||||
lwe_ct_parameters: &GlweCiphertextConformanceParameters<Scalar>,
|
||||
) -> bool {
|
||||
let log_modulus = self.packed_integers.log_modulus.0;
|
||||
let Self {
|
||||
packed_integers,
|
||||
glwe_dimension,
|
||||
polynomial_size,
|
||||
bodies_count,
|
||||
uncompressed_ciphertext_modulus,
|
||||
} = self;
|
||||
let log_modulus = packed_integers.log_modulus.0;
|
||||
|
||||
let number_bits_to_unpack =
|
||||
(self.glwe_dimension.0 * self.polynomial_size.0 + self.bodies_count.0) * log_modulus;
|
||||
(glwe_dimension.0 * polynomial_size.0 + bodies_count.0) * log_modulus;
|
||||
|
||||
let len = number_bits_to_unpack.div_ceil(Scalar::BITS);
|
||||
|
||||
self.packed_integers.packed_coeffs.len() == len
|
||||
&& self.glwe_dimension == lwe_ct_parameters.glwe_dim
|
||||
&& self.polynomial_size == lwe_ct_parameters.polynomial_size
|
||||
packed_integers.packed_coeffs.len() == len
|
||||
&& *glwe_dimension == lwe_ct_parameters.glwe_dim
|
||||
&& *polynomial_size == lwe_ct_parameters.polynomial_size
|
||||
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
|
||||
&& self.uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -147,16 +147,22 @@ impl<Scalar: UnsignedInteger> ParameterSetConformant
|
||||
type ParameterSet = LweCiphertextParameters<Scalar>;
|
||||
|
||||
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<Scalar>) -> bool {
|
||||
let lwe_size = self.lwe_dimension.to_lwe_size().0;
|
||||
let Self {
|
||||
packed_integers,
|
||||
lwe_dimension,
|
||||
uncompressed_ciphertext_modulus,
|
||||
} = self;
|
||||
|
||||
let number_bits_to_pack = lwe_size * self.packed_integers.log_modulus.0;
|
||||
let lwe_size = lwe_dimension.to_lwe_size().0;
|
||||
|
||||
let number_bits_to_pack = lwe_size * packed_integers.log_modulus.0;
|
||||
|
||||
let len = number_bits_to_pack.div_ceil(Scalar::BITS);
|
||||
|
||||
self.packed_integers.packed_coeffs.len() == len
|
||||
&& self.lwe_dimension == lwe_ct_parameters.lwe_dim
|
||||
packed_integers.packed_coeffs.len() == len
|
||||
&& *lwe_dimension == lwe_ct_parameters.lwe_dim
|
||||
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
|
||||
&& self.uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
&& matches!(
|
||||
lwe_ct_parameters.ms_decompression_method,
|
||||
MsDecompressionType::ClassicPbs
|
||||
|
||||
@@ -403,24 +403,33 @@ impl MultiBitModulusSwitchedCt for FromCompressionMultiBitModulusSwitchedCt {
|
||||
impl<Scalar: UnsignedInteger + CastInto<usize> + CastFrom<usize>> ParameterSetConformant
|
||||
for CompressedModulusSwitchedMultiBitLweCiphertext<Scalar>
|
||||
{
|
||||
type ParameterSet = LweCiphertextParameters<u64>;
|
||||
type ParameterSet = LweCiphertextParameters<Scalar>;
|
||||
|
||||
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<u64>) -> bool {
|
||||
let lwe_dim = self.lwe_dimension.0;
|
||||
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<Scalar>) -> bool {
|
||||
let Self {
|
||||
body,
|
||||
packed_mask,
|
||||
packed_diffs,
|
||||
lwe_dimension,
|
||||
uncompressed_ciphertext_modulus,
|
||||
grouping_factor,
|
||||
} = self;
|
||||
|
||||
let number_mask_bits_to_pack = lwe_dim * self.packed_mask.log_modulus.0;
|
||||
let lwe_dim = lwe_dimension.0;
|
||||
|
||||
let len = number_mask_bits_to_pack.div_ceil(Scalar::BITS);
|
||||
|
||||
self.body >> self.packed_mask.log_modulus.0 == 0
|
||||
&& self.packed_mask.packed_coeffs.len() == len
|
||||
&& self.lwe_dimension == lwe_ct_parameters.lwe_dim
|
||||
body >> packed_mask.log_modulus.0 == 0
|
||||
&& packed_mask.is_conformant(&lwe_dim)
|
||||
&& packed_diffs
|
||||
.as_ref()
|
||||
.map_or(true, |packed_diffs| packed_diffs.is_conformant(&lwe_dim))
|
||||
&& *lwe_dimension == lwe_ct_parameters.lwe_dim
|
||||
&& lwe_ct_parameters.ct_modulus.is_power_of_two()
|
||||
&& match lwe_ct_parameters.ms_decompression_method {
|
||||
MsDecompressionType::ClassicPbs => false,
|
||||
MsDecompressionType::MultiBitPbs(expected_gouping_factor) => {
|
||||
expected_gouping_factor.0 == self.grouping_factor.0
|
||||
expected_gouping_factor.0 == grouping_factor.0
|
||||
}
|
||||
}
|
||||
&& *uncompressed_ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
@@ -643,9 +643,19 @@ where
|
||||
&self,
|
||||
glwe_ct_parameters: &GlweCiphertextConformanceParameters<C::Element>,
|
||||
) -> bool {
|
||||
let Self {
|
||||
data,
|
||||
polynomial_size,
|
||||
ciphertext_modulus,
|
||||
} = self;
|
||||
|
||||
check_encrypted_content_respects_mod(self, glwe_ct_parameters.ct_modulus)
|
||||
&& self.glwe_size() == glwe_ct_parameters.glwe_dim.to_glwe_size()
|
||||
&& self.polynomial_size() == glwe_ct_parameters.polynomial_size
|
||||
&& self.ciphertext_modulus() == glwe_ct_parameters.ct_modulus
|
||||
&& data.container_len()
|
||||
== glwe_ciphertext_size(
|
||||
glwe_ct_parameters.glwe_dim.to_glwe_size(),
|
||||
glwe_ct_parameters.polynomial_size,
|
||||
)
|
||||
&& *polynomial_size == glwe_ct_parameters.polynomial_size
|
||||
&& *ciphertext_modulus == glwe_ct_parameters.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
@@ -761,9 +761,14 @@ where
|
||||
type ParameterSet = LweCiphertextParameters<C::Element>;
|
||||
|
||||
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<C::Element>) -> bool {
|
||||
check_encrypted_content_respects_mod(self, lwe_ct_parameters.ct_modulus)
|
||||
let Self {
|
||||
data,
|
||||
ciphertext_modulus,
|
||||
} = self;
|
||||
|
||||
check_encrypted_content_respects_mod(data, lwe_ct_parameters.ct_modulus)
|
||||
&& self.lwe_size() == lwe_ct_parameters.lwe_dim.to_lwe_size()
|
||||
&& self.ciphertext_modulus() == lwe_ct_parameters.ct_modulus
|
||||
&& *ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -343,17 +343,24 @@ impl<T: UnsignedInteger> ParameterSetConformant for LweCompactCiphertextListOwne
|
||||
type ParameterSet = LweCiphertextListParameters<T>;
|
||||
|
||||
fn is_conformant(&self, param: &LweCiphertextListParameters<T>) -> bool {
|
||||
let Self {
|
||||
data,
|
||||
lwe_size,
|
||||
lwe_ciphertext_count,
|
||||
ciphertext_modulus,
|
||||
} = self;
|
||||
|
||||
param
|
||||
.lwe_ciphertext_count_constraint
|
||||
.is_valid(self.lwe_ciphertext_count.0)
|
||||
&& self.data.len()
|
||||
.is_valid(lwe_ciphertext_count.0)
|
||||
&& data.len()
|
||||
== lwe_compact_ciphertext_list_size(
|
||||
self.lwe_size.to_lwe_dimension(),
|
||||
self.lwe_ciphertext_count,
|
||||
lwe_size.to_lwe_dimension(),
|
||||
*lwe_ciphertext_count,
|
||||
)
|
||||
&& check_encrypted_content_respects_mod(self, param.ct_modulus)
|
||||
&& self.lwe_size == param.lwe_dim.to_lwe_size()
|
||||
&& self.ciphertext_modulus == param.ct_modulus
|
||||
&& *lwe_size == param.lwe_dim.to_lwe_size()
|
||||
&& *ciphertext_modulus == param.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use tfhe_versionable::Versionize;
|
||||
|
||||
use crate::conformance::ParameterSetConformant;
|
||||
use crate::core_crypto::backward_compatibility::entities::packed_integers::PackedIntegersVersions;
|
||||
use crate::core_crypto::prelude::*;
|
||||
|
||||
@@ -166,3 +167,21 @@ impl<Scalar: UnsignedInteger> PackedIntegers<Scalar> {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl<Scalar: UnsignedInteger> ParameterSetConformant for PackedIntegers<Scalar> {
|
||||
type ParameterSet = usize;
|
||||
|
||||
fn is_conformant(&self, len: &usize) -> bool {
|
||||
let Self {
|
||||
packed_coeffs,
|
||||
log_modulus,
|
||||
initial_len,
|
||||
} = self;
|
||||
|
||||
let number_packed_bits = *len * log_modulus.0;
|
||||
|
||||
let packed_len = number_packed_bits.div_ceil(Scalar::BITS);
|
||||
|
||||
*len == *initial_len && packed_coeffs.len() == packed_len
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,11 +25,18 @@ impl<T: UnsignedInteger> ParameterSetConformant for SeededLweCiphertext<T> {
|
||||
type ParameterSet = LweCiphertextParameters<T>;
|
||||
|
||||
fn is_conformant(&self, lwe_ct_parameters: &LweCiphertextParameters<T>) -> bool {
|
||||
let Self {
|
||||
data,
|
||||
lwe_size,
|
||||
compression_seed: _,
|
||||
ciphertext_modulus,
|
||||
} = self;
|
||||
|
||||
check_encrypted_content_respects_mod::<T, &[T]>(
|
||||
&std::slice::from_ref(self.get_body().data),
|
||||
&std::slice::from_ref(data),
|
||||
lwe_ct_parameters.ct_modulus,
|
||||
) && self.lwe_size == lwe_ct_parameters.lwe_dim.to_lwe_size()
|
||||
&& self.ciphertext_modulus() == lwe_ct_parameters.ct_modulus
|
||||
) && *lwe_size == lwe_ct_parameters.lwe_dim.to_lwe_size()
|
||||
&& *ciphertext_modulus == lwe_ct_parameters.ct_modulus
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -193,12 +193,11 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
|
||||
ggsw.decomposition_base_log(),
|
||||
ggsw.decomposition_level_count(),
|
||||
);
|
||||
let (mut output_fft_buffer, mut substack0) =
|
||||
let (output_fft_buffer, mut substack0) =
|
||||
stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size_out().0, align);
|
||||
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
|
||||
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
|
||||
// it has been fully initialized for the first time.
|
||||
let output_fft_buffer = &mut *output_fft_buffer;
|
||||
let mut is_output_uninit = true;
|
||||
|
||||
{
|
||||
@@ -244,14 +243,14 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
|
||||
glwe_decomp_term.get_mask().as_polynomial_list().iter()
|
||||
)
|
||||
.for_each(|(ggsw_row, glwe_poly)| {
|
||||
let (mut fourier, substack3) = substack2
|
||||
let (fourier, substack3) = substack2
|
||||
.rb_mut()
|
||||
.make_aligned_raw::<c64>(fourier_poly_size, align);
|
||||
|
||||
// We perform the forward fft transform for the glwe polynomial
|
||||
let fourier = fft
|
||||
.forward_as_integer(
|
||||
FourierPolynomialMutView { data: &mut fourier },
|
||||
FourierPolynomialMutView { data: fourier },
|
||||
glwe_poly,
|
||||
substack3,
|
||||
)
|
||||
|
||||
@@ -289,10 +289,10 @@ where
|
||||
if *lwe_mask_element != Scalar::ZERO {
|
||||
let stack = stack.rb_mut();
|
||||
// We copy ct_0 to ct_1
|
||||
let (mut ct1, stack) =
|
||||
let (ct1, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, ct0.as_ref().iter().copied());
|
||||
let mut ct1 = GlweCiphertextMutView::from_container(
|
||||
&mut *ct1,
|
||||
ct1,
|
||||
ct0.polynomial_size(),
|
||||
ct0.ciphertext_modulus(),
|
||||
);
|
||||
@@ -361,10 +361,10 @@ where
|
||||
return this.bootstrap_u128(&mut lwe_out, &lwe_in, &accumulator, fft, stack);
|
||||
}
|
||||
|
||||
let (mut local_accumulator_data, stack) =
|
||||
let (local_accumulator_data, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
|
||||
let mut local_accumulator = GlweCiphertextMutView::from_container(
|
||||
&mut *local_accumulator_data,
|
||||
local_accumulator_data,
|
||||
accumulator.polynomial_size(),
|
||||
accumulator.ciphertext_modulus(),
|
||||
);
|
||||
|
||||
@@ -397,13 +397,13 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
|
||||
ggsw.decomposition_level_count(),
|
||||
);
|
||||
|
||||
let (mut output_fft_buffer_re0, stack) =
|
||||
let (output_fft_buffer_re0, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_re1, stack) =
|
||||
let (output_fft_buffer_re1, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_im0, stack) =
|
||||
let (output_fft_buffer_im0, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_im1, mut substack0) =
|
||||
let (output_fft_buffer_im1, mut substack0) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
|
||||
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
|
||||
@@ -455,30 +455,30 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
|
||||
) {
|
||||
let len = fourier_poly_size;
|
||||
let stack = substack2.rb_mut();
|
||||
let (mut fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
|
||||
// We perform the forward fft transform for the glwe polynomial
|
||||
fft.forward_as_integer(
|
||||
&mut fourier_re0,
|
||||
&mut fourier_re1,
|
||||
&mut fourier_im0,
|
||||
&mut fourier_im1,
|
||||
fourier_re0,
|
||||
fourier_re1,
|
||||
fourier_im0,
|
||||
fourier_im1,
|
||||
glwe_poly.as_ref(),
|
||||
);
|
||||
// Now we loop through the polynomials of the output, and add the
|
||||
// corresponding product of polynomials.
|
||||
update_with_fmadd(
|
||||
&mut output_fft_buffer_re0,
|
||||
&mut output_fft_buffer_re1,
|
||||
&mut output_fft_buffer_im0,
|
||||
&mut output_fft_buffer_im1,
|
||||
output_fft_buffer_re0,
|
||||
output_fft_buffer_re1,
|
||||
output_fft_buffer_im0,
|
||||
output_fft_buffer_im1,
|
||||
ggsw_row,
|
||||
&fourier_re0,
|
||||
&fourier_re1,
|
||||
&fourier_im0,
|
||||
&fourier_im1,
|
||||
fourier_re0,
|
||||
fourier_re1,
|
||||
fourier_im0,
|
||||
fourier_im1,
|
||||
is_output_uninit,
|
||||
fourier_poly_size,
|
||||
);
|
||||
@@ -495,11 +495,6 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
|
||||
//
|
||||
// We iterate over the polynomials in the output.
|
||||
if !is_output_uninit {
|
||||
let output_fft_buffer_re0 = output_fft_buffer_re0;
|
||||
let output_fft_buffer_re1 = output_fft_buffer_re1;
|
||||
let output_fft_buffer_im0 = output_fft_buffer_im0;
|
||||
let output_fft_buffer_im1 = output_fft_buffer_im1;
|
||||
|
||||
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
|
||||
out.as_mut_polynomial_list().iter_mut(),
|
||||
output_fft_buffer_re0.into_chunks(fourier_poly_size),
|
||||
@@ -532,11 +527,7 @@ fn collect_next_term<'a, Scalar: UnsignedTorus>(
|
||||
decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
|
||||
substack1: &'a mut PodStack,
|
||||
align: usize,
|
||||
) -> (
|
||||
DecompositionLevel,
|
||||
dyn_stack::DynArray<'a, Scalar>,
|
||||
PodStack<'a>,
|
||||
) {
|
||||
) -> (DecompositionLevel, &'a mut [Scalar], PodStack<'a>) {
|
||||
let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
|
||||
let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
|
||||
(glwe_level, glwe_decomp_term, substack2)
|
||||
|
||||
@@ -495,27 +495,19 @@ impl<'a> Fft128View<'a> {
|
||||
debug_assert_eq!(n, 2 * fourier_im0.len());
|
||||
debug_assert_eq!(n, 2 * fourier_im1.len());
|
||||
|
||||
let (mut tmp_re0, stack) =
|
||||
let (tmp_re0, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re0.iter().copied());
|
||||
let (mut tmp_re1, stack) =
|
||||
let (tmp_re1, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re1.iter().copied());
|
||||
let (mut tmp_im0, stack) =
|
||||
let (tmp_im0, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im0.iter().copied());
|
||||
let (mut tmp_im1, _) =
|
||||
let (tmp_im1, _) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im1.iter().copied());
|
||||
|
||||
self.plan
|
||||
.inv(&mut tmp_re0, &mut tmp_re1, &mut tmp_im0, &mut tmp_im1);
|
||||
self.plan.inv(tmp_re0, tmp_re1, tmp_im0, tmp_im1);
|
||||
|
||||
let (standard_re, standard_im) = standard.split_at_mut(n / 2);
|
||||
conv_fn(
|
||||
standard_re,
|
||||
standard_im,
|
||||
&tmp_re0,
|
||||
&tmp_re1,
|
||||
&tmp_im0,
|
||||
&tmp_im1,
|
||||
);
|
||||
conv_fn(standard_re, standard_im, tmp_re0, tmp_re1, tmp_im0, tmp_im1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -105,9 +105,9 @@ where
|
||||
if *lwe_mask_element != 0 {
|
||||
let stack = stack.rb_mut();
|
||||
// We copy ct_0 to ct_1
|
||||
let (mut ct1_lo, stack) =
|
||||
let (ct1_lo, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, ct0_lo.as_ref().iter().copied());
|
||||
let (mut ct1_hi, stack) =
|
||||
let (ct1_hi, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, ct0_hi.as_ref().iter().copied());
|
||||
let mut ct1_lo = GlweCiphertextMutView::from_container(
|
||||
&mut *ct1_lo,
|
||||
@@ -177,9 +177,9 @@ where
|
||||
let align = CACHELINE_ALIGN;
|
||||
let ciphertext_modulus = accumulator.ciphertext_modulus();
|
||||
|
||||
let (mut local_accumulator_lo, stack) =
|
||||
let (local_accumulator_lo, stack) =
|
||||
stack.collect_aligned(align, accumulator.as_ref().iter().map(|i| *i as u64));
|
||||
let (mut local_accumulator_hi, mut stack) = stack.collect_aligned(
|
||||
let (local_accumulator_hi, mut stack) = stack.collect_aligned(
|
||||
align,
|
||||
accumulator.as_ref().iter().map(|i| (*i >> 64) as u64),
|
||||
);
|
||||
@@ -207,7 +207,7 @@ where
|
||||
fft,
|
||||
stack.rb_mut(),
|
||||
);
|
||||
let (mut local_accumulator, _) = stack.collect_aligned(
|
||||
let (local_accumulator, _) = stack.collect_aligned(
|
||||
align,
|
||||
izip!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
|
||||
.map(|(&lo, &hi)| lo as u128 | ((hi as u128) << 64)),
|
||||
|
||||
@@ -63,32 +63,28 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
ggsw.decomposition_level_count(),
|
||||
);
|
||||
|
||||
let (mut output_fft_buffer_re0, stack) =
|
||||
let (output_fft_buffer_re0, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_re1, stack) =
|
||||
let (output_fft_buffer_re1, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_im0, stack) =
|
||||
let (output_fft_buffer_im0, stack) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
let (mut output_fft_buffer_im1, mut substack0) =
|
||||
let (output_fft_buffer_im1, mut substack0) =
|
||||
stack.make_aligned_raw::<f64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
|
||||
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
|
||||
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
|
||||
// it has been fully initialized for the first time.
|
||||
let output_fft_buffer_re0 = &mut *output_fft_buffer_re0;
|
||||
let output_fft_buffer_re1 = &mut *output_fft_buffer_re1;
|
||||
let output_fft_buffer_im0 = &mut *output_fft_buffer_im0;
|
||||
let output_fft_buffer_im1 = &mut *output_fft_buffer_im1;
|
||||
let mut is_output_uninit = true;
|
||||
|
||||
{
|
||||
// ------------------------------------------------------ EXTERNAL PRODUCT IN FOURIER
|
||||
// DOMAIN In this section, we perform the external product in the fourier
|
||||
// domain, and accumulate the result in the output_fft_buffer variable.
|
||||
let (mut decomposition_states_lo, stack) = substack0
|
||||
let (decomposition_states_lo, stack) = substack0
|
||||
.rb_mut()
|
||||
.make_aligned_raw::<u64>(poly_size * glwe_size, align);
|
||||
let (mut decomposition_states_hi, mut substack1) =
|
||||
let (decomposition_states_hi, mut substack1) =
|
||||
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
|
||||
|
||||
let shift = 128 - decomposer.base_log * decomposer.level_count;
|
||||
@@ -104,6 +100,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
*out_lo = value as u64;
|
||||
*out_hi = (value >> 64) as u64;
|
||||
}
|
||||
// Reborrow to avoid mut slices to be moved
|
||||
let decomposition_states_lo = &mut *decomposition_states_lo;
|
||||
let decomposition_states_hi = &mut *decomposition_states_hi;
|
||||
let mut current_level = decomposer.level_count;
|
||||
@@ -118,17 +115,17 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
assert_ne!(current_level, 0);
|
||||
let glwe_level = DecompositionLevel(current_level);
|
||||
current_level -= 1;
|
||||
let (mut glwe_decomp_term_lo, stack) = substack1
|
||||
let (glwe_decomp_term_lo, stack) = substack1
|
||||
.rb_mut()
|
||||
.make_aligned_raw::<u64>(poly_size * glwe_size, align);
|
||||
let (mut glwe_decomp_term_hi, mut substack2) =
|
||||
let (glwe_decomp_term_hi, mut substack2) =
|
||||
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
|
||||
|
||||
let base_log = decomposer.base_log;
|
||||
|
||||
collect_next_term_split(
|
||||
&mut glwe_decomp_term_lo,
|
||||
&mut glwe_decomp_term_hi,
|
||||
glwe_decomp_term_lo,
|
||||
glwe_decomp_term_hi,
|
||||
decomposition_states_lo,
|
||||
decomposition_states_hi,
|
||||
mod_b_mask_lo,
|
||||
@@ -136,9 +133,6 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
base_log,
|
||||
);
|
||||
|
||||
let glwe_decomp_term_lo = &mut *glwe_decomp_term_lo;
|
||||
let glwe_decomp_term_hi = &mut *glwe_decomp_term_hi;
|
||||
|
||||
let glwe_decomp_term_lo = GlweCiphertextView::from_container(
|
||||
&*glwe_decomp_term_lo,
|
||||
ggsw.polynomial_size(),
|
||||
@@ -170,16 +164,16 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
) {
|
||||
let len = fourier_poly_size;
|
||||
let stack = substack2.rb_mut();
|
||||
let (mut fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (mut fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_re0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_re1, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_im0, stack) = stack.make_aligned_raw::<f64>(len, align);
|
||||
let (fourier_im1, _) = stack.make_aligned_raw::<f64>(len, align);
|
||||
// We perform the forward fft transform for the glwe polynomial
|
||||
fft.forward_as_integer_split(
|
||||
&mut fourier_re0,
|
||||
&mut fourier_re1,
|
||||
&mut fourier_im0,
|
||||
&mut fourier_im1,
|
||||
fourier_re0,
|
||||
fourier_re1,
|
||||
fourier_im0,
|
||||
fourier_im1,
|
||||
glwe_poly_lo.as_ref(),
|
||||
glwe_poly_hi.as_ref(),
|
||||
);
|
||||
@@ -192,10 +186,10 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
|
||||
output_fft_buffer_im0,
|
||||
output_fft_buffer_im1,
|
||||
ggsw_row,
|
||||
&fourier_re0,
|
||||
&fourier_re1,
|
||||
&fourier_im0,
|
||||
&fourier_im1,
|
||||
fourier_re0,
|
||||
fourier_re1,
|
||||
fourier_im0,
|
||||
fourier_im1,
|
||||
is_output_uninit,
|
||||
fourier_poly_size,
|
||||
);
|
||||
|
||||
@@ -204,7 +204,7 @@ fn test_split_pbs() {
|
||||
fft: Fft128View<'_>,
|
||||
stack: PodStack<'_>,
|
||||
) {
|
||||
let (mut local_accumulator_data, stack) =
|
||||
let (local_accumulator_data, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
|
||||
let mut local_accumulator = GlweCiphertextMutView::from_container(
|
||||
&mut *local_accumulator_data,
|
||||
|
||||
@@ -1316,17 +1316,16 @@ impl<'a> Fft128View<'a> {
|
||||
debug_assert_eq!(n, 2 * fourier_im0.len());
|
||||
debug_assert_eq!(n, 2 * fourier_im1.len());
|
||||
|
||||
let (mut tmp_re0, stack) =
|
||||
let (tmp_re0, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re0.iter().copied());
|
||||
let (mut tmp_re1, stack) =
|
||||
let (tmp_re1, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_re1.iter().copied());
|
||||
let (mut tmp_im0, stack) =
|
||||
let (tmp_im0, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im0.iter().copied());
|
||||
let (mut tmp_im1, _) =
|
||||
let (tmp_im1, _) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier_im1.iter().copied());
|
||||
|
||||
self.plan
|
||||
.inv(&mut tmp_re0, &mut tmp_re1, &mut tmp_im0, &mut tmp_im1);
|
||||
self.plan.inv(tmp_re0, tmp_re1, tmp_im0, tmp_im1);
|
||||
|
||||
let (standard_re_lo, standard_im_lo) = standard_lo.split_at_mut(n / 2);
|
||||
let (standard_re_hi, standard_im_hi) = standard_hi.split_at_mut(n / 2);
|
||||
@@ -1335,10 +1334,10 @@ impl<'a> Fft128View<'a> {
|
||||
standard_re_hi,
|
||||
standard_im_lo,
|
||||
standard_im_hi,
|
||||
&tmp_re0,
|
||||
&tmp_re1,
|
||||
&tmp_im0,
|
||||
&tmp_im1,
|
||||
tmp_re0,
|
||||
tmp_re1,
|
||||
tmp_im0,
|
||||
tmp_im1,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -353,7 +353,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
|
||||
lut.as_mut_polynomial_list()
|
||||
.iter_mut()
|
||||
.for_each(|mut poly| {
|
||||
let (mut tmp_poly, _) = stack
|
||||
let (tmp_poly, _) = stack
|
||||
.rb_mut()
|
||||
.make_aligned_raw(poly.as_ref().len(), CACHELINE_ALIGN);
|
||||
|
||||
@@ -364,7 +364,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
|
||||
|
||||
// We initialize the ct_0 used for the successive cmuxes
|
||||
let mut ct0 = lut;
|
||||
let (mut ct1, mut stack) = stack.make_aligned_raw(ct0.as_ref().len(), CACHELINE_ALIGN);
|
||||
let (ct1, mut stack) = stack.make_aligned_raw(ct0.as_ref().len(), CACHELINE_ALIGN);
|
||||
let mut ct1 =
|
||||
GlweCiphertextMutView::from_container(&mut *ct1, lut_poly_size, ciphertext_modulus);
|
||||
|
||||
@@ -437,7 +437,7 @@ impl<'a> FourierLweBootstrapKeyView<'a> {
|
||||
accumulator.ciphertext_modulus()
|
||||
);
|
||||
|
||||
let (mut local_accumulator_data, stack) =
|
||||
let (local_accumulator_data, stack) =
|
||||
stack.collect_aligned(CACHELINE_ALIGN, accumulator.as_ref().iter().copied());
|
||||
let mut local_accumulator = GlweCiphertextMutView::from_container(
|
||||
&mut *local_accumulator_data,
|
||||
|
||||
@@ -588,7 +588,7 @@ pub fn add_external_product_assign<Scalar>(
|
||||
ggsw.decomposition_level_count(),
|
||||
);
|
||||
|
||||
let (mut output_fft_buffer, mut substack0) =
|
||||
let (output_fft_buffer, mut substack0) =
|
||||
stack.make_aligned_raw::<c64>(fourier_poly_size * ggsw.glwe_size().0, align);
|
||||
// output_fft_buffer is initially uninitialized, considered to be implicitly zero, to avoid
|
||||
// the cost of filling it up with zeros. `is_output_uninit` is set to `false` once
|
||||
@@ -638,13 +638,13 @@ pub fn add_external_product_assign<Scalar>(
|
||||
glwe_decomp_term.as_polynomial_list().iter()
|
||||
)
|
||||
.for_each(|(ggsw_row, glwe_poly)| {
|
||||
let (mut fourier, substack3) = substack2
|
||||
let (fourier, substack3) = substack2
|
||||
.rb_mut()
|
||||
.make_aligned_raw::<c64>(fourier_poly_size, align);
|
||||
// We perform the forward fft transform for the glwe polynomial
|
||||
let fourier = fft
|
||||
.forward_as_integer(
|
||||
FourierPolynomialMutView { data: &mut fourier },
|
||||
FourierPolynomialMutView { data: fourier },
|
||||
glwe_poly,
|
||||
substack3,
|
||||
)
|
||||
@@ -691,11 +691,7 @@ pub(crate) fn collect_next_term<'a, Scalar: UnsignedTorus>(
|
||||
decomposition: &mut TensorSignedDecompositionLendingIter<'_, Scalar>,
|
||||
substack1: &'a mut PodStack,
|
||||
align: usize,
|
||||
) -> (
|
||||
DecompositionLevel,
|
||||
dyn_stack::DynArray<'a, Scalar>,
|
||||
PodStack<'a>,
|
||||
) {
|
||||
) -> (DecompositionLevel, &'a mut [Scalar], PodStack<'a>) {
|
||||
let (glwe_level, _, glwe_decomp_term) = decomposition.next_term().unwrap();
|
||||
let (glwe_decomp_term, substack2) = substack1.rb_mut().collect_aligned(align, glwe_decomp_term);
|
||||
(glwe_level, glwe_decomp_term, substack2)
|
||||
|
||||
@@ -123,17 +123,16 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
|
||||
let align = CACHELINE_ALIGN;
|
||||
|
||||
let (mut lwe_in_buffer_data, stack) =
|
||||
stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
|
||||
let (lwe_in_buffer_data, stack) = stack.collect_aligned(align, lwe_in.as_ref().iter().copied());
|
||||
let mut lwe_in_buffer =
|
||||
LweCiphertext::from_container(&mut *lwe_in_buffer_data, lwe_in.ciphertext_modulus());
|
||||
|
||||
let (mut lwe_out_ks_buffer_data, stack) =
|
||||
let (lwe_out_ks_buffer_data, stack) =
|
||||
stack.make_aligned_with(ksk.output_lwe_size().0, align, |_| Scalar::ZERO);
|
||||
let mut lwe_out_ks_buffer =
|
||||
LweCiphertext::from_container(&mut *lwe_out_ks_buffer_data, ksk.ciphertext_modulus());
|
||||
|
||||
let (mut pbs_accumulator_data, stack) =
|
||||
let (pbs_accumulator_data, stack) =
|
||||
stack.make_aligned_with(glwe_size.0 * polynomial_size.0, align, |_| Scalar::ZERO);
|
||||
let mut pbs_accumulator = GlweCiphertextMutView::from_container(
|
||||
&mut *pbs_accumulator_data,
|
||||
@@ -144,7 +143,7 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
let lwe_size = glwe_dimension
|
||||
.to_equivalent_lwe_dimension(polynomial_size)
|
||||
.to_lwe_size();
|
||||
let (mut lwe_out_pbs_buffer_data, mut stack) =
|
||||
let (lwe_out_pbs_buffer_data, mut stack) =
|
||||
stack.make_aligned_with(lwe_size.0, align, |_| Scalar::ZERO);
|
||||
let mut lwe_out_pbs_buffer = LweCiphertext::from_container(
|
||||
&mut *lwe_out_pbs_buffer_data,
|
||||
@@ -153,26 +152,27 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
|
||||
// We iterate on the list in reverse as we want to store the extracted MSB at index 0
|
||||
for (bit_idx, mut output_ct) in lwe_list_out.iter_mut().rev().enumerate() {
|
||||
// Shift on padding bit
|
||||
let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
|
||||
align,
|
||||
lwe_in_buffer
|
||||
.as_ref()
|
||||
.iter()
|
||||
.map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
|
||||
);
|
||||
// Block to keep the lwe_bit_left_shift_buffer_data alive only as long as needed
|
||||
{
|
||||
// Shift on padding bit
|
||||
let (lwe_bit_left_shift_buffer_data, _) = stack.rb_mut().collect_aligned(
|
||||
align,
|
||||
lwe_in_buffer
|
||||
.as_ref()
|
||||
.iter()
|
||||
.map(|s| *s << (ciphertext_n_bits - delta_log.0 - bit_idx - 1)),
|
||||
);
|
||||
|
||||
// Key switch to input PBS key
|
||||
keyswitch_lwe_ciphertext(
|
||||
&ksk,
|
||||
&LweCiphertext::from_container(
|
||||
&*lwe_bit_left_shift_buffer_data,
|
||||
lwe_in.ciphertext_modulus(),
|
||||
),
|
||||
&mut lwe_out_ks_buffer,
|
||||
);
|
||||
|
||||
drop(lwe_bit_left_shift_buffer_data);
|
||||
// Key switch to input PBS key
|
||||
keyswitch_lwe_ciphertext(
|
||||
&ksk,
|
||||
&LweCiphertext::from_container(
|
||||
lwe_bit_left_shift_buffer_data,
|
||||
lwe_in.ciphertext_modulus(),
|
||||
),
|
||||
&mut lwe_out_ks_buffer,
|
||||
);
|
||||
}
|
||||
|
||||
// Store the keyswitch output unmodified to the output list (as we need to to do other
|
||||
// computations on the output of the keyswitch)
|
||||
@@ -306,7 +306,7 @@ pub fn circuit_bootstrap_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
);
|
||||
|
||||
// Output for every bootstrapping
|
||||
let (mut lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
|
||||
let (lwe_out_bs_buffer_data, mut stack) = stack.make_aligned_with(
|
||||
fourier_bsk_output_lwe_dimension.to_lwe_size().0,
|
||||
CACHELINE_ALIGN,
|
||||
|_| Scalar::ZERO,
|
||||
@@ -384,7 +384,7 @@ pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
let polynomial_size = fourier_bsk.polynomial_size();
|
||||
let ciphertext_moudulus = lwe_out.ciphertext_modulus();
|
||||
|
||||
let (mut lwe_left_shift_buffer_data, stack) =
|
||||
let (lwe_left_shift_buffer_data, stack) =
|
||||
stack.make_aligned_with(lwe_in_size.0, CACHELINE_ALIGN, |_| Scalar::ZERO);
|
||||
let mut lwe_left_shift_buffer = LweCiphertext::from_container(
|
||||
&mut *lwe_left_shift_buffer_data,
|
||||
@@ -403,7 +403,7 @@ pub fn homomorphic_shift_boolean<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
*shift_buffer_body.data =
|
||||
(*shift_buffer_body.data).wrapping_add(Scalar::ONE << (ciphertext_n_bits - 2));
|
||||
|
||||
let (mut pbs_accumulator_data, stack) = stack.make_aligned_with(
|
||||
let (pbs_accumulator_data, stack) = stack.make_aligned_with(
|
||||
polynomial_size.0 * fourier_bsk.glwe_size().0,
|
||||
CACHELINE_ALIGN,
|
||||
|_| Scalar::ZERO,
|
||||
@@ -486,31 +486,31 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
// At index 0 you have the lut that will be loaded, and then the result for each layer gets
|
||||
// computed at the next index, last layer result gets stored in `result`.
|
||||
// This allow to use memory space in C * nb_layer instead of C' * 2 ^ nb_layer
|
||||
let (mut t_0_data, stack) = stack.make_aligned_with(
|
||||
let (t_0_data, stack) = stack.make_aligned_with(
|
||||
polynomial_size.0 * glwe_size.0 * nb_layer,
|
||||
CACHELINE_ALIGN,
|
||||
|_| Scalar::ZERO,
|
||||
);
|
||||
let (mut t_1_data, stack) = stack.make_aligned_with(
|
||||
let (t_1_data, stack) = stack.make_aligned_with(
|
||||
polynomial_size.0 * glwe_size.0 * nb_layer,
|
||||
CACHELINE_ALIGN,
|
||||
|_| Scalar::ZERO,
|
||||
);
|
||||
|
||||
let mut t_0 = GlweCiphertextList::from_container(
|
||||
t_0_data.as_mut(),
|
||||
t_0_data,
|
||||
glwe_size,
|
||||
polynomial_size,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
let mut t_1 = GlweCiphertextList::from_container(
|
||||
t_1_data.as_mut(),
|
||||
t_1_data,
|
||||
glwe_size,
|
||||
polynomial_size,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
let (mut t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize);
|
||||
let (t_fill, mut stack) = stack.make_with(nb_layer, |_| 0_usize);
|
||||
|
||||
let mut lut_polynomial_iter = lut_per_layer.iter();
|
||||
loop {
|
||||
@@ -565,8 +565,6 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
t_fill[j + 1] += 1;
|
||||
t_fill[j] = 0;
|
||||
|
||||
drop(diff_data);
|
||||
|
||||
(j_counter, t0_j, t1_j) = (j_counter_plus_1, t_0_j_plus_1, t_1_j_plus_1);
|
||||
} else {
|
||||
assert_eq!(j, nb_layer - 1);
|
||||
@@ -680,7 +678,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
|
||||
);
|
||||
|
||||
let glwe_size = pfpksk_list.output_key_glwe_dimension().to_glwe_size();
|
||||
let (mut ggsw_list_data, stack) = stack.make_aligned_with(
|
||||
let (ggsw_list_data, stack) = stack.make_aligned_with(
|
||||
lwe_list_in.lwe_ciphertext_count().0 * pfpksk_list.output_polynomial_size().0 / 2
|
||||
* glwe_size.0
|
||||
* glwe_size.0
|
||||
@@ -688,14 +686,14 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
|
||||
CACHELINE_ALIGN,
|
||||
|_| c64::default(),
|
||||
);
|
||||
let (mut ggsw_res_data, mut stack) = stack.make_aligned_with(
|
||||
let (ggsw_res_data, mut stack) = stack.make_aligned_with(
|
||||
pfpksk_list.output_polynomial_size().0 * glwe_size.0 * glwe_size.0 * level_cbs.0,
|
||||
CACHELINE_ALIGN,
|
||||
|_| Scalar::ZERO,
|
||||
);
|
||||
|
||||
let mut ggsw_list = FourierGgswCiphertextListMutView::new(
|
||||
&mut ggsw_list_data,
|
||||
ggsw_list_data,
|
||||
lwe_list_in.lwe_ciphertext_count().0,
|
||||
glwe_size,
|
||||
pfpksk_list.output_polynomial_size(),
|
||||
@@ -704,7 +702,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
|
||||
);
|
||||
|
||||
let mut ggsw_res = GgswCiphertext::from_container(
|
||||
&mut *ggsw_res_data,
|
||||
ggsw_res_data,
|
||||
glwe_size,
|
||||
pfpksk_list.output_polynomial_size(),
|
||||
base_log_cbs,
|
||||
@@ -817,15 +815,12 @@ pub fn vertical_packing<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
// the last blind rotation.
|
||||
let (cmux_ggsw, br_ggsw) = ggsw_list.split_at(log_number_of_luts_for_cmux_tree);
|
||||
|
||||
let (mut cmux_tree_lut_res_data, mut stack) =
|
||||
let (cmux_tree_lut_res_data, mut stack) =
|
||||
stack.make_aligned_with(polynomial_size.0 * glwe_size.0, CACHELINE_ALIGN, |_| {
|
||||
Scalar::ZERO
|
||||
});
|
||||
let mut cmux_tree_lut_res = GlweCiphertext::from_container(
|
||||
&mut *cmux_tree_lut_res_data,
|
||||
polynomial_size,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
let mut cmux_tree_lut_res =
|
||||
GlweCiphertext::from_container(cmux_tree_lut_res_data, polynomial_size, ciphertext_modulus);
|
||||
|
||||
cmux_tree_memory_optimized(
|
||||
cmux_tree_lut_res.as_mut_view(),
|
||||
@@ -866,7 +861,7 @@ pub fn blind_rotate_assign<Scalar: UnsignedTorus + CastInto<usize>>(
|
||||
|
||||
for ggsw in ggsw_list.into_ggsw_iter().rev() {
|
||||
let ct_0 = lut.as_mut_view();
|
||||
let (mut ct1_data, stack) = stack
|
||||
let (ct1_data, stack) = stack
|
||||
.rb_mut()
|
||||
.collect_aligned(CACHELINE_ALIGN, ct_0.as_ref().iter().copied());
|
||||
let mut ct_1 = GlweCiphertext::from_container(
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::core_crypto::commons::math::decomposition::decompose_one_level;
|
||||
pub use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
|
||||
use crate::core_crypto::commons::numeric::UnsignedInteger;
|
||||
use crate::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount};
|
||||
use dyn_stack::{DynArray, PodStack};
|
||||
use dyn_stack::PodStack;
|
||||
use std::iter::Map;
|
||||
use std::slice::IterMut;
|
||||
|
||||
@@ -18,7 +18,7 @@ pub struct TensorSignedDecompositionLendingIter<'buffers, Scalar: UnsignedIntege
|
||||
// ...0001111
|
||||
mod_b_mask: Scalar,
|
||||
// The internal states of each decomposition
|
||||
states: DynArray<'buffers, Scalar>,
|
||||
states: &'buffers mut [Scalar],
|
||||
// A flag which stores whether the iterator is a fresh one (for the recompose method).
|
||||
fresh: bool,
|
||||
}
|
||||
|
||||
@@ -532,12 +532,12 @@ impl<'a> FftView<'a> {
|
||||
let standard = standard.as_mut();
|
||||
let n = standard.len();
|
||||
debug_assert_eq!(n, 2 * fourier.len());
|
||||
let (mut tmp, stack) =
|
||||
let (tmp, stack) =
|
||||
stack.collect_aligned(aligned_vec::CACHELINE_ALIGN, fourier.iter().copied());
|
||||
self.plan.inv(&mut tmp, stack);
|
||||
self.plan.inv(tmp, stack);
|
||||
|
||||
let (standard_re, standard_im) = standard.split_at_mut(n / 2);
|
||||
conv_fn(standard_re, standard_im, &tmp, self.twisties);
|
||||
conv_fn(standard_re, standard_im, tmp, self.twisties);
|
||||
}
|
||||
|
||||
fn backward_with_conv_in_place<
|
||||
|
||||
@@ -89,52 +89,10 @@ pub fn mm256_cvtpd_epi64(simd: V3, x: __m256d) -> __m256i {
|
||||
#[cfg(feature = "nightly-avx512")]
|
||||
#[inline(always)]
|
||||
pub fn mm512_cvtt_roundpd_epi64(simd: V4, x: __m512d) -> __m512i {
|
||||
// This first one is required for the zmm_reg notation
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse")]
|
||||
#[target_feature(enable = "sse2")]
|
||||
#[target_feature(enable = "fxsr")]
|
||||
#[target_feature(enable = "sse3")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
#[target_feature(enable = "sse4.1")]
|
||||
#[target_feature(enable = "sse4.2")]
|
||||
#[target_feature(enable = "popcnt")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[target_feature(enable = "avx2")]
|
||||
#[target_feature(enable = "bmi1")]
|
||||
#[target_feature(enable = "bmi2")]
|
||||
#[target_feature(enable = "fma")]
|
||||
#[target_feature(enable = "lzcnt")]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
unsafe fn implementation(x: __m512d) -> __m512i {
|
||||
let mut as_i64x8: __m512i;
|
||||
|
||||
// From Intel's documentation the syntax to use this intrinsics is
|
||||
// Instruction: vcvttpd2qq zmm, zmm
|
||||
// With Intel syntax, left operand is the destination, right operand is the source
|
||||
// For the asm! macro
|
||||
// in: indicates an input register
|
||||
// out: indicates an output register
|
||||
// zmm_reg: the avx512 register type
|
||||
// options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
|
||||
// pure: no side effect
|
||||
// nomem: does not reference RAM (only registers)
|
||||
// nostrack: does not alter the state of the stack
|
||||
core::arch::asm!(
|
||||
"vcvttpd2qq {dst}, {src}",
|
||||
src = in(zmm_reg) x,
|
||||
dst = out(zmm_reg) as_i64x8,
|
||||
options(pure, nomem, nostack)
|
||||
);
|
||||
|
||||
as_i64x8
|
||||
}
|
||||
let _ = simd.avx512dq;
|
||||
|
||||
// SAFETY: simd contains an instance of avx512dq, that matches the target feature of
|
||||
// `implementation`
|
||||
unsafe { implementation(x) }
|
||||
_ = simd;
|
||||
unsafe { _mm512_cvttpd_epi64(x) }
|
||||
}
|
||||
|
||||
/// Convert a vector of i64 values to a vector of f64 values. Not sure how it works.
|
||||
@@ -174,52 +132,10 @@ pub fn mm256_cvtepi64_pd(simd: V3, x: __m256i) -> __m256d {
|
||||
#[cfg(feature = "nightly-avx512")]
|
||||
#[inline(always)]
|
||||
pub fn mm512_cvtepi64_pd(simd: V4, x: __m512i) -> __m512d {
|
||||
// This first one is required for the zmm_reg notation
|
||||
#[inline]
|
||||
#[target_feature(enable = "sse")]
|
||||
#[target_feature(enable = "sse2")]
|
||||
#[target_feature(enable = "fxsr")]
|
||||
#[target_feature(enable = "sse3")]
|
||||
#[target_feature(enable = "ssse3")]
|
||||
#[target_feature(enable = "sse4.1")]
|
||||
#[target_feature(enable = "sse4.2")]
|
||||
#[target_feature(enable = "popcnt")]
|
||||
#[target_feature(enable = "avx")]
|
||||
#[target_feature(enable = "avx2")]
|
||||
#[target_feature(enable = "bmi1")]
|
||||
#[target_feature(enable = "bmi2")]
|
||||
#[target_feature(enable = "fma")]
|
||||
#[target_feature(enable = "lzcnt")]
|
||||
#[target_feature(enable = "avx512f")]
|
||||
#[target_feature(enable = "avx512dq")]
|
||||
unsafe fn implementation(x: __m512i) -> __m512d {
|
||||
let mut as_f64x8: __m512d;
|
||||
|
||||
// From Intel's documentation the syntax to use this intrinsics is
|
||||
// Instruction: vcvtqq2pd zmm, zmm
|
||||
// With Intel syntax, left operand is the destination, right operand is the source
|
||||
// For the asm! macro
|
||||
// in: indicates an input register
|
||||
// out: indicates an output register
|
||||
// zmm_reg: the avx512 register type
|
||||
// options: see https://doc.rust-lang.org/nightly/reference/inline-assembly.html#options
|
||||
// pure: no side effect
|
||||
// nomem: does not reference RAM (only registers)
|
||||
// nostrack: does not alter the state of the stack
|
||||
core::arch::asm!(
|
||||
"vcvtqq2pd {dst}, {src}",
|
||||
src = in(zmm_reg) x,
|
||||
dst = out(zmm_reg) as_f64x8,
|
||||
options(pure, nomem, nostack)
|
||||
);
|
||||
|
||||
as_f64x8
|
||||
}
|
||||
let _ = simd.avx512dq;
|
||||
|
||||
// SAFETY: simd contains an instance of avx512dq, that matches the target feature of
|
||||
// `implementation`
|
||||
unsafe { implementation(x) }
|
||||
_ = simd;
|
||||
unsafe { _mm512_cvtepi64_pd(x) }
|
||||
}
|
||||
|
||||
#[cfg(feature = "nightly-avx512")]
|
||||
|
||||
@@ -5,16 +5,16 @@ use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_asyn
|
||||
use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
|
||||
use itertools::Itertools;
|
||||
|
||||
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
|
||||
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
|
||||
/// GPU-accelerated.
|
||||
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
/// # Safety
|
||||
///
|
||||
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
|
||||
/// be dropped until stream is synchronised
|
||||
pub unsafe fn cuda_extract_lwe_samples_from_glwe_ciphertext_list_async<Scalar>(
|
||||
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
|
||||
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
|
||||
vec_nth: &[MonomialDegree],
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus,
|
||||
{
|
||||
let in_lwe_dim = input_glwe_list
|
||||
@@ -58,3 +58,25 @@ pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
|
||||
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
|
||||
/// GPU-accelerated.
|
||||
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
|
||||
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
|
||||
vec_nth: &[MonomialDegree],
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
Scalar: UnsignedTorus,
|
||||
{
|
||||
unsafe {
|
||||
cuda_extract_lwe_samples_from_glwe_ciphertext_list_async(
|
||||
input_glwe_list,
|
||||
output_lwe_list,
|
||||
vec_nth,
|
||||
streams,
|
||||
);
|
||||
}
|
||||
streams.synchronize();
|
||||
}
|
||||
|
||||
36
tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs
Normal file
36
tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs
Normal file
@@ -0,0 +1,36 @@
|
||||
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
|
||||
use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams};
|
||||
use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
|
||||
/// be dropped until stream is synchronised
|
||||
pub unsafe fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async<Scalar>(
|
||||
lwe_pksk: &CudaLwePackingKeyswitchKey<Scalar>,
|
||||
input_lwe_ciphertext_list: &CudaLweCiphertextList<Scalar>,
|
||||
output_glwe_ciphertext: &mut CudaGlweCiphertextList<Scalar>,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus + CastInto<usize>,
|
||||
{
|
||||
let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension();
|
||||
let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension();
|
||||
let output_polynomial_size = output_glwe_ciphertext.polynomial_size();
|
||||
|
||||
packing_keyswitch_list_async(
|
||||
streams,
|
||||
&mut output_glwe_ciphertext.0.d_vec,
|
||||
&input_lwe_ciphertext_list.0.d_vec,
|
||||
input_lwe_dimension,
|
||||
output_glwe_dimension,
|
||||
output_polynomial_size,
|
||||
&lwe_pksk.d_vec,
|
||||
lwe_pksk.decomposition_base_log(),
|
||||
lwe_pksk.decomposition_level_count(),
|
||||
input_lwe_ciphertext_list.lwe_ciphertext_count(),
|
||||
);
|
||||
}
|
||||
@@ -1,13 +1,15 @@
|
||||
pub mod glwe_sample_extraction;
|
||||
pub mod lwe_keyswitch;
|
||||
pub mod lwe_linear_algebra;
|
||||
pub mod lwe_multi_bit_programmable_bootstrapping;
|
||||
pub mod lwe_packing_keyswitch;
|
||||
pub mod lwe_programmable_bootstrapping;
|
||||
|
||||
pub mod glwe_sample_extraction;
|
||||
mod lwe_keyswitch;
|
||||
#[cfg(test)]
|
||||
mod test;
|
||||
|
||||
pub use lwe_keyswitch::*;
|
||||
pub use lwe_linear_algebra::*;
|
||||
pub use lwe_multi_bit_programmable_bootstrapping::*;
|
||||
pub use lwe_packing_keyswitch::*;
|
||||
pub use lwe_programmable_bootstrapping::*;
|
||||
|
||||
@@ -0,0 +1,234 @@
|
||||
use super::*;
|
||||
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async;
|
||||
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::Serialize;
|
||||
|
||||
const NB_TESTS: usize = 10;
|
||||
fn generate_keys<Scalar: UnsignedTorus + Sync + Send + Serialize + DeserializeOwned>(
|
||||
params: PackingKeySwitchTestParams<Scalar>,
|
||||
streams: &CudaStreams,
|
||||
rsc: &mut TestResources,
|
||||
) -> CudaPackingKeySwitchKeys<Scalar> {
|
||||
let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key(
|
||||
params.lwe_dimension,
|
||||
&mut rsc.secret_random_generator,
|
||||
);
|
||||
|
||||
let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension,
|
||||
params.polynomial_size,
|
||||
&mut rsc.secret_random_generator,
|
||||
);
|
||||
|
||||
let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key(
|
||||
&lwe_sk,
|
||||
&glwe_sk,
|
||||
params.pbs_base_log,
|
||||
params.pbs_level,
|
||||
params.glwe_noise_distribution,
|
||||
params.ciphertext_modulus,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&pksk,
|
||||
params.ciphertext_modulus
|
||||
));
|
||||
|
||||
let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams);
|
||||
|
||||
CudaPackingKeySwitchKeys {
|
||||
lwe_sk,
|
||||
glwe_sk,
|
||||
pksk: cuda_pksk,
|
||||
}
|
||||
}
|
||||
|
||||
fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod<Scalar, P>(params: P)
|
||||
where
|
||||
Scalar: UnsignedTorus + CastInto<usize> + Serialize + DeserializeOwned,
|
||||
P: Into<PackingKeySwitchTestParams<Scalar>>,
|
||||
PackingKeySwitchTestParams<Scalar>: KeyCacheAccess<Keys = PackingKeySwitchKeys<Scalar>>,
|
||||
{
|
||||
let params = params.into();
|
||||
|
||||
let lwe_noise_distribution = params.lwe_noise_distribution;
|
||||
let ciphertext_modulus = params.ciphertext_modulus;
|
||||
let message_modulus_log = params.message_modulus_log;
|
||||
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
|
||||
|
||||
let mut rsc = TestResources::new();
|
||||
|
||||
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
|
||||
let mut msg = msg_modulus;
|
||||
let delta: Scalar = encoding_with_padding / msg_modulus;
|
||||
|
||||
let gpu_index = 0;
|
||||
let stream = CudaStreams::new_single_gpu(gpu_index);
|
||||
|
||||
while msg != Scalar::ZERO {
|
||||
msg = msg.wrapping_sub(Scalar::ONE);
|
||||
for _ in 0..NB_TESTS {
|
||||
let keys = generate_keys(params, &stream, &mut rsc);
|
||||
let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk);
|
||||
|
||||
let plaintext = Plaintext(msg * delta);
|
||||
|
||||
let input_lwe = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
&lwe_sk,
|
||||
plaintext,
|
||||
lwe_noise_distribution,
|
||||
ciphertext_modulus,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&input_lwe,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let mut d_output_glwe = CudaGlweCiphertextList::new(
|
||||
glwe_sk.glwe_dimension(),
|
||||
glwe_sk.polynomial_size(),
|
||||
GlweCiphertextCount(1),
|
||||
ciphertext_modulus,
|
||||
&stream,
|
||||
);
|
||||
|
||||
unsafe {
|
||||
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async(
|
||||
&pksk,
|
||||
&d_input_lwe,
|
||||
&mut d_output_glwe,
|
||||
&stream,
|
||||
);
|
||||
}
|
||||
let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream);
|
||||
let mut decrypted_plaintext_list = PlaintextList::new(
|
||||
Scalar::ZERO,
|
||||
PlaintextCount(output_glwe_list.polynomial_size().0),
|
||||
);
|
||||
|
||||
decrypt_glwe_ciphertext_list(
|
||||
&glwe_sk,
|
||||
&output_glwe_list,
|
||||
&mut decrypted_plaintext_list,
|
||||
);
|
||||
let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus;
|
||||
|
||||
assert_eq!(msg, decoded);
|
||||
}
|
||||
|
||||
// In coverage, we break after one while loop iteration, changing message values does not
|
||||
// yield higher coverage
|
||||
#[cfg(tarpaulin)]
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod<Scalar, P>(params: P)
|
||||
where
|
||||
Scalar: UnsignedTorus + CastInto<usize> + Serialize + DeserializeOwned,
|
||||
P: Into<PackingKeySwitchTestParams<Scalar>>,
|
||||
PackingKeySwitchTestParams<Scalar>: KeyCacheAccess<Keys = PackingKeySwitchKeys<Scalar>>,
|
||||
{
|
||||
let params = params.into();
|
||||
|
||||
let lwe_noise_distribution = params.lwe_noise_distribution;
|
||||
let ciphertext_modulus = params.ciphertext_modulus;
|
||||
let message_modulus_log = params.message_modulus_log;
|
||||
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
|
||||
|
||||
let mut rsc = TestResources::new();
|
||||
|
||||
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
|
||||
let mut msg = msg_modulus;
|
||||
let delta: Scalar = encoding_with_padding / msg_modulus;
|
||||
|
||||
let gpu_index = 0;
|
||||
let stream = CudaStreams::new_single_gpu(gpu_index);
|
||||
|
||||
while msg != Scalar::ZERO {
|
||||
msg = msg.wrapping_sub(Scalar::ONE);
|
||||
for _ in 0..NB_TESTS {
|
||||
let keys = generate_keys(params, &stream, &mut rsc);
|
||||
let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk);
|
||||
|
||||
let mut input_lwe_list = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
lwe_sk.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(glwe_sk.polynomial_size().0),
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
let mut input_plaintext_list =
|
||||
PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0));
|
||||
|
||||
encrypt_lwe_ciphertext_list(
|
||||
&lwe_sk,
|
||||
&mut input_lwe_list,
|
||||
&input_plaintext_list,
|
||||
lwe_noise_distribution,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
let d_input_lwe_list =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&input_lwe_list,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let mut d_output_glwe = CudaGlweCiphertextList::new(
|
||||
glwe_sk.glwe_dimension(),
|
||||
glwe_sk.polynomial_size(),
|
||||
GlweCiphertextCount(1),
|
||||
ciphertext_modulus,
|
||||
&stream,
|
||||
);
|
||||
|
||||
unsafe {
|
||||
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async(
|
||||
&pksk,
|
||||
&d_input_lwe_list,
|
||||
&mut d_output_glwe,
|
||||
&stream,
|
||||
);
|
||||
}
|
||||
|
||||
let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream);
|
||||
|
||||
let mut decrypted_plaintext_list = PlaintextList::new(
|
||||
Scalar::ZERO,
|
||||
PlaintextCount(output_glwe_list.polynomial_size().0),
|
||||
);
|
||||
|
||||
decrypt_glwe_ciphertext_list(
|
||||
&glwe_sk,
|
||||
&output_glwe_list,
|
||||
&mut decrypted_plaintext_list,
|
||||
);
|
||||
|
||||
decrypted_plaintext_list
|
||||
.iter_mut()
|
||||
.for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus);
|
||||
input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta);
|
||||
|
||||
assert_eq!(decrypted_plaintext_list, input_plaintext_list);
|
||||
}
|
||||
|
||||
// In coverage, we break after one while loop iteration, changing message values does not
|
||||
// yield higher coverage
|
||||
#[cfg(tarpaulin)]
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod);
|
||||
create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod);
|
||||
@@ -4,8 +4,15 @@ mod glwe_sample_extraction;
|
||||
mod lwe_keyswitch;
|
||||
mod lwe_linear_algebra;
|
||||
mod lwe_multi_bit_programmable_bootstrapping;
|
||||
mod lwe_packing_keyswitch;
|
||||
mod lwe_programmable_bootstrapping;
|
||||
|
||||
/// Bundle of keys used by the GPU packing-keyswitch tests: the CPU-side secret
/// keys plus the packing keyswitch key already uploaded to the GPU.
pub struct CudaPackingKeySwitchKeys<Scalar: UnsignedInteger> {
    /// CPU LWE secret key (input side of the keyswitch).
    pub lwe_sk: LweSecretKey<Vec<Scalar>>,
    /// CPU GLWE secret key (output side of the keyswitch).
    pub glwe_sk: GlweSecretKey<Vec<Scalar>>,
    /// Packing keyswitch key resident on the GPU.
    pub pksk: CudaLwePackingKeyswitchKey<Scalar>,
}
|
||||
|
||||
// Macro to generate tests for all parameter sets
|
||||
macro_rules! create_gpu_parametrized_test{
|
||||
($name:ident { $($param:ident),* }) => {
|
||||
@@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{
|
||||
};
|
||||
}
|
||||
|
||||
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
|
||||
use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test};
|
||||
|
||||
@@ -199,51 +199,6 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
|
||||
LweCiphertext::from_container(container, self.ciphertext_modulus())
|
||||
}
|
||||
|
||||
/// Returns a deep copy of this list, allocated on the GPU.
///
/// ```rust
/// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::prelude::{
///     CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize,
/// };
///
/// let mut streams = CudaStreams::new_single_gpu(0);
///
/// let lwe_size = LweSize(743);
/// let ciphertext_modulus = CiphertextModulus::new_native();
/// let lwe_ciphertext_count = LweCiphertextCount(2);
///
/// // Create a new LweCiphertextList
/// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus);
///
/// // Copy to GPU
/// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams);
/// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams);
///
/// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams);
///
/// assert_eq!(lwe_list, lwe_list_copied);
/// ```
pub fn duplicate(&self, streams: &CudaStreams) -> Self {
    // Metadata of the new list mirrors the source list.
    let lwe_dimension = self.lwe_dimension();
    let lwe_ciphertext_count = self.lwe_ciphertext_count();
    let ciphertext_modulus = self.ciphertext_modulus();

    // Copy to the GPU
    let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0);
    // SAFETY: the asynchronous device-to-device copy is fenced by the
    // `streams.synchronize()` call below, and both source and destination
    // vectors stay alive until that point.
    unsafe {
        d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0);
    }
    streams.synchronize();

    let cuda_lwe_list = CudaLweList {
        d_vec,
        lwe_ciphertext_count,
        lwe_dimension,
        ciphertext_modulus,
    };
    Self(cuda_lwe_list)
}
|
||||
|
||||
/// Returns the `LweDimension` of the ciphertexts stored in this list.
pub(crate) fn lwe_dimension(&self) -> LweDimension {
    self.0.lwe_dimension
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user