Mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-11 07:38:08 -05:00

Compare commits: as/add_deb...as/ks32_gp (114 commits)
SHA1:
dc4001c80f, 30938eec74, 516789bd5d, 027792d659, 1ed9d6a85e, 126138a59d,
241685fccc, e739f43ec5, 3073d60f11, a05d228899, 63055d5ca8, 46a3008739,
f2674da031, 12c2a2a8b7, b61dd21ef7, ca4159f123, ab25919187, 1b38f8ccfc,
6a676551d8, afb79a0b1c, 0277403c45, 18159d6458, 728409aef8, 034f3b3c25,
c30e9c39f6, 1513c3bc8c, e07f07c4c8, 81cc0c31b4, c95e38e26f, f0f3dd76eb,
0604d237eb, e523fd2cb6, 33dee7673c, 9b5596ca66, aefec1fe64, f9e876730a,
f3cddb5635, a395cfe9bf, 602c6faf8a, 563502a6a6, 5f30569452, 39b81a8ded,
da223b36b6, db16276715, a59742f518, 2bf595d0e2, fb2b1a13e7, 9fdaa983e3,
73de886c07, 45a849ad36, ef5b984448, 6abed1f228, 71b45c14da, 4f5d711c4e,
2602c9e1b3, 06dffc60bd, 2a82076121, 7549474aac, 4fcff55745, 3d345a648b,
3975e0115b, 6494a82fb3, 8aa60f8882, 15cab8b413, 23d46ba2bc, daf0e79e4a,
c5ad73865c, 9aab79e23a, 6ca48132e1, f53c75636d, ce63cabc05, 4fec2e17ae,
e87c36beb4, 24c6ffc24a, 3680f796af, 3ded3fe7c9, 451cfe3aba, 1e28cf7f3b,
91b62c737f, da12bb29d8, d60028c47c, d5b5369a9a, 9457ca786c, 8b5d7321fb,
736185bb31, e4b230aaf1, 7ed827808c, 6e7aaac90f, d1c190fac6, 7e1c8f7db5,
d30c2060bf, 4ccd5ea262, 1ab3022df8, a257849c66, 0f4f8dd755, aaaa929c2e,
d397ea3a39, 3e25536021, 1c19851491, 4b0623da4a, d415d47894, e22f9c09e3,
4d02d3abb4, ae6f96e0ec, 70e1828c58, 1b1e6a7068, fc447fd2d0, d5e5902f61,
9f54777ee1, 4a73b7bb4b, 022cb3b18a, c4feabbfa3, 3c6ed37a18, fe6e81ff78
.github/actionlint.yaml (2 changed lines)

@@ -7,6 +7,8 @@ self-hosted-runner:
- large_ubuntu_16
- large_ubuntu_16-22.04
- v80-desktop
- v80-marais
- v80-couperin
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

@@ -66,14 +66,9 @@ jobs:
- name: Checkout tfhe-rs
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}

- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

# Cache key is an aggregated hash of lfs files hashes
- name: Get LFS data sha
id: hash-lfs-data

@@ -83,7 +78,7 @@ jobs:

- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor

@@ -95,6 +90,16 @@ jobs:
run: |
make pull_backward_compat_data

# Pull token was stored by action/checkout to be used by lfs, we don't need it anymore
- name: Remove git credentials
run: |
git config --local --unset-all http.https://github.com/.extraheader

- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable

- name: Run backward compatibility tests
run: |
make test_backward_compatibility_ci

@@ -102,7 +107,7 @@ jobs:

- name: Store data in cache
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
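The cache key above is described as an aggregated hash of the LFS files' hashes, but the hashing command itself sits outside the captured hunk. A minimal sketch of one way such a key can be derived (every command below is an assumption; only the step id hash-lfs-data comes from the diff):

# Assumption: derive one cache key from the object IDs of all LFS-tracked files.
# `git lfs ls-files --long` prints the full OID for each tracked file.
lfs_sha=$(git lfs ls-files --long | awk '{ print $1 }' | sort | sha256sum | cut -d ' ' -f 1)
echo "lfs-sha=${lfs_sha}" >> "${GITHUB_OUTPUT}"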
.github/workflows/aws_tfhe_fast_tests.yml (4 changed lines)

@@ -217,7 +217,7 @@ jobs:

- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm

@@ -230,7 +230,7 @@ jobs:
make install_node

- name: Node cache save
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
.github/workflows/aws_tfhe_integer_tests.yml (1 changed line)

@@ -107,6 +107,7 @@ jobs:
group: ${{ github.workflow_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
.github/workflows/aws_tfhe_wasm_tests.yml (4 changed lines)

@@ -78,7 +78,7 @@ jobs:

- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm

@@ -91,7 +91,7 @@ jobs:
make install_node

- name: Node cache save
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
.github/workflows/benchmark_core_crypto.yml (51 changed lines)

@@ -3,6 +3,16 @@ name: benchmark_core_crypto

on:
workflow_dispatch:
inputs:
param_type:
description: "Parameters type"
type: choice
default: classical
options:
- classical
- multi_bit
- both

schedule:
# Weekly benchmarks will be triggered each Saturday at 5a.m.
- cron: '0 5 * * 6'

@@ -22,8 +32,38 @@ env:
permissions: {}

jobs:
prepare-matrix:
name: benchmark_core_crypto/prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
param_type: ${{ steps.set_param_type.outputs.param_type }}
steps:
- name: Set parameters types
if: github.event_name == 'workflow_dispatch'
run: |
if [[ "${INPUTS_PARAM_TYPE}" == "both" ]]; then
echo "PARAM_TYPE=[\"classical\", \"multi_bit\"]" >> "${GITHUB_ENV}"
else
echo "PARAM_TYPE=[\"${INPUTS_PARAM_TYPE}\"]" >> "${GITHUB_ENV}"
fi
env:
INPUTS_PARAM_TYPE: ${{ inputs.param_type }}

- name: Default parameters type
if: github.event_name != 'workflow_dispatch'
run: |
echo "PARAM_TYPE=[\"classical\"]" >> "${GITHUB_ENV}"

- name: Set parameters types output
id: set_param_type
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "param_type=${{ toJSON(env.PARAM_TYPE) }}" >> "${GITHUB_OUTPUT}"

setup-instance:
name: benchmark_core_crypto/setup-instance
needs: prepare-matrix
runs-on: ubuntu-latest
if: github.event_name != 'schedule' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')

@@ -43,11 +83,16 @@ jobs:

core-crypto-benchmarks:
name: benchmark_core_crypto/core-crypto-benchmarks
needs: setup-instance
needs: [ prepare-matrix, setup-instance ]
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
strategy:
max-parallel: 1
matrix:
param_type: ${{ fromJSON(needs.prepare-matrix.outputs.param_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8

@@ -78,6 +123,8 @@ jobs:
make bench_pbs
make bench_pbs128
make bench_ks
env:
BENCH_PARAM_TYPE: ${{ matrix.param_type }}

- name: Parse results
run: |

@@ -96,7 +143,7 @@ jobs:
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_core_crypto
name: ${{ github.sha }}_core_crypto_${{ matrix.param_type }}_pbs
path: ${{ env.RESULTS_FILENAME }}

- name: Checkout Slab repo
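The prepare-matrix job above serializes the chosen parameter type(s) into a JSON array so the benchmark job can expand it with fromJSON in its matrix. A minimal local sketch of that shell logic, using only the values that appear in the diff:

INPUTS_PARAM_TYPE="both"   # workflow_dispatch choice: classical, multi_bit or both
if [[ "${INPUTS_PARAM_TYPE}" == "both" ]]; then
  PARAM_TYPE='["classical", "multi_bit"]'
else
  PARAM_TYPE="[\"${INPUTS_PARAM_TYPE}\"]"
fi
# The job exposes this string as an output; the matrix then expands it with fromJSON().
echo "param_type=${PARAM_TYPE}"   # -> param_type=["classical", "multi_bit"]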
.github/workflows/benchmark_hpu_hlapi.yml (9 changed lines)

@@ -16,7 +16,7 @@ permissions: {}
jobs:
hlapi-benchmarks-hpu:
name: Execute HLAPI benchmarks for HPU backend
runs-on: v80-desktop
runs-on: v80-marais
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

@@ -60,11 +60,14 @@ jobs:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

- name: Select HPU board
run: |
echo "V80_PCIE_DEV=24" >> "${GITHUB_ENV}"
echo "V80_SERIAL_NUMBER=XFL12NWY3ZKG" >> "${GITHUB_ENV}"

- name: Run benchmarks
run: |
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make bench_hlapi_erc20_hpu
make bench_hlapi_hpu
.github/workflows/benchmark_hpu_integer.yml (16 changed lines)

@@ -29,7 +29,7 @@ permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: v80-desktop
runs-on: v80-marais
outputs:
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
steps:

@@ -48,17 +48,17 @@ jobs:
if: github.event_name != 'workflow_dispatch'
run: |
echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"

- name: Set benchmark types output
id: set_bench_type
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"

integer-benchmarks-hpu:
name: benchmark_hpu_integer/integer-benchmarks-hpu
needs: prepare-matrix
runs-on: v80-desktop
runs-on: v80-marais
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

@@ -111,11 +111,15 @@ jobs:
run: |
echo "FAST_BENCH=FALSE" >> "${GITHUB_ENV}"

- name: Select HPU board
run: |
echo "V80_PCIE_DEV=24" >> "${GITHUB_ENV}"
echo "V80_SERIAL_NUMBER=XFL12NWY3ZKG" >> "${GITHUB_ENV}"

- name: Run benchmarks
run: |
echo "${V80_PCIE_DEV} ${V80_SERIAL_NUMBER}"
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
env:
BENCH_TYPE: ${{ matrix.bench_type }}
.github/workflows/benchmark_perf_regression.yml (14 changed lines)
@@ -21,20 +21,20 @@ env:
|
||||
permissions: { }
|
||||
|
||||
jobs:
|
||||
verify-actor:
|
||||
verify-triggering-actor:
|
||||
name: benchmark_perf_regression/verify-actor
|
||||
if: (github.event_name == 'pull_request' &&
|
||||
(contains(github.event.label.name, 'bench-perfs-cpu') ||
|
||||
contains(github.event.label.name, 'bench-perfs-gpu'))) ||
|
||||
(github.event.issue.pull_request && startsWith(github.event.comment.body, '/bench'))
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
uses: ./.github/workflows/verify_triggering_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
prepare-benchmarks:
|
||||
name: benchmark_perf_regression/prepare-benchmarks
|
||||
needs: verify-actor
|
||||
needs: verify-triggering-actor
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
commands: ${{ steps.set_commands.outputs.commands }}
|
||||
@@ -44,7 +44,7 @@ jobs:
|
||||
custom-env: ${{ steps.get_custom_env.outputs.custom_env }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
gcc: 11
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -159,7 +159,7 @@ jobs:
|
||||
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo with tags
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
@@ -213,7 +213,7 @@ jobs:
|
||||
toolchain: nightly
|
||||
|
||||
- name: Checkout Slab repo
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
repository: zama-ai/slab
|
||||
path: slab
|
||||
|
||||
.github/workflows/benchmark_wasm_client.yml (4 changed lines)

@@ -117,7 +117,7 @@ jobs:

- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm

@@ -130,7 +130,7 @@ jobs:
make install_node

- name: Node cache save
uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809 #v4.2.4
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
.github/workflows/cargo_audit.yml (2 changed lines)

@@ -1,4 +1,6 @@
# Run cargo audit
name: cargo_audit

on:
workflow_dispatch:
schedule:
.github/workflows/cargo_build.yml (170 changed lines)
@@ -18,17 +18,95 @@ permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
cargo-builds:
|
||||
name: cargo_build/cargo-builds (bpr)
|
||||
runs-on: ${{ matrix.os }}
|
||||
prepare-parallel-pcc-matrix:
|
||||
name: cargo_build/prepare-parallel-pcc-matrix
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: "false"
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
# Fetch all the Make recipes that start with `pcc_batch_`
|
||||
- name: Set pcc commands matrix
|
||||
id: set-pcc-commands-matrix
|
||||
run: |
|
||||
COMMANDS=$(grep -oE '^pcc_batch_[^:]*:' Makefile | sed 's/:/\"/; s/^/\"/' | paste -sd,)
|
||||
echo "commands=[${COMMANDS}]" >> "$GITHUB_OUTPUT"
|
||||
|
||||
parallel-pcc-cpu:
|
||||
name: cargo_build/parallel-pcc-cpu
|
||||
needs: prepare-parallel-pcc-matrix
|
||||
runs-on: large_ubuntu_16
|
||||
strategy:
|
||||
matrix:
|
||||
command: ${{fromJson(needs.prepare-parallel-pcc-matrix.outputs.matrix_command)}}
|
||||
fail-fast: false
|
||||
steps:
|
||||
- name: Checkout tfhe-rs repo
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run pcc checks batch
|
||||
run: |
|
||||
make "${COMMAND}"
|
||||
env:
|
||||
COMMAND: ${{ matrix.command }}
|
||||
|
||||
pcc-hpu:
|
||||
name: cargo_build/pcc-hpu
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Run Hpu pcc checks
|
||||
run: |
|
||||
make pcc_hpu
|
||||
|
||||
build-tfhe-full:
|
||||
name: cargo_build/build-tfhe-full
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
# GitHub macos-latest are now M1 macs, so use ours, we limit what runs so it will be fast
|
||||
# even with a few PRs
|
||||
os: [large_ubuntu_16, macos-latest, windows-latest]
|
||||
os: [large_ubuntu_16, macos-latest-xlarge, large_windows_16_latest]
|
||||
fail-fast: false
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Build Release tfhe full
|
||||
run: |
|
||||
make build_tfhe_full
|
||||
|
||||
build:
|
||||
name: cargo_build/build
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
@@ -41,7 +119,6 @@ jobs:
|
||||
toolchain: stable
|
||||
|
||||
- name: Install and run newline linter checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
wget https://github.com/fernandrone/linelint/releases/download/0.0.6/linelint-linux-amd64
|
||||
echo "16b70fb7b471d6f95cbdc0b4e5dc2b0ac9e84ba9ecdc488f7bdf13df823aca4b linelint-linux-amd64" > checksum
|
||||
@@ -50,60 +127,93 @@ jobs:
|
||||
mv linelint-linux-amd64 /usr/local/bin/linelint
|
||||
make check_newline
|
||||
|
||||
- name: Run pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc
|
||||
|
||||
- name: Build tfhe-csprng
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_csprng
|
||||
|
||||
- name: Build with MSRV
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_msrv
|
||||
|
||||
- name: Build coverage tests
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
build-layers:
|
||||
name: cargo_build/build-layers
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Build Release core
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_core AVX512_SUPPORT=ON
|
||||
make build_core_experimental AVX512_SUPPORT=ON
|
||||
|
||||
- name: Build Release boolean
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_boolean
|
||||
|
||||
- name: Build Release shortint
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_shortint
|
||||
|
||||
- name: Build Release integer
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_integer
|
||||
|
||||
- name: Build Release tfhe full
|
||||
run: |
|
||||
make build_tfhe_full
|
||||
build-c-api:
|
||||
name: cargo_build/build-c-api
|
||||
runs-on: large_ubuntu_16
|
||||
steps:
|
||||
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ env.CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Install latest stable
|
||||
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
|
||||
with:
|
||||
toolchain: stable
|
||||
|
||||
- name: Build Release c_api
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_c_api
|
||||
|
||||
- name: Build coverage tests
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make build_tfhe_coverage
|
||||
|
||||
- name: Run Hpu pcc checks
|
||||
if: ${{ contains(matrix.os, 'ubuntu') }}
|
||||
run: |
|
||||
make pcc_hpu
|
||||
|
||||
# The wasm build check is a bit annoying to set-up here and is done during the tests in
|
||||
# aws_tfhe_tests.yml
|
||||
|
||||
cargo-builds:
|
||||
name: cargo_build/cargo-builds (bpr)
|
||||
needs: [ parallel-pcc-cpu, pcc-hpu, build-tfhe-full, build, build-layers, build-c-api ]
|
||||
if: ${{ always() }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check all builds success
|
||||
if: needs.parallel-pcc-cpu.result == 'success' &&
|
||||
needs.pcc-hpu.result == 'success' &&
|
||||
needs.build-tfhe-full.result == 'success' &&
|
||||
needs.build.result == 'success' &&
|
||||
needs.build-layers.result == 'success' &&
|
||||
needs.build-c-api.result == 'success'
|
||||
run: |
|
||||
echo "All tfhe-rs build checks passed"
|
||||
|
||||
- name: Check builds failure
|
||||
if: needs.parallel-pcc-cpu.result != 'success' ||
|
||||
needs.pcc-hpu.result != 'success' ||
|
||||
needs.build-tfhe-full.result != 'success' ||
|
||||
needs.build.result != 'success' ||
|
||||
needs.build-layers.result != 'success' ||
|
||||
needs.build-c-api.result != 'success'
|
||||
run: |
|
||||
echo "Some tfhe-rs build checks failed"
|
||||
exit 1
|
||||
|
||||
.github/workflows/ci_lint.yml (2 changed lines)

@@ -42,7 +42,7 @@ jobs:
GH_TOKEN: ${{ env.CHECKOUT_TOKEN }}

- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@fc87bb5b5a97953d987372e74478de634726b3e5 # v3.0.25
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@9e9574ef04ea69da568d6249bd69539ccc704e74 # v4.0.0
with:
allowlist: |
slsa-framework/slsa-github-generator
@@ -193,7 +193,7 @@ jobs:
|
||||
uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de
|
||||
|
||||
- name: Cache cargo
|
||||
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
|
||||
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
@@ -203,14 +203,14 @@ jobs:
|
||||
restore-keys: ${{ runner.os }}-cargo-
|
||||
|
||||
- name: Login to GitHub Container Registry
|
||||
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Login to Chainguard Registry
|
||||
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
|
||||
with:
|
||||
registry: cgr.dev
|
||||
username: ${{ secrets.CGR_USERNAME }}
|
||||
|
||||
@@ -1,33 +1,31 @@
|
||||
# Publish new release of tfhe-rs on various platform.
|
||||
name: make_release
|
||||
# Common workflow to make crate release
|
||||
name: make_release_common
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
workflow_call:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
package-name:
|
||||
type: string
|
||||
required: true
|
||||
dry-run:
|
||||
type: boolean
|
||||
default: true
|
||||
push_to_crates:
|
||||
description: "Push to crate"
|
||||
type: boolean
|
||||
default: true
|
||||
push_web_package:
|
||||
description: "Push web js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_node_package:
|
||||
description: "Push node js package"
|
||||
type: boolean
|
||||
default: true
|
||||
npm_latest_tag:
|
||||
description: "Set NPM tag as latest"
|
||||
type: boolean
|
||||
default: false
|
||||
secrets:
|
||||
REPO_CHECKOUT_TOKEN:
|
||||
required: true
|
||||
SLACK_CHANNEL:
|
||||
required: true
|
||||
BOT_USERNAME:
|
||||
required: true
|
||||
SLACK_WEBHOOK:
|
||||
required: true
|
||||
ALLOWED_TEAM:
|
||||
required: true
|
||||
READ_ORG_TOKEN:
|
||||
required: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
NPM_TAG: ""
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
@@ -36,18 +34,18 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release/verify-tag
|
||||
verify-triggering-actor:
|
||||
name: make_release_common/verify-triggering-actor
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
uses: ./.github/workflows/verify_triggering_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
ALLOWED_TEAM: ${{ secrets.ALLOWED_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release/package
|
||||
name: make_release_common/package
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify-tag
|
||||
needs: verify-triggering-actor
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
@@ -58,20 +56,23 @@ jobs:
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
env:
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
run: |
|
||||
cargo package -p tfhe
|
||||
cargo package -p "${PACKAGE}"
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
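The hash recorded here is used twice: it feeds the SLSA provenance job as base64-subjects, and it is compared against a second hash computed after publication, with a mismatch triggering the Slack alert. A minimal local sketch of that comparison, assuming the same directory layout:

cd target/package
hash=$(sha256sum ./*.crate | base64 -w0)       # captured right after `cargo package`
# ... `cargo publish` runs in the publish job ...
pub_hash=$(sha256sum ./*.crate | base64 -w0)   # recomputed after publication
if [ "${hash}" != "${pub_hash}" ]; then
  echo "crate content changed between packaging and publication" >&2
fi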
|
||||
|
||||
|
||||
provenance:
|
||||
name: make_release/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
name: make_release_common/provenance
|
||||
if: ${{ !inputs.dry-run }}
|
||||
needs: package
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
@@ -84,14 +85,14 @@ jobs:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
|
||||
publish_release:
|
||||
name: make_release/publish_release
|
||||
needs: [package] # for comparing hashes
|
||||
name: make_release_common/publish-release
|
||||
needs: package
|
||||
runs-on: ubuntu-latest
|
||||
# For provenance of npmjs publish
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write # also needed for OIDC token exchange on crates.io
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
@@ -99,28 +100,27 @@ jobs:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Create NPM version tag
|
||||
if: ${{ inputs.npm_latest_tag }}
|
||||
run: |
|
||||
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate
|
||||
name: crate-${{ inputs.package-name }}
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
if: ${{ inputs.push_to_crates }}
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
PACKAGE: ${{ inputs.package-name }}
|
||||
DRY_RUN: ${{ inputs.dry-run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe ${DRY_RUN}
|
||||
cargo publish -p "${PACKAGE}" ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
@@ -132,45 +132,12 @@ jobs:
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Build web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
run: |
|
||||
make build_web_js_api_parallel
|
||||
|
||||
- name: Publish web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
run: |
|
||||
rm -rf tfhe/pkg
|
||||
|
||||
make build_node_js_api
|
||||
sed -i 's/"tfhe"/"node-tfhe"/g' tfhe/pkg/package.json
|
||||
|
||||
- name: Publish Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
uses: JS-DevTools/npm-publish@19c28f1ef146469e409470805ea4279d47c3d35c
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
SLACK_MESSAGE: "SLSA ${{ inputs.package-name }} - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
SLACK_MESSAGE: "${{ inputs.package-name }} release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
.github/workflows/make_release_cuda.yml (20 changed lines)
@@ -18,17 +18,17 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_cuda/verify-tag
|
||||
verify-triggering-actor:
|
||||
name: make_release_cuda/verify-triggering-actor
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
uses: ./.github/workflows/verify_triggering_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
setup-instance:
|
||||
name: make_release_cuda/setup-instance
|
||||
needs: verify-tag
|
||||
needs: verify-triggering-actor
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
runner-name: ${{ steps.start-instance.outputs.label }}
|
||||
@@ -101,6 +101,12 @@ jobs:
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-cuda-backend
|
||||
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
path: target/package/*.crate
|
||||
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
@@ -169,6 +175,12 @@ jobs:
|
||||
env:
|
||||
GCC_VERSION: ${{ matrix.gcc }}
|
||||
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-cuda-backend
|
||||
path: target/package
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
.github/workflows/make_release_hpu.yml (102 changed lines)
@@ -18,43 +18,12 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_hpu/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release_hpu/package
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify-tag
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-hpu-backend
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_hpu/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_hpu/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-hpu-backend"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -62,55 +31,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: make_release_hpu/publish-release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify-tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-hpu-backend ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
.github/workflows/make_release_tfhe.yml (new file, 120 lines)
@@ -0,0 +1,120 @@
|
||||
# Publish new release of tfhe-rs on various platform.
|
||||
name: make_release_tfhe
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
push_to_crates:
|
||||
description: "Push to crate"
|
||||
type: boolean
|
||||
default: true
|
||||
push_web_package:
|
||||
description: "Push web js package"
|
||||
type: boolean
|
||||
default: true
|
||||
push_node_package:
|
||||
description: "Push node js package"
|
||||
type: boolean
|
||||
default: true
|
||||
npm_latest_tag:
|
||||
description: "Set NPM tag as latest"
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
NPM_TAG: ""
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
make-release:
|
||||
name: make_release_tfhe/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
# Needed to create the provenance via GitHub OIDC
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
make-release-js:
|
||||
name: make_release_tfhe/make-release-js
|
||||
needs: make-release
|
||||
runs-on: ubuntu-latest
|
||||
# For provenance of npmjs publish
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write # also needed for OIDC token exchange on crates.io
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Create NPM version tag
|
||||
if: ${{ inputs.npm_latest_tag }}
|
||||
run: |
|
||||
echo "NPM_TAG=latest" >> "${GITHUB_ENV}"
|
||||
|
||||
- name: Build web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
run: |
|
||||
make build_web_js_api_parallel
|
||||
|
||||
- name: Publish web package
|
||||
if: ${{ inputs.push_web_package }}
|
||||
uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Build Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
run: |
|
||||
rm -rf tfhe/pkg
|
||||
|
||||
make build_node_js_api
|
||||
sed -i 's/"tfhe"/"node-tfhe"/g' tfhe/pkg/package.json
|
||||
|
||||
- name: Publish Node package
|
||||
if: ${{ inputs.push_node_package }}
|
||||
uses: JS-DevTools/npm-publish@7f8fe47b3bea1be0c3aec2b717c5ec1f3e03410b
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: tfhe/pkg/package.json
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
tag: ${{ env.NPM_TAG }}
|
||||
provenance: true
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
.github/workflows/make_release_tfhe_csprng.yml (110 changed lines)
@@ -8,53 +8,15 @@ on:
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
|
||||
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_tfhe_csprng/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release_tfhe_csprng/package
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-csprng
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate-tfhe-csprng
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
|
||||
provenance:
|
||||
name: make_release_tfhe_csprng/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_tfhe_csprng/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-csprng"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -62,56 +24,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
|
||||
publish_release:
|
||||
name: make_release_tfhe_csprng/publish-release
|
||||
needs: [verify-tag, package]
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-csprng
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-csprng ${DRY_RUN}
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-csprng - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-csprng release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
.github/workflows/make_release_tfhe_fft.yml (102 changed lines)
@@ -19,43 +19,12 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_tfhe_fft/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release_tfhe_fft/package
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify-tag
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-fft
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_tfhe_fft/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_tfhe_fft/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-fft"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -63,55 +32,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: make_release_tfhe_fft/publish-release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify-tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-fft ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-fft crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-fft release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
.github/workflows/make_release_tfhe_ntt.yml (102 changed lines)
@@ -19,43 +19,12 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_tfhe_ntt/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release_tfhe_ntt/package
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify-tag
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-ntt
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_tfhe_ntt/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [package]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_tfhe_ntt/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-ntt"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -63,55 +32,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: make_release_tfhe_ntt/publish-release
|
||||
runs-on: ubuntu-latest
|
||||
needs: [verify-tag, package] # for comparing hashes
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-ntt ${DRY_RUN}
|
||||
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-ntt crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-ntt release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
196
.github/workflows/make_release_tfhe_versionable.yml
vendored
@@ -2,6 +2,11 @@ name: make_release_tfhe_versionable
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
dry_run:
|
||||
description: "Dry-run"
|
||||
type: boolean
|
||||
default: true
|
||||
|
||||
env:
|
||||
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
@@ -13,41 +18,34 @@ env:
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_tfhe_versionable/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
make-release-derive:
|
||||
name: make_release_tfhe_versionable/make-release-derive
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-versionable-derive"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
# Needed to create the provenance via GitHub OIDC
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package-derive:
|
||||
name: make_release_tfhe_versionable/package-derive
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-versionable-derive
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate-tfhe-versionable-derive
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance-derive:
|
||||
name: make_release_tfhe_versionable/provenance-derive
|
||||
needs: [package-derive]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_tfhe_versionable/make-release
|
||||
needs: make-release-derive
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-versionable"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -55,132 +53,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package-derive.outputs.hash }}
|
||||
|
||||
publish_release-derive:
|
||||
name: make_release_tfhe_versionable/publish_release_derive
|
||||
needs: [ verify-tag, package-derive ] # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-versionable-derive
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable-derive
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package-derive.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-versionable-derive - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-versionable-derive release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
|
||||
package:
|
||||
name: make_release_tfhe_versionable/package
|
||||
needs: publish_release-derive
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-versionable
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate-tfhe-versionable
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_tfhe_versionable/provenance
|
||||
needs: package
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
# Needed to create the provenance via GitHub OIDC
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: make_release_tfhe_versionable/publish-release
|
||||
needs: package # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-tfhe-versionable
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
run: |
|
||||
cargo publish -p tfhe-versionable
|
||||
- name: Generate hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-versionable - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-versionable release finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
102
.github/workflows/make_release_zk_pok.yml
vendored
@@ -18,43 +18,12 @@ env:
|
||||
permissions: { }
|
||||
|
||||
jobs:
|
||||
verify-tag:
|
||||
name: make_release_zk_pok/verify-tag
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
uses: ./.github/workflows/verify_commit_actor.yml
|
||||
secrets:
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
package:
|
||||
name: make_release_zk_pok/package
|
||||
runs-on: ubuntu-latest
|
||||
needs: verify-tag
|
||||
outputs:
|
||||
hash: ${{ steps.hash.outputs.hash }}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Prepare package
|
||||
run: |
|
||||
cargo package -p tfhe-zk-pok
|
||||
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
||||
with:
|
||||
name: crate-zk-pok
|
||||
path: target/package/*.crate
|
||||
- name: generate hash
|
||||
id: hash
|
||||
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
provenance:
|
||||
name: make_release_zk_pok/provenance
|
||||
if: ${{ !inputs.dry_run }}
|
||||
needs: [ package ]
|
||||
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
|
||||
make-release:
|
||||
name: make_release_zk_pok/make-release
|
||||
uses: ./.github/workflows/make_release_common.yml
|
||||
with:
|
||||
package-name: "tfhe-zk-pok"
|
||||
dry-run: ${{ inputs.dry_run }}
|
||||
permissions:
|
||||
# Needed to detect the GitHub Actions environment
|
||||
actions: read
|
||||
@@ -62,55 +31,10 @@ jobs:
|
||||
id-token: write
|
||||
# Needed to upload assets/artifacts
|
||||
contents: write
|
||||
with:
|
||||
# SHA-256 hashes of the Crate package.
|
||||
base64-subjects: ${{ needs.package.outputs.hash }}
|
||||
|
||||
publish_release:
|
||||
name: make_release_zk_pok/publish-release
|
||||
needs: [ verify-tag, package ] # for comparing hashes
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
# Needed for OIDC token exchange on crates.io
|
||||
id-token: write
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
|
||||
with:
|
||||
fetch-depth: 0
|
||||
persist-credentials: 'false'
|
||||
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
- name: Download artifact
|
||||
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
|
||||
with:
|
||||
name: crate-zk-pok
|
||||
path: target/package
|
||||
- name: Authenticate on registry
|
||||
uses: rust-lang/crates-io-auth-action@e919bc7605cde86df457cf5b93c5e103838bd879 # v1.0.1
|
||||
id: auth
|
||||
- name: Publish crate.io package
|
||||
env:
|
||||
CARGO_REGISTRY_TOKEN: ${{ steps.auth.outputs.token }}
|
||||
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
|
||||
run: |
|
||||
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
|
||||
# would fail. This is safe since DRY_RUN is handled in the env section above.
|
||||
# shellcheck disable=SC2086
|
||||
cargo publish -p tfhe-zk-pok ${DRY_RUN}
|
||||
- name: Verify hash
|
||||
id: published_hash
|
||||
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
|
||||
- name: Slack notification (hashes comparison)
|
||||
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: failure
|
||||
SLACK_MESSAGE: "SLSA tfhe-zk-pok crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
|
||||
- name: Slack Notification
|
||||
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
|
||||
continue-on-error: true
|
||||
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
|
||||
env:
|
||||
SLACK_COLOR: ${{ job.status }}
|
||||
SLACK_MESSAGE: "tfhe-zk-pok release failed: (${{ env.ACTION_RUN_URL }})"
|
||||
secrets:
|
||||
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
|
||||
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
|
||||
ALLOWED_TEAM: ${{ secrets.RELEASE_TEAM }}
|
||||
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
|
||||
|
||||
2
.github/workflows/unverified_prs.yml
vendored
@@ -14,7 +14,7 @@ jobs:
issues: read
pull-requests: write
steps:
- uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2
@@ -1,5 +1,5 @@
# Verify a commit actor
name: verify_commit_actor
# Verify a triggering actor
name: verify_triggering_actor

on:
workflow_call:
@@ -13,7 +13,7 @@ permissions: {}

jobs:
check-actor:
name: verify_commit_actor/check-actor
name: verify_triggering_actor/check-actor
runs-on: ubuntu-latest
steps:
# Check triggering actor membership
@@ -24,7 +24,7 @@ exclude = [
]
[workspace.dependencies]
aligned-vec = { version = "0.6", default-features = false }
bytemuck = "1.14.3"
bytemuck = "<1.24"
dyn-stack = { version = "0.11", default-features = false }
itertools = "0.14"
num-complex = "0.4"
@@ -54,3 +54,10 @@ debug-assertions = false

[workspace.metadata.dylint]
libraries = [{ path = "utils/tfhe-lints" }]

[profile.debug_lto_off]
inherits = "dev"
debug = true
lto = "off"
debug-assertions = false
overflow-checks = false
74
Makefile
@@ -54,6 +54,7 @@ TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
|
||||
|
||||
# tfhe-hpu-backend
|
||||
HPU_CONFIG=v80
|
||||
V80_PCIE_DEV?=01
|
||||
|
||||
# Exclude these files from coverage reports
|
||||
define COVERAGE_EXCLUDED_FILES
|
||||
@@ -745,6 +746,16 @@ test_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
--features=integer,gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence integer::gpu::server_key::radix::tests_long_run::test_signed_random_op_sequence --test-threads=1 --nocapture
|
||||
|
||||
.PHONY: build_debug_integer_short_run_gpu # Run the long run integer tests on the gpu backend
|
||||
build_debug_integer_short_run_gpu: install_rs_check_toolchain install_cargo_nextest
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test -vv --no-run --profile debug_lto_off \
|
||||
--features=integer,gpu-debug-fake-multi-gpu -p tfhe
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile debug_lto_off \
|
||||
--features=integer,gpu-debug-fake-multi-gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random --list
|
||||
@echo "To debug fake-multi-gpu short run tests run:"
|
||||
@echo "TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE <executable> integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random_op_sequence_param_gpu_multi_bit_group_4_message_2_carry_2_ks_pbs_tuniform_2m128 --nocapture"
|
||||
@echo "Where <executable> = the one printed in the () in the 'Running unittests src/lib.rs ()' line above"
|
||||
|
||||
.PHONY: test_integer_compression
|
||||
test_integer_compression: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
|
||||
@@ -1009,6 +1020,11 @@ build_one_hl_api_test_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=integer,gpu-debug -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
|
||||
|
||||
.PHONY: build_one_hl_api_test_fake_multi_gpu
|
||||
build_one_hl_api_test_fake_multi_gpu: install_rs_build_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --no-run \
|
||||
--features=integer,gpu-debug-fake-multi-gpu -vv -p tfhe -- "$${TEST}" --test-threads=1 --nocapture
|
||||
|
||||
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
|
||||
ifeq ($(HPU_CONFIG), v80)
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
|
||||
@@ -1189,6 +1205,8 @@ check_compile_tests: install_rs_build_toolchain
|
||||
--features=experimental,boolean,shortint,integer,internal-keycache \
|
||||
-p tfhe
|
||||
|
||||
.PHONY: check_compile_tests_c_api # Build C API tests without running them
|
||||
check_compile_tests_c_api: install_rs_build_toolchain
|
||||
@if [[ "$(OS)" == "Linux" || "$(OS)" == "Darwin" ]]; then \
|
||||
"$(MAKE)" build_c_api && \
|
||||
./scripts/c_api_tests.sh --build-only --cargo-profile "$(CARGO_PROFILE)"; \
|
||||
@@ -1326,11 +1344,12 @@ bench_signed_integer_gpu: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
|
||||
bench_integer_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG); \
|
||||
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench integer-bench \
|
||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
|
||||
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
|
||||
|
||||
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
|
||||
bench_integer_compression: install_rs_check_toolchain
|
||||
@@ -1503,21 +1522,22 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
|
||||
bench_hlapi: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,internal-keycache,nightly-avx512 -p tfhe-benchmark --
|
||||
--features=integer,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
|
||||
bench_hlapi_gpu: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
|
||||
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
|
||||
bench_hlapi_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG); \
|
||||
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
|
||||
bench_hlapi_erc20: install_rs_check_toolchain
|
||||
@@ -1545,11 +1565,12 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
|
||||
|
||||
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
|
||||
bench_hlapi_erc20_hpu: install_rs_check_toolchain
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
|
||||
source ./setup_hpu.sh --config $(HPU_CONFIG); \
|
||||
export V80_PCIE_DEV=${V80_PCIE_DEV}; \
|
||||
RUSTFLAGS="$(RUSTFLAGS)" \
|
||||
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
|
||||
--bench hlapi-erc20 \
|
||||
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
|
||||
--features=integer,internal-keycache,hpu,hpu-v80,pbs-stats -p tfhe-benchmark --
|
||||
|
||||
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
|
||||
bench_tfhe_zk_pok: install_rs_check_toolchain
|
||||
@@ -1662,11 +1683,38 @@ sha256_bool: install_rs_check_toolchain
|
||||
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) run --profile $(CARGO_PROFILE) \
|
||||
--example sha256_bool --features=boolean
|
||||
|
||||
.PHONY: pcc # pcc stands for pre commit checks (except GPU)
|
||||
pcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
|
||||
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash \
|
||||
clippy_all check_compile_tests test_tfhe_lints \
|
||||
tfhe_lints
|
||||
.PHONY: pcc # pcc stands for pre commit checks for CPU compilation
|
||||
pcc: pcc_batch_1 pcc_batch_2 pcc_batch_3 pcc_batch_4 pcc_batch_5 pcc_batch_6 pcc_batch_7
|
||||
|
||||
#
|
||||
# PCC split into several batches to speed-up CI feedback.
|
||||
# Each batch have roughly the same execution time.
|
||||
# Durations are given from GitHub Ubuntu large runner with 16 CPU.
|
||||
#
|
||||
|
||||
.PHONY: pcc_batch_1 # duration: 6'10''
|
||||
pcc_batch_1: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
|
||||
check_md_docs_are_tested check_intra_md_links check_doc_paths_use_dash test_tfhe_lints tfhe_lints \
|
||||
clippy_rustdoc
|
||||
|
||||
.PHONY: pcc_batch_2 # duration: 6'10''
|
||||
pcc_batch_2: clippy clippy_all_targets
|
||||
|
||||
.PHONY: pcc_batch_3 # duration: 6'50''
|
||||
pcc_batch_3: clippy_shortint clippy_integer
|
||||
|
||||
.PHONY: pcc_batch_4 # duration: 7'40''
|
||||
pcc_batch_4: clippy_core clippy_js_wasm_api clippy_ws_tests clippy_bench
|
||||
|
||||
.PHONY: pcc_batch_5 # duration: 7'20''
|
||||
pcc_batch_5: clippy_tfhe_lints check_compile_tests clippy_backward_compat_data
|
||||
|
||||
.PHONY: pcc_batch_6 # duration: 4'50'' (shortest one, extend it with further checks)
|
||||
pcc_batch_6: clippy_boolean clippy_c_api clippy_tasks clippy_tfhe_csprng clippy_zk_pok \
|
||||
clippy_trivium clippy_versionable clippy_param_dedup
|
||||
|
||||
.PHONY: pcc_batch_7 # duration: 7'50'' (currently PCC execution bottleneck)
|
||||
pcc_batch_7: check_compile_tests_c_api
|
||||
|
||||
.PHONY: pcc_gpu # pcc stands for pre commit checks for GPU compilation
|
||||
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
|
||||
|
||||
@@ -45,7 +45,7 @@ production-ready library for all the advanced features of TFHE.
- **Short integer API** that enables exact, unbounded FHE integer arithmetics with up to 8 bits of message space
- **Size-efficient public key encryption**
- **Ciphertext and server key compression** for efficient data transfer
- **Full Rust API, C bindings to the Rust High-Level API, and client-side Javascript API using WASM**.
- **Full Rust API, C bindings to the Rust High-Level API, and client-side JavaScript API using WASM**.

*Learn more about TFHE-rs features in the [documentation](https://docs.zama.ai/tfhe-rs/readme).*
<br></br>
@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
```

> [!Note]
> Note: You need to use Rust version >= 1.84 to compile TFHE-rs.
> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.

> [!Note]
> Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.
@@ -147,7 +147,7 @@ To run this code, use the following command:

> [!Note]
> Note that when running code that uses `TFHE-rs`, it is highly recommended
to run in release mode with cargo's `--release` flag to have the best performances possible.
to run in release mode with cargo's `--release` flag to have the best performance possible.

*Find an example with more explanations in [this part of the documentation](https://docs.zama.ai/tfhe-rs/get-started/quick-start)*
@@ -13,6 +13,7 @@ extend-ignore-identifiers-re = [
# Example in trivium
"C9217BA0D762ACA1",
"0x[0-9a-fA-F]+",
"xrt_coreutil",
]

[files]
@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.11.0"
version = "0.12.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"
@@ -20,3 +20,4 @@ bindgen = "0.71"
experimental-multi-arch = []
profile = []
debug = []
debug-fake-multi-gpu = []
@@ -48,13 +48,16 @@ fn main() {
// Conditionally pass the "USE_NVTOOLS" variable to CMake if the feature is enabled
if cfg!(feature = "profile") {
cmake_config.define("USE_NVTOOLS", "ON");
println!("cargo:rustc-link-lib=nvToolsExt");
} else {
cmake_config.define("USE_NVTOOLS", "OFF");
}

if cfg!(feature = "debug") {
cmake_config.define("CMAKE_BUILD_TYPE", "Debug");
} else if cfg!(feature = "debug-fake-multi-gpu") {
cmake_config.define("CMAKE_BUILD_TYPE", "DebugOnlyCpu");
cmake_config.define("CMAKE_VERBOSE_MAKEFILE", "ON");
cmake_config.define("FAKE_MULTI_GPU", "ON");
}

// Build the CMake project
@@ -81,6 +84,7 @@ fn main() {
"cuda/include/ciphertext.h",
"cuda/include/integer/compression/compression.h",
"cuda/include/integer/integer.h",
"cuda/include/aes/aes.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",
@@ -87,6 +87,9 @@ if(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debug")
add_definitions(-DDEBUG)
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -G -g")
set(USE_NVTOOLS 1)
elseif(CMAKE_BUILD_TYPE_LOWERCASE STREQUAL "debugonlycpu")
message("Compiling GPU kernels in Release and CPU code in Debug")
set(OPTIMIZATION_FLAGS "${OPTIMIZATION_FLAGS} -O0 -g")
else()
# Release mode
message("Compiling in Release mode")
@@ -99,6 +102,11 @@ if(${USE_NVTOOLS})
add_definitions(-DUSE_NVTOOLS)
endif()

if(${FAKE_MULTI_GPU})
message(STATUS "Fake multi-gpu debugging is enabled")
add_definitions(-DDEBUG_FAKE_MULTI_GPU)
endif()

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging to use
# nvtx when profiling -lnvToolsExt
set(CMAKE_CUDA_FLAGS
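The CMake hunk above only wires the `debug-fake-multi-gpu` feature through to a `DEBUG_FAKE_MULTI_GPU` compile definition. As a purely hypothetical C++ sketch (the function name and the "pretend there are four GPUs" behaviour are made up for illustration, not taken from the backend), this is the kind of gating such a definition enables:

```cpp
#include <cstdio>

// Hypothetical illustration only: code guarded by the DEBUG_FAKE_MULTI_GPU
// definition that the CMake hunk above adds via add_definitions().
int visible_gpu_count(int physical_gpus) {
#ifdef DEBUG_FAKE_MULTI_GPU
  return physical_gpus * 4; // fake extra devices on a single-GPU machine
#else
  return physical_gpus;
#endif
}

int main() { std::printf("%d\n", visible_gpu_count(1)); }
```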
44
backends/tfhe-cuda-backend/cuda/include/aes/aes.h
Normal file
@@ -0,0 +1,44 @@
#ifndef AES_H
#define AES_H
#include "../integer/integer.h"

extern "C" {
uint64_t scratch_cuda_integer_aes_encrypt_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
uint32_t sbox_parallelism);

void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv,
CudaRadixCiphertextFFI const *round_keys,
const uint64_t *counter_bits_le_all_blocks,
uint32_t num_aes_inputs, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_key_expansion_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks);

void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
}

#endif
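The `cuda_integer_aes_ctr_encrypt_64` entry point above takes the clear counter bits as a plain `uint64_t` array (`counter_bits_le_all_blocks`). A minimal sketch of building such an array follows, assuming 128 little-endian bits per AES block with the block index used as the counter value; the exact layout the backend expects is not stated in the header, so both the helper name and the layout are assumptions.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch (assumed layout): 128 little-endian clear counter bits per AES
// block, block index used as the counter value added to the IV.
std::vector<uint64_t> make_counter_bits_le(uint32_t num_aes_inputs) {
  std::vector<uint64_t> bits(128ull * num_aes_inputs, 0);
  for (uint32_t block = 0; block < num_aes_inputs; ++block) {
    uint64_t ctr = block; // counter added homomorphically to the IV
    for (uint32_t b = 0; b < 64; ++b)
      bits[128ull * block + b] = (ctr >> b) & 1; // bits 64..127 stay zero
  }
  return bits;
}

int main() {
  auto bits = make_counter_bits_le(2);
  std::printf("%zu clear counter bits prepared\n", bits.size()); // 256
}
```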
440
backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
Normal file
@@ -0,0 +1,440 @@
|
||||
#ifndef AES_UTILITIES
|
||||
#define AES_UTILITIES
|
||||
#include "../integer/integer_utilities.h"
|
||||
|
||||
/**
|
||||
* This structure holds pre-computed LUTs for essential bitwise operations
|
||||
* required by the homomorphic AES circuit. Pre-computing these tables allows
|
||||
* for efficient application of non-linear functions like AND during the PBS
|
||||
* process. It includes LUTs for:
|
||||
* - AND: for the non-linear part of the S-Box.
|
||||
* - FLUSH: to clear carry bits and isolate the message bit (x -> x & 1).
|
||||
* - CARRY: to extract the carry bit for additions (x -> (x >> 1) & 1).
|
||||
*/
|
||||
template <typename Torus> struct int_aes_lut_buffers {
|
||||
int_radix_lut<Torus> *and_lut;
|
||||
int_radix_lut<Torus> *flush_lut;
|
||||
int_radix_lut<Torus> *carry_lut;
|
||||
|
||||
int_aes_lut_buffers(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_aes_inputs,
|
||||
uint32_t sbox_parallelism, uint64_t &size_tracker) {
|
||||
|
||||
constexpr uint32_t AES_STATE_BITS = 128;
|
||||
constexpr uint32_t SBOX_MAX_AND_GATES = 18;
|
||||
|
||||
this->and_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1,
|
||||
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus, Torus)> and_lambda =
|
||||
[](Torus a, Torus b) -> Torus { return a & b; };
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
|
||||
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, and_lambda, allocate_gpu_memory);
|
||||
auto active_streams_and_lut = streams.active_gpu_subset(
|
||||
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
|
||||
this->and_lut->broadcast_lut(active_streams_and_lut);
|
||||
|
||||
this->flush_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, AES_STATE_BITS * num_aes_inputs,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
|
||||
return x & 1;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
|
||||
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, flush_lambda, allocate_gpu_memory);
|
||||
auto active_streams_flush_lut =
|
||||
streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
|
||||
this->flush_lut->broadcast_lut(active_streams_flush_lut);
|
||||
|
||||
this->carry_lut = new int_radix_lut<Torus>(
|
||||
streams, params, 1, num_aes_inputs, allocate_gpu_memory, size_tracker);
|
||||
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
|
||||
return (x >> 1) & 1;
|
||||
};
|
||||
generate_device_accumulator(
|
||||
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
|
||||
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, carry_lambda, allocate_gpu_memory);
|
||||
auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
|
||||
this->carry_lut->broadcast_lut(active_streams_carry_lut);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
this->and_lut->release(streams);
|
||||
delete this->and_lut;
|
||||
this->and_lut = nullptr;
|
||||
|
||||
this->flush_lut->release(streams);
|
||||
delete this->flush_lut;
|
||||
this->flush_lut = nullptr;
|
||||
|
||||
this->carry_lut->release(streams);
|
||||
delete this->carry_lut;
|
||||
this->carry_lut = nullptr;
|
||||
}
|
||||
};
|
||||
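The three LUTs above encode simple bit-level functions. A cleartext sketch of the same functions, mirroring the lambdas passed to `generate_device_accumulator*` in the constructor, makes the encoding explicit: bit 0 of each block carries the AES state bit and bit 1 a possible carry.

```cpp
#include <cassert>
#include <cstdint>

// Cleartext analogues of the AND / FLUSH / CARRY LUTs described above.
uint64_t lut_and(uint64_t a, uint64_t b) { return a & b; } // S-Box AND gate
uint64_t lut_flush(uint64_t x) { return x & 1; }           // keep the message bit only
uint64_t lut_carry(uint64_t x) { return (x >> 1) & 1; }    // extract the carry bit

int main() {
  assert(lut_flush(0b11) == 1); // clears the carry, keeps the bit
  assert(lut_carry(0b11) == 1); // isolates the carry
  assert(lut_and(1, 0) == 0);
  return 0;
}
```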
|
||||
/**
|
||||
* The operations within an AES round, particularly MixColumns, require
|
||||
* intermediate storage for calculations. These buffers are designed to hold
|
||||
* temporary values like copies of columns or the results of multiplications,
|
||||
* avoiding overwriting data that is still needed in the same round.
|
||||
*/
|
||||
template <typename Torus> struct int_aes_round_workspaces {
|
||||
CudaRadixCiphertextFFI *mix_columns_col_copy_buffer;
|
||||
CudaRadixCiphertextFFI *mix_columns_mul_workspace_buffer;
|
||||
CudaRadixCiphertextFFI *vec_tmp_bit_buffer;
|
||||
|
||||
int_aes_round_workspaces(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_aes_inputs,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
constexpr uint32_t BITS_PER_BYTE = 8;
|
||||
constexpr uint32_t BYTES_PER_COLUMN = 4;
|
||||
constexpr uint32_t BITS_PER_COLUMN = BITS_PER_BYTE * BYTES_PER_COLUMN;
|
||||
constexpr uint32_t MIX_COLUMNS_MUL_WORKSPACE_BYTES = BYTES_PER_COLUMN + 1;
|
||||
|
||||
this->mix_columns_col_copy_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->mix_columns_col_copy_buffer, BITS_PER_COLUMN * num_aes_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->mix_columns_mul_workspace_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->mix_columns_mul_workspace_buffer,
|
||||
MIX_COLUMNS_MUL_WORKSPACE_BYTES * BITS_PER_BYTE * num_aes_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->vec_tmp_bit_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->vec_tmp_bit_buffer,
|
||||
num_aes_inputs, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams, bool allocate_gpu_memory) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->mix_columns_col_copy_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->mix_columns_col_copy_buffer;
|
||||
this->mix_columns_col_copy_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->mix_columns_mul_workspace_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->mix_columns_mul_workspace_buffer;
|
||||
this->mix_columns_mul_workspace_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->vec_tmp_bit_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->vec_tmp_bit_buffer;
|
||||
this->vec_tmp_bit_buffer = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* In CTR mode, a counter is homomorphically added to the encrypted IV. This
|
||||
* structure holds the necessary buffers for this 128-bit ripple-carry
|
||||
* addition, such as the buffer for the propagating carry bit
|
||||
* (`vec_tmp_carry_buffer`) across the addition chain.
|
||||
*/
|
||||
template <typename Torus> struct int_aes_counter_workspaces {
|
||||
CudaRadixCiphertextFFI *vec_tmp_carry_buffer;
|
||||
CudaRadixCiphertextFFI *vec_tmp_sum_buffer;
|
||||
CudaRadixCiphertextFFI *vec_trivial_b_bits_buffer;
|
||||
Torus *h_counter_bits_buffer;
|
||||
Torus *d_counter_bits_buffer;
|
||||
|
||||
int_aes_counter_workspaces(CudaStreams streams,
|
||||
const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_aes_inputs,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
this->vec_tmp_carry_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->vec_tmp_carry_buffer,
|
||||
num_aes_inputs, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->vec_tmp_sum_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->vec_tmp_sum_buffer,
|
||||
num_aes_inputs, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->vec_trivial_b_bits_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->vec_trivial_b_bits_buffer, num_aes_inputs,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->h_counter_bits_buffer =
|
||||
(Torus *)malloc(num_aes_inputs * sizeof(Torus));
|
||||
size_tracker += num_aes_inputs * sizeof(Torus);
|
||||
this->d_counter_bits_buffer = (Torus *)cuda_malloc_with_size_tracking_async(
|
||||
num_aes_inputs * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
|
||||
size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams, bool allocate_gpu_memory) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->vec_tmp_carry_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->vec_tmp_carry_buffer;
|
||||
this->vec_tmp_carry_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->vec_tmp_sum_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->vec_tmp_sum_buffer;
|
||||
this->vec_tmp_sum_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->vec_trivial_b_bits_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->vec_trivial_b_bits_buffer;
|
||||
this->vec_trivial_b_bits_buffer = nullptr;
|
||||
|
||||
free(this->h_counter_bits_buffer);
|
||||
if (allocate_gpu_memory) {
|
||||
cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
streams.synchronize();
|
||||
}
|
||||
}
|
||||
};
|
||||
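The counter addition described above is a plain ripple-carry chain over 128 bit-level values. A cleartext analogue (a sketch of the arithmetic only, not of the homomorphic implementation) shows why a single propagating carry is all the extra state the chain needs:

```cpp
#include <array>
#include <cstdint>

// Cleartext ripple-carry addition over 128 bits, index 0 = least significant:
// sum = a ^ b ^ carry, next carry = majority(a, b, carry).
std::array<uint8_t, 128> ripple_add(const std::array<uint8_t, 128> &a,
                                    const std::array<uint8_t, 128> &b) {
  std::array<uint8_t, 128> out{};
  uint8_t carry = 0;
  for (int i = 0; i < 128; ++i) {
    out[i] = a[i] ^ b[i] ^ carry;
    carry = static_cast<uint8_t>((a[i] & b[i]) | (a[i] & carry) | (b[i] & carry));
  }
  return out;
}

int main() {
  std::array<uint8_t, 128> one{}, two{};
  one[0] = 1; two[1] = 1;        // 1 + 2
  auto s = ripple_add(one, two); // = 3: bits 0 and 1 set, no carry propagates
  return (s[0] == 1 && s[1] == 1) ? 0 : 1;
}
```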
|
||||
/**
|
||||
* This structure allocates the most significant memory blocks:
|
||||
* - `sbox_internal_workspace`: A large workspace for the complex, parallel
|
||||
* evaluation of the S-Box circuit.
|
||||
* - `main_bitsliced_states_buffer`: Holds the entire set of AES states in a
|
||||
* bitsliced layout, which is optimal for parallel bitwise operations on the
|
||||
* GPU.
|
||||
* - Other buffers are used for data layout transformations (transposition) and
|
||||
* for batching small operations into larger, more efficient launches.
|
||||
*/
|
||||
template <typename Torus> struct int_aes_main_workspaces {
|
||||
CudaRadixCiphertextFFI *sbox_internal_workspace;
|
||||
CudaRadixCiphertextFFI *initial_states_and_jit_key_workspace;
|
||||
CudaRadixCiphertextFFI *main_bitsliced_states_buffer;
|
||||
CudaRadixCiphertextFFI *tmp_tiled_key_buffer;
|
||||
CudaRadixCiphertextFFI *batch_processing_buffer;
|
||||
|
||||
int_aes_main_workspaces(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_aes_inputs,
|
||||
uint32_t sbox_parallelism, uint64_t &size_tracker) {
|
||||
|
||||
constexpr uint32_t AES_STATE_BITS = 128;
|
||||
constexpr uint32_t SBOX_MAX_AND_GATES = 18;
|
||||
constexpr uint32_t BATCH_BUFFER_OPERANDS = 3;
|
||||
|
||||
this->sbox_internal_workspace = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->sbox_internal_workspace,
|
||||
num_aes_inputs * AES_STATE_BITS * sbox_parallelism,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->initial_states_and_jit_key_workspace = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->initial_states_and_jit_key_workspace,
|
||||
num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->main_bitsliced_states_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0),
|
||||
this->main_bitsliced_states_buffer, num_aes_inputs * AES_STATE_BITS,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->tmp_tiled_key_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->tmp_tiled_key_buffer,
|
||||
num_aes_inputs * AES_STATE_BITS, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->batch_processing_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->batch_processing_buffer,
|
||||
num_aes_inputs * SBOX_MAX_AND_GATES * BATCH_BUFFER_OPERANDS *
|
||||
sbox_parallelism,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams, bool allocate_gpu_memory) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->sbox_internal_workspace,
|
||||
allocate_gpu_memory);
|
||||
delete this->sbox_internal_workspace;
|
||||
this->sbox_internal_workspace = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->initial_states_and_jit_key_workspace,
|
||||
allocate_gpu_memory);
|
||||
delete this->initial_states_and_jit_key_workspace;
|
||||
this->initial_states_and_jit_key_workspace = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->main_bitsliced_states_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->main_bitsliced_states_buffer;
|
||||
this->main_bitsliced_states_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->tmp_tiled_key_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->tmp_tiled_key_buffer;
|
||||
this->tmp_tiled_key_buffer = nullptr;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->batch_processing_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->batch_processing_buffer;
|
||||
this->batch_processing_buffer = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* This structure acts as a container, holding instances of all the other buffer
|
||||
* management structs. It provides a
|
||||
* single object to manage the entire lifecycle of memory needed for a complete
|
||||
* AES-CTR encryption operation.
|
||||
*/
|
||||
template <typename Torus> struct int_aes_encrypt_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
uint32_t num_aes_inputs;
|
||||
uint32_t sbox_parallel_instances;
|
||||
|
||||
int_aes_lut_buffers<Torus> *luts;
|
||||
int_aes_round_workspaces<Torus> *round_workspaces;
|
||||
int_aes_counter_workspaces<Torus> *counter_workspaces;
|
||||
int_aes_main_workspaces<Torus> *main_workspaces;
|
||||
|
||||
int_aes_encrypt_buffer(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint32_t num_aes_inputs,
|
||||
uint32_t sbox_parallelism, uint64_t &size_tracker) {
|
||||
|
||||
PANIC_IF_FALSE(num_aes_inputs >= 1,
|
||||
"num_aes_inputs should be greater or equal to 1");
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
this->num_aes_inputs = num_aes_inputs;
|
||||
this->sbox_parallel_instances = sbox_parallelism;
|
||||
|
||||
this->luts = new int_aes_lut_buffers<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
|
||||
size_tracker);
|
||||
|
||||
this->round_workspaces = new int_aes_round_workspaces<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
|
||||
|
||||
this->counter_workspaces = new int_aes_counter_workspaces<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
|
||||
|
||||
this->main_workspaces = new int_aes_main_workspaces<Torus>(
|
||||
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
|
||||
size_tracker);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
luts->release(streams);
|
||||
delete luts;
|
||||
luts = nullptr;
|
||||
|
||||
round_workspaces->release(streams, allocate_gpu_memory);
|
||||
delete round_workspaces;
|
||||
round_workspaces = nullptr;
|
||||
|
||||
counter_workspaces->release(streams, allocate_gpu_memory);
|
||||
delete counter_workspaces;
|
||||
counter_workspaces = nullptr;
|
||||
|
||||
main_workspaces->release(streams, allocate_gpu_memory);
|
||||
delete main_workspaces;
|
||||
main_workspaces = nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* This structure holds the buffer for the 44 words of the expanded key
|
||||
* and temporary storage for word manipulations.
|
||||
* It contains its own instance of `int_aes_encrypt_buffer` because the
|
||||
* key expansion algorithm itself requires using the S-Box.
|
||||
* This separation ensures that memory for key expansion can be allocated and
|
||||
* freed independently of the main encryption process.
|
||||
*/
|
||||
template <typename Torus> struct int_key_expansion_buffer {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
|
||||
CudaRadixCiphertextFFI *words_buffer;
|
||||
|
||||
CudaRadixCiphertextFFI *tmp_word_buffer;
|
||||
CudaRadixCiphertextFFI *tmp_rotated_word_buffer;
|
||||
|
||||
int_aes_encrypt_buffer<Torus> *aes_encrypt_buffer;
|
||||
|
||||
int_key_expansion_buffer(CudaStreams streams, const int_radix_params ¶ms,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
constexpr uint32_t TOTAL_WORDS = 44;
|
||||
constexpr uint32_t BITS_PER_WORD = 32;
|
||||
constexpr uint32_t TOTAL_BITS = TOTAL_WORDS * BITS_PER_WORD;
|
||||
|
||||
this->words_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->words_buffer, TOTAL_BITS,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
this->tmp_word_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->tmp_word_buffer,
|
||||
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->tmp_rotated_word_buffer = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), this->tmp_rotated_word_buffer,
|
||||
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
|
||||
allocate_gpu_memory);
|
||||
|
||||
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus>(
|
||||
streams, params, allocate_gpu_memory, 1, 4, size_tracker);
|
||||
}
|
||||
|
||||
void release(CudaStreams streams) {
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->words_buffer, allocate_gpu_memory);
|
||||
delete this->words_buffer;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->tmp_word_buffer, allocate_gpu_memory);
|
||||
delete this->tmp_word_buffer;
|
||||
|
||||
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
||||
this->tmp_rotated_word_buffer,
|
||||
allocate_gpu_memory);
|
||||
delete this->tmp_rotated_word_buffer;
|
||||
|
||||
this->aes_encrypt_buffer->release(streams);
|
||||
delete this->aes_encrypt_buffer;
|
||||
}
|
||||
};
|
||||
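For reference, the 44-word figure used for `words_buffer` above is the standard AES-128 key-schedule size; a small compile-time check makes the arithmetic explicit.

```cpp
#include <cstdint>

// Why words_buffer holds 44 words: the AES-128 key schedule expands the key
// into 4 * (Nr + 1) round-key words with Nr = 10 rounds.
constexpr uint32_t kNr = 10;
constexpr uint32_t kExpandedWords = 4 * (kNr + 1);
static_assert(kExpandedWords == 44, "AES-128 expanded key is 44 words");
static_assert(kExpandedWords * 32 == 1408, "matches TOTAL_WORDS * BITS_PER_WORD above");

int main() { return 0; }
```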
|
||||
#endif
|
||||
@@ -78,10 +78,9 @@ public:
|
||||
get_active_gpu_count(num_radix_blocks, _gpu_count));
|
||||
}
|
||||
|
||||
// Returns a subset containing only the first gpu of this set. It
|
||||
// is used to create subset of streams for mono-GPU functions
|
||||
CudaStreams subset_first_gpu() const {
|
||||
return CudaStreams(_streams, _gpu_indexes, 1);
|
||||
// Returns a CudaStreams struct containing only the ith stream
|
||||
CudaStreams get_ith(int i) const {
|
||||
return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
|
||||
}
|
||||
|
||||
// Synchronize all the streams in the set
|
||||
@@ -184,4 +183,93 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
struct CudaStreamsBarrier {
|
||||
private:
|
||||
std::vector<cudaEvent_t> _events;
|
||||
CudaStreams _streams;
|
||||
|
||||
CudaStreamsBarrier(const CudaStreamsBarrier &) {} // Prevent copy-construction
|
||||
CudaStreamsBarrier &operator=(const CudaStreamsBarrier &) {
|
||||
return *this;
|
||||
} // Prevent assignment
|
||||
public:
|
||||
void create_on(const CudaStreams &streams) {
|
||||
_streams = streams;
|
||||
|
||||
GPU_ASSERT(streams.count() > 1, "CudaStreamsFirstWaitsWorkersBarrier: "
|
||||
"Attempted to create on single GPU");
|
||||
_events.resize(streams.count());
|
||||
for (int i = 0; i < streams.count(); i++) {
|
||||
_events[i] = cuda_create_event(streams.gpu_index(i));
|
||||
}
|
||||
}
|
||||
|
||||
CudaStreamsBarrier(){};
|
||||
|
||||
void local_streams_wait_for_stream_0(const CudaStreams &user_streams) {
|
||||
GPU_ASSERT(!_events.empty(),
|
||||
"CudaStreamsBarrier: must call create_on before use");
|
||||
GPU_ASSERT(user_streams.gpu_index(0) == _streams.gpu_index(0),
|
||||
"CudaStreamsBarrier: synchronization can only be performed on "
|
||||
"the GPUs the barrier was initially created on.");
|
||||
|
||||
cuda_event_record(_events[0], user_streams.stream(0),
|
||||
user_streams.gpu_index(0));
|
||||
for (int j = 1; j < user_streams.count(); j++) {
|
||||
GPU_ASSERT(user_streams.gpu_index(j) == _streams.gpu_index(j),
|
||||
"CudaStreamsBarrier: synchronization can only be performed on "
|
||||
"the GPUs the barrier was initially created on.");
|
||||
cuda_stream_wait_event(user_streams.stream(j), _events[0],
|
||||
user_streams.gpu_index(j));
|
||||
}
|
||||
}
|
||||
|
||||
void stream_0_wait_for_local_streams(const CudaStreams &user_streams) {
|
||||
GPU_ASSERT(
|
||||
!_events.empty(),
|
||||
"CudaStreamsFirstWaitsWorkersBarrier: must call create_on before use");
|
||||
GPU_ASSERT(
|
||||
user_streams.count() <= _events.size(),
|
||||
"CudaStreamsFirstWaitsWorkersBarrier: trying to synchronize too many "
|
||||
"streams. "
|
||||
"The barrier was created on a LUT that had %lu active streams, while "
|
||||
"the user stream set has %u streams",
|
||||
_events.size(), user_streams.count());
|
||||
|
||||
if (user_streams.count() > 1) {
|
||||
// Worker GPUs record their events
|
||||
for (int j = 1; j < user_streams.count(); j++) {
|
||||
GPU_ASSERT(_streams.gpu_index(j) == user_streams.gpu_index(j),
|
||||
"CudaStreamsBarrier: The user stream "
|
||||
"set GPU[%d]=%u while the LUT stream set GPU[%d]=%u",
|
||||
j, user_streams.gpu_index(j), j, _streams.gpu_index(j));
|
||||
|
||||
cuda_event_record(_events[j], user_streams.stream(j),
|
||||
user_streams.gpu_index(j));
|
||||
}
|
||||
|
||||
// GPU 0 waits for all workers
|
||||
for (int j = 1; j < user_streams.count(); j++) {
|
||||
cuda_stream_wait_event(user_streams.stream(0), _events[j],
|
||||
user_streams.gpu_index(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release() {
|
||||
for (int j = 0; j < _streams.count(); j++) {
|
||||
cuda_event_destroy(_events[j], _streams.gpu_index(j));
|
||||
}
|
||||
|
||||
_events.clear();
|
||||
}
|
||||
|
||||
~CudaStreamsBarrier() {
|
||||
GPU_ASSERT(_events.empty(),
|
||||
"CudaStreamsBarrier: must "
|
||||
"call release before destruction: events size = %lu",
|
||||
_events.size());
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
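The barrier above expresses two patterns: worker streams wait for stream 0, and stream 0 waits for all workers. Below is a CPU-only analogue of `stream_0_wait_for_local_streams()` using threads; it is a sketch of the synchronization shape only, while the real class uses `cuda_event_record` / `cuda_stream_wait_event` on each GPU.

```cpp
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  const int workers = 3;
  int done = 0;
  std::mutex m;
  std::condition_variable cv;
  std::vector<std::thread> pool;
  for (int i = 1; i <= workers; ++i)
    pool.emplace_back([&] {
      // ... per-worker-GPU work would happen here ...
      std::lock_guard<std::mutex> lk(m);
      ++done; // analogue of recording the worker's event
      cv.notify_one();
    });
  {
    std::unique_lock<std::mutex> lk(m);
    cv.wait(lk, [&] { return done == workers; }); // "stream 0" waits for workers
  }
  for (auto &t : pool) t.join();
  std::printf("all workers reached the barrier\n");
}
```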
@@ -4,26 +4,6 @@
|
||||
#include "../../pbs/pbs_enums.h"
|
||||
#include "../integer.h"
|
||||
|
||||
typedef struct {
|
||||
void *ptr;
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t lwe_dimension;
|
||||
} CudaLweCiphertextListFFI;
|
||||
|
||||
typedef struct {
|
||||
void *ptr;
|
||||
uint32_t storage_log_modulus;
|
||||
uint32_t lwe_per_glwe;
|
||||
// Input LWEs are grouped by groups of `lwe_per_glwe`(the last group may be
|
||||
// smaller)
|
||||
// Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one for
|
||||
// each LWE of the group). In the end the total number of bodies is equal to
|
||||
// the number of input LWE
|
||||
uint32_t total_lwe_bodies_count;
|
||||
uint32_t glwe_dimension;
|
||||
uint32_t polynomial_size;
|
||||
} CudaPackedGlweCiphertextListFFI;
|
||||
|
||||
extern "C" {
|
||||
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr,
|
||||
|
||||
@@ -86,6 +86,26 @@ typedef struct {
|
||||
bool const divisor_has_more_bits_than_numerator;
|
||||
} CudaScalarDivisorFFI;
|
||||
|
||||
typedef struct {
|
||||
void *ptr;
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t lwe_dimension;
|
||||
} CudaLweCiphertextListFFI;
|
||||
|
||||
typedef struct {
|
||||
void *ptr;
|
||||
uint32_t storage_log_modulus;
|
||||
uint32_t lwe_per_glwe;
|
||||
// Input LWEs are grouped by groups of `lwe_per_glwe`(the last group may be
|
||||
// smaller)
|
||||
// Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one for
|
||||
// each LWE of the group). In the end the total number of bodies is equal to
|
||||
// the number of input LWE
|
||||
uint32_t total_lwe_bodies_count;
|
||||
uint32_t glwe_dimension;
|
||||
uint32_t polynomial_size;
|
||||
} CudaPackedGlweCiphertextListFFI;
|
||||
|
||||
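The packing comment above fixes the bookkeeping completely: the number of GLWEs is the ceiling of the LWE count divided by `lwe_per_glwe`, and the total body count is conserved. A small sketch with hypothetical sizes:

```cpp
#include <cstdint>
#include <cstdio>

// Inputs are split into groups of lwe_per_glwe (the last group may be
// smaller); each group becomes one packed GLWE, and the number of bodies
// stays equal to the number of input LWEs. Sizes below are hypothetical.
int main() {
  uint32_t total_lwe_bodies_count = 1000; // number of input LWE ciphertexts
  uint32_t lwe_per_glwe = 256;            // bodies packed into each GLWE
  uint32_t num_glwes =
      (total_lwe_bodies_count + lwe_per_glwe - 1) / lwe_per_glwe; // ceiling
  uint32_t last_group =
      total_lwe_bodies_count - (num_glwes - 1) * lwe_per_glwe;
  std::printf("%u GLWEs; the last one packs %u bodies\n", num_glwes, last_group);
  return 0;
}
```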
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
CudaStreamsFFI streams, int8_t **mem_ptr, void const *input_lut,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -105,9 +125,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void *const *ksks, void *const *bsks);

void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -125,9 +143,8 @@ void cuda_apply_bivariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift);
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift);

void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -135,9 +152,8 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
void cuda_apply_many_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_luts, uint32_t lut_stride);
void *const *ksks, void *const *bsks, uint32_t num_luts,
uint32_t lut_stride);

uint64_t scratch_cuda_full_propagation_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t lwe_dimension,
@@ -147,11 +163,10 @@ uint64_t scratch_cuda_full_propagation_64(
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_full_propagation_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks);
void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);

void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -169,9 +184,8 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks);
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
uint32_t polynomial_size, uint32_t num_blocks);

void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void);

@@ -196,8 +210,7 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(

void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);

uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -210,8 +223,7 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(

void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -231,8 +243,7 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *ksks);

void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -251,16 +262,13 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_scalar_blocks);
void *const *ksks, uint32_t num_scalar_blocks);

void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -278,15 +286,13 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void);

@@ -304,8 +310,7 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_cmux(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -321,8 +326,7 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
int8_t *mem_ptr, void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -333,8 +337,8 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -342,23 +346,20 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);

void cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);

void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -380,9 +381,8 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow);
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow);

void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -400,8 +400,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *ksks);

void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -418,7 +417,6 @@ void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars);

void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
@@ -437,8 +435,7 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -455,9 +452,7 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks);
void *const *ksks, void *const *bsks, uint32_t num_blocks);

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -476,8 +471,7 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
bool is_signed, void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -494,9 +488,7 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void cuda_integer_are_all_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks);
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_are_all_comparisons_block_true(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -513,9 +505,7 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks);
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -541,9 +531,7 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks);
void *const *ksks, void *const *bsks);

void cleanup_cuda_apply_noise_squashing_kb(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -561,9 +549,7 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry);
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry);

void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -580,7 +566,6 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
void cuda_integer_unsigned_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi);

void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(
@@ -595,11 +580,12 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr,
uint32_t num_additional_blocks,
void *const *bsks, void *const *ksks);

void cleanup_cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -616,7 +602,6 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits);

void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,
@@ -635,9 +620,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,
@@ -659,9 +642,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits);
@@ -681,8 +662,7 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
void cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
void *const *ksks);

void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
@@ -692,16 +672,15 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_blocks_to_process,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
uint32_t message_bits_per_block, uint32_t total_random_bits,
PBS_MS_REDUCTION_T noise_reduction_type);
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, uint32_t message_bits_per_block,
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_integer_grouped_oprf_async_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
void *const *bsks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const void *seeded_lwe_input,
uint32_t num_blocks_to_process,
int8_t *mem, void *const *bsks);

void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
@@ -721,8 +700,7 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key);
void *const *bsks, void *const *ksks);

void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
File diff suppressed because it is too large
@@ -3,16 +3,6 @@
#include <stdint.h>
enum PBS_TYPE { MULTI_BIT = 0, CLASSICAL = 1 };
enum PBS_VARIANT { DEFAULT = 0, CG = 1, TBC = 2 };
enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, DRIFT = 1, CENTERED = 2 };

extern "C" {
typedef struct {
void *const *ptr;
uint32_t num_zeros;
double ms_bound;
double ms_r_sigma;
double ms_input_variance;
} CudaModulusSwitchNoiseReductionKeyFFI;
}
enum PBS_MS_REDUCTION_T { NO_REDUCTION = 0, CENTERED = 1 };

#endif // CUDA_PBS_ENUMS_H
@@ -80,7 +80,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {

Torus *global_accumulator;
double2 *global_join_buffer;
Torus *temp_lwe_array_in;

PBS_VARIANT pbs_variant;
PBS_MS_REDUCTION_T noise_reduction_type;
@@ -97,10 +96,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
this->pbs_variant = pbs_variant;

auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
this->temp_lwe_array_in = (Torus *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(Torus),
stream, gpu_index, size_tracker,
noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT);
switch (pbs_variant) {
case PBS_VARIANT::DEFAULT: {
uint64_t full_sm_step_one =
@@ -234,10 +229,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
if (pbs_variant == DEFAULT)
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);

if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
}
};

@@ -249,8 +240,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {

__uint128_t *global_accumulator;
double *global_join_buffer;
InputTorus *temp_lwe_array_in;
uint64_t *trivial_indexes;

PBS_VARIANT pbs_variant;
PBS_MS_REDUCTION_T noise_reduction_type;
@@ -268,27 +257,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
cuda_set_device(gpu_index);
this->pbs_variant = pbs_variant;

if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
this->temp_lwe_array_in =
(InputTorus *)cuda_malloc_with_size_tracking_async(
(lwe_dimension + 1) * input_lwe_ciphertext_count *
sizeof(InputTorus),
stream, gpu_index, size_tracker, allocate_gpu_memory);
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
size_tracker, allocate_gpu_memory);
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
h_trivial_indexes[i] = i;

cuda_memcpy_with_size_tracking_async_to_gpu(
trivial_indexes, h_trivial_indexes,
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
allocate_gpu_memory);

cuda_synchronize_stream(stream, gpu_index);
delete[] h_trivial_indexes;
}
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
@@ -424,13 +392,6 @@ struct pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> {
if (pbs_variant == DEFAULT)
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
gpu_memory_allocated);

if (noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
gpu_memory_allocated);
}
}
};
@@ -89,18 +89,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);

void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lut_vector, void const *lwe_array_in,
void const *bootstrapping_key,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
void const *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples);

@@ -22,8 +22,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
void *const *bsks, void *const *computing_ksks, void *const *casting_keys);

void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);
88
backends/tfhe-cuda-backend/cuda/src/aes/aes.cu
Normal file
@@ -0,0 +1,88 @@
#include "../../include/aes/aes.h"
#include "aes.cuh"

uint64_t scratch_cuda_integer_aes_encrypt_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_aes_inputs,
uint32_t sbox_parallelism) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);

return scratch_cuda_integer_aes_encrypt<uint64_t>(
CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism);
}

void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv,
CudaRadixCiphertextFFI const *round_keys,
const uint64_t *counter_bits_le_all_blocks,
uint32_t num_aes_inputs, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {

host_integer_aes_ctr_encrypt<uint64_t>(
CudaStreams(streams), output, iv, round_keys, counter_bits_le_all_blocks,
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t> *)mem_ptr, bsks,
(uint32_t **)ksks);
}

void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {

int_aes_encrypt_buffer<uint64_t> *mem_ptr =
(int_aes_encrypt_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(CudaStreams(streams));

delete mem_ptr;
*mem_ptr_void = nullptr;
}

uint64_t scratch_cuda_integer_key_expansion_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);

return scratch_cuda_integer_key_expansion<uint64_t>(
CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory);
}

void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {

host_integer_key_expansion<uint64_t>(
CudaStreams(streams), expanded_keys, key,
(int_key_expansion_buffer<uint64_t> *)mem_ptr, bsks, (uint32_t **)ksks);
}

void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_key_expansion_buffer<uint64_t> *mem_ptr =
(int_key_expansion_buffer<uint64_t> *)(*mem_ptr_void);

mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
}
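These entry points follow the scratch -> run -> cleanup pattern used throughout the backend. A minimal call-order sketch follows; it is an illustration only, not part of the diff, the wrapper function name and all handles and parameter values are placeholders assumed to be prepared by the caller, and only the FFI functions declared above are used.

// Sketch only: intended call order for the AES CTR FFI entry points above.
void aes_ctr_encrypt_example(
    CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
    const uint64_t *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
    void *const *bsks, void *const *ksks, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
    uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
    uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, PBS_MS_REDUCTION_T noise_reduction_type,
    uint32_t sbox_parallelism) {
  // 1. Allocate the scratch buffer (returned size ignored in this sketch).
  int8_t *aes_mem = nullptr;
  scratch_cuda_integer_aes_encrypt_64(
      streams, &aes_mem, glwe_dimension, polynomial_size, lwe_dimension,
      ks_level, ks_base_log, pbs_level, pbs_base_log, grouping_factor,
      message_modulus, carry_modulus, pbs_type, /*allocate_gpu_memory=*/true,
      noise_reduction_type, num_aes_inputs, sbox_parallelism);
  // 2. Run the CTR-mode encryption on the prepared IV and round keys.
  cuda_integer_aes_ctr_encrypt_64(streams, output, iv, round_keys,
                                  counter_bits_le_all_blocks, num_aes_inputs,
                                  aes_mem, bsks, ksks);
  // 3. Release the scratch buffer.
  cleanup_cuda_integer_aes_encrypt_64(streams, &aes_mem);
}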
1254
backends/tfhe-cuda-backend/cuda/src/aes/aes.cuh
Normal file
File diff suppressed because it is too large
@@ -103,23 +103,6 @@ void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
lwe_dimension, log_modulus);
}

// This end point is used only for testing purposes
// its output always follows trivial ordering
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus) {
host_drift_modulus_switch<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t const *>(lwe_array_in),
static_cast<uint64_t const *>(lwe_array_indexes),
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
num_zeros, input_variance, r_sigma, bound, log_modulus);
}

void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
@@ -36,7 +36,7 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
*
*/
// Each thread in x are used to calculate one output.
// threads in y are used to paralelize the lwe_dimension_in loop.
// threads in y are used to parallelize the lwe_dimension_in loop.
// shared memory is used to store intermediate results of the reduction.
// Note: To reduce register pressure we have slightly changed the algorithm,
// the idea consists in calculating the negate value of the output. So, instead
@@ -48,12 +48,12 @@ __device__ Torus *get_ith_block(Torus *ksk, int i, int level,
// in two parts, a constant part is calculated before the loop, and a variable
// part is calculated inside the loop. This seems to help with the register
// pressure as well.
template <typename Torus>
template <typename Torus, typename KSTorus>
__global__ void
keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const Torus *__restrict__ ksk, uint32_t lwe_dimension_in,
const KSTorus *__restrict__ ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) {
const int tid = threadIdx.x + blockIdx.y * blockDim.x;
const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x;
@@ -107,11 +107,11 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
}
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_keyswitch_lwe_ciphertext_vector(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *ksk, uint32_t lwe_dimension_in,
Torus const *lwe_input_indexes, KSTorus const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {

@@ -135,19 +135,19 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
dim3 grid(num_samples, num_blocks_per_sample, 1);
dim3 threads(num_threads_x, num_threads_y, 1);

keyswitch<Torus><<<grid, threads, shared_mem, stream>>>(
keyswitch<Torus, KSTorus><<<grid, threads, shared_mem, stream>>>(
lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk,
lwe_dimension_in, lwe_dimension_out, base_log, level_count);
check_cuda_error(cudaGetLastError());
}

template <typename Torus>
template <typename Torus, typename KSTorus>
void execute_keyswitch_async(CudaStreams streams,
const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
Torus *const *ksks, uint32_t lwe_dimension_in,
KSTorus *const *ksks, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {

@@ -426,31 +426,4 @@ __global__ void __launch_bounds__(512)
}
}

template <typename Torus>
__host__ void host_drift_modulus_switch(
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
const double input_variance, const double r_sigma, const double bound,
uint32_t log_modulus) {

PANIC_IF_FALSE(lwe_size >= 512,
"The lwe_size (%d) is less than 512, this is not supported\n",
lwe_size);
PANIC_IF_FALSE(
lwe_size <= 1024,
"The lwe_size (%d) is greater than 1024, this is not supported\n",
lwe_size);

cuda_set_device(gpu_index);

// This reduction requires a power of two num of threads
int num_threads = 512, num_blocks = num_lwes;

improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
r_sigma, bound, log_modulus);
check_cuda_error(cudaGetLastError());
}

#endif // CNCRT_TORUS_H
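The new KSTorus template parameter decouples the word size of the key-switching key from the word size of the ciphertext data. Judging from the (uint32_t **) casts applied to ksks elsewhere in this comparison, a 64-bit data torus paired with a 32-bit keyswitch key appears to be the intended instantiation; a hypothetical call is sketched below (illustration only, not part of the diff, all variable names are placeholders).

// Sketch only: 64-bit ciphertext torus, 32-bit keyswitch key.
host_keyswitch_lwe_ciphertext_vector<uint64_t, uint32_t>(
    stream, gpu_index, lwe_array_out, lwe_output_indexes, lwe_array_in,
    lwe_input_indexes, ksk_32, lwe_dimension_in, lwe_dimension_out, base_log,
    level_count, num_samples);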
@@ -266,6 +266,11 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
uint32_t gpu_index, bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
GPU_ASSERT(dest != nullptr,
"Cuda error: trying to copy gpu->gpu to null ptr");
GPU_ASSERT(src != nullptr,
"Cuda error: trying to copy gpu->gpu from null ptr");

cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
PANIC_IF_FALSE(
@@ -21,14 +21,12 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
bool is_signed, void *const *bsks, void *const *ksks) {

auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

host_integer_abs_kb<uint64_t>(CudaStreams(streams), ct, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key,
mem, is_signed);
(uint32_t **)(ksks), mem, is_signed);
}

void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,

@@ -30,11 +30,10 @@ __host__ uint64_t scratch_cuda_integer_abs_kb(
}

template <typename Torus>
__host__ void host_integer_abs_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
__host__ void
host_integer_abs_kb(CudaStreams streams, CudaRadixCiphertextFFI *ct,
void *const *bsks, uint32_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
if (!is_signed)
return;

@@ -49,19 +48,19 @@ __host__ void host_integer_abs_kb(

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), ct, mask, ct,
ct->num_radix_blocks, mem_ptr->params.message_modulus,
mem_ptr->params.carry_modulus);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_propagate_single_carry<Torus>(streams, ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks,
requested_flag, uses_carry);

host_integer_radix_bitop_kb<Torus>(streams, ct, mask, ct, mem_ptr->bitxor_mem,
bsks, ksks, ms_noise_reduction_key);
bsks, ksks);
}

#endif // TFHE_RS_ABS_CUH
@@ -23,13 +23,11 @@ void cuda_bitop_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {

host_integer_radix_bitop_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key);
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks));
}

void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

@@ -11,13 +11,12 @@
#include "utils/kernel_dimensions.cuh"
#include <omp.h>

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_bitop_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, KSTorus *const *ksks) {

PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -45,9 +44,8 @@ __host__ void host_integer_radix_bitop_kb(
}

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
lut->params.message_modulus);
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks, lut,
lwe_array_out->num_radix_blocks, lut->params.message_modulus);

memcpy(lwe_array_out->degrees, degrees,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));

@@ -35,16 +35,17 @@ uint64_t scratch_cuda_extend_radix_with_sign_msb_64(
num_blocks, num_additional_blocks, allocate_gpu_memory);
}

void cuda_extend_radix_with_sign_msb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, int8_t *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void cuda_extend_radix_with_sign_msb_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int8_t *mem_ptr,
uint32_t num_additional_blocks,
void *const *bsks, void *const *ksks) {
PUSH_RANGE("cast")
host_extend_radix_with_sign_msb<uint64_t>(
CudaStreams(streams), output, input,
(int_extend_radix_with_sign_msb_buffer<uint64_t> *)mem_ptr,
num_additional_blocks, bsks, (uint64_t **)ksks, ms_noise_reduction_key);
num_additional_blocks, bsks, (uint32_t **)ksks);
POP_RANGE()
}

@@ -50,13 +50,12 @@ __host__ uint64_t scratch_extend_radix_with_sign_msb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_extend_radix_with_sign_msb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
uint32_t num_additional_blocks, void *const *bsks, KSTorus *const *ksks) {

if (num_additional_blocks == 0) {
PUSH_RANGE("cast/extend no addblocks")
@@ -79,8 +78,7 @@ __host__ void host_extend_radix_with_sign_msb(
input_blocks - 1, input_blocks);

host_apply_univariate_lut_kb(streams, mem_ptr->padding_block,
mem_ptr->last_block, mem_ptr->lut, ksks,
ms_noise_reduction_key, bsks);
mem_ptr->last_block, mem_ptr->lut, ksks, bsks);

for (uint32_t i = 0; i < num_additional_blocks; ++i) {
uint32_t dst_block_idx = input_blocks + i;

@@ -29,13 +29,12 @@ void cuda_cmux_integer_radix_ciphertext_kb_64(
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("cmux")
host_integer_radix_cmux_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_condition, lwe_array_true,
lwe_array_false, (int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint32_t **)(ksks));
POP_RANGE()
}
@@ -4,16 +4,14 @@
#include "integer.cuh"
#include "radix_ciphertext.cuh"

template <typename Torus>
__host__ void
zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
template <typename Torus, typename KSTorus>
__host__ void zero_out_if(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks >= num_radix_blocks &&
lwe_array_input->num_radix_blocks >= num_radix_blocks,
@@ -38,18 +36,17 @@ zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks,
ms_noise_reduction_key, predicate, num_radix_blocks);
streams, lwe_array_out, tmp_lwe_array_input, bsks, ksks, predicate,
num_radix_blocks);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_cmux_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks, KSTorus *const *ksks) {

if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -73,8 +70,8 @@ __host__ void host_integer_radix_cmux_kb(
}
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, mem_ptr->buffer_out, mem_ptr->buffer_in,
mem_ptr->condition_array, bsks, ksks, ms_noise_reduction_key,
mem_ptr->predicate_lut, 2 * num_radix_blocks, params.message_modulus);
mem_ptr->condition_array, bsks, ksks, mem_ptr->predicate_lut,
2 * num_radix_blocks, params.message_modulus);

// If the condition was true, true_ct will have kept its value and false_ct
// will be 0 If the condition was false, true_ct will be 0 and false_ct will
@@ -91,7 +88,7 @@ __host__ void host_integer_radix_cmux_kb(
params.message_modulus, params.carry_modulus);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, &mem_true, bsks, ksks, ms_noise_reduction_key,
streams, lwe_array_out, &mem_true, bsks, ksks,
mem_ptr->message_extract_lut, num_radix_blocks);
}
@@ -41,8 +41,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("comparison")
if (lwe_array_1->num_radix_blocks != lwe_array_2->num_radix_blocks)
PANIC("Cuda error: input num radix blocks must be the same")
@@ -57,7 +56,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
case NE:
host_integer_radix_equality_check_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
bsks, (uint32_t **)(ksks), num_radix_blocks);
break;
case GT:
case GE:
@@ -68,8 +67,8 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
"even.")
host_integer_radix_difference_check_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
buffer->diff_buffer->operator_f, bsks, (uint32_t **)(ksks),
num_radix_blocks);
break;
case MAX:
case MIN:
@@ -77,7 +76,7 @@ void cuda_comparison_integer_radix_ciphertext_kb_64(
PANIC("Cuda error (max/min): the number of radix blocks has to be even.")
host_integer_radix_maxmin_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2, buffer,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
bsks, (uint32_t **)(ksks), num_radix_blocks);
break;
default:
PANIC("Cuda error: integer operation not supported")
@@ -118,16 +117,14 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void cuda_integer_are_all_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;

host_integer_are_all_comparisons_block_true_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
(uint32_t **)(ksks), num_radix_blocks);
}

void cleanup_cuda_integer_are_all_comparisons_block_true(
@@ -162,16 +159,14 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;

host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks);
(uint32_t **)(ksks), num_radix_blocks);
}

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
@@ -56,14 +56,12 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
* blocks are 1 otherwise the block encrypts 0
*
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -158,8 +156,7 @@ __host__ void are_all_comparisons_block_true(
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, accumulator, bsks, ksks,
ms_noise_reduction_key, lut, 1);
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
@@ -176,8 +173,7 @@ __host__ void are_all_comparisons_block_true(
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, tmp_out, accumulator, bsks, ksks, ms_noise_reduction_key,
lut, num_chunks);
streams, tmp_out, accumulator, bsks, ksks, lut, num_chunks);
}
}
}
@@ -188,14 +184,12 @@ __host__ void are_all_comparisons_block_true(
* It writes in lwe_array_out a single lwe ciphertext encrypting 1 if at least
* one input ciphertext encrypts 1 otherwise encrypts 0
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -249,24 +243,23 @@ __host__ void is_at_least_one_comparisons_block_true(
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, buffer->tmp_block_accumulated, bsks, ksks,
ms_noise_reduction_key, lut, 1);
lut, 1);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, mem_ptr->tmp_lwe_array_out, buffer->tmp_block_accumulated,
bsks, ksks, ms_noise_reduction_key, lut, num_chunks);
bsks, ksks, lut, num_chunks);
}
}
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_compare_blocks_with_zero(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int32_t num_radix_blocks, int_radix_lut<Torus> *zero_comparison) {
KSTorus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {

if (num_radix_blocks == 0)
return;
@@ -322,21 +315,18 @@ __host__ void host_compare_blocks_with_zero(
}

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, sum, bsks, ksks, ms_noise_reduction_key,
zero_comparison, num_sum_blocks);
streams, lwe_array_out, sum, bsks, ksks, zero_comparison, num_sum_blocks);

reset_radix_ciphertext_blocks(lwe_array_out, num_sum_blocks);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_equality_check_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -347,27 +337,24 @@ __host__ void host_integer_radix_equality_check_kb(
auto comparisons = mem_ptr->tmp_block_comparisons;
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, comparisons, lwe_array_1, lwe_array_2, bsks, ksks,
ms_noise_reduction_key, eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut, num_radix_blocks,
eq_buffer->operator_lut->params.message_modulus);

// This takes a Vec of blocks, where each block is either 0 or 1.
//
// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, lwe_array_out, comparisons, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, lwe_array_out, comparisons,
mem_ptr, bsks, ksks, num_radix_blocks);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void compare_radix_blocks_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -400,8 +387,8 @@ __host__ void compare_radix_blocks_kb(
// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
is_non_zero_lut, num_radix_blocks);
streams, lwe_array_out, lwe_array_out, bsks, ksks, is_non_zero_lut,
num_radix_blocks);

// Add one
// Here Lhs can have the following values: (-1) % (message modulus * carry
@@ -413,15 +400,14 @@ __host__ void compare_radix_blocks_kb(
// Reduces a vec containing shortint blocks that encrypts a sign
// (inferior, equal, superior) to one single shortint block containing the
// final sign
template <typename Torus>
__host__ void tree_sign_reduction(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
template <typename Torus, typename KSTorus>
__host__ void
tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_block_comparisons->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -430,7 +416,6 @@ __host__ void tree_sign_reduction(
"than the number of blocks to operate on")

auto params = tree_buffer->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
@@ -455,8 +440,7 @@ __host__ void tree_sign_reduction(
partial_block_count, message_modulus);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, x, y, bsks, ksks, ms_noise_reduction_key, inner_tree_leaf,
partial_block_count >> 1);
streams, x, y, bsks, ksks, inner_tree_leaf, partial_block_count >> 1);

if ((partial_block_count % 2) != 0) {
partial_block_count >>= 1;
@@ -502,20 +486,17 @@ __host__ void tree_sign_reduction(

// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array_out, y, bsks, ksks, ms_noise_reduction_key, last_lut,
1);
streams, lwe_array_out, y, bsks, ksks, last_lut, 1);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_difference_check_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -555,7 +536,7 @@ __host__ void host_integer_radix_difference_check_kb(
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, diff_buffer->tmp_packed, diff_buffer->tmp_packed, bsks, ksks,
ms_noise_reduction_key, identity_lut, 2 * packed_num_radix_blocks);
identity_lut, 2 * packed_num_radix_blocks);
} else {
as_radix_ciphertext_slice<Torus>(&lhs, lwe_array_left, 0,
lwe_array_left->num_radix_blocks);
@@ -573,16 +554,14 @@ __host__ void host_integer_radix_difference_check_kb(
// Compare packed blocks, or simply the total number of radix blocks in the
// inputs
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
bsks, ksks, packed_num_radix_blocks);
num_comparisons = packed_num_radix_blocks;
} else {
// Packing is possible
if (carry_modulus >= message_modulus) {
// Compare (num_radix_blocks - 2) / 2 packed blocks
compare_radix_blocks_kb<Torus>(streams, comparisons, &lhs, &rhs, mem_ptr,
bsks, ksks, ms_noise_reduction_key,
packed_num_radix_blocks);
bsks, ksks, packed_num_radix_blocks);

// Compare the last block before the sign block separately
auto identity_lut = mem_ptr->identity_lut;
@@ -596,7 +575,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_left_block_before_sign_block, &shifted_lwe_array_left,
bsks, ksks, ms_noise_reduction_key, identity_lut, 1);
bsks, ksks, identity_lut, 1);

CudaRadixCiphertextFFI last_right_block_before_sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -609,8 +588,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_right_block_before_sign_block,
&shifted_lwe_array_right, bsks, ksks, ms_noise_reduction_key,
identity_lut, 1);
&shifted_lwe_array_right, bsks, ksks, identity_lut, 1);

CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -618,8 +596,7 @@ __host__ void host_integer_radix_difference_check_kb(
packed_num_radix_blocks + 1);
compare_radix_blocks_kb<Torus>(
streams, &shifted_comparisons, &last_left_block_before_sign_block,
&last_right_block_before_sign_block, mem_ptr, bsks, ksks,
ms_noise_reduction_key, 1);
&last_right_block_before_sign_block, mem_ptr, bsks, ksks, 1);

// Compare the sign block separately
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -633,14 +610,14 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = packed_num_radix_blocks + 2;

} else {
compare_radix_blocks_kb<Torus>(
streams, comparisons, lwe_array_left, lwe_array_right, mem_ptr, bsks,
ksks, ms_noise_reduction_key, num_radix_blocks - 1);
compare_radix_blocks_kb<Torus>(streams, comparisons, lwe_array_left,
lwe_array_right, mem_ptr, bsks, ksks,
num_radix_blocks - 1);
// Compare the sign block separately
CudaRadixCiphertextFFI shifted_comparisons;
as_radix_ciphertext_slice<Torus>(&shifted_comparisons, comparisons,
@@ -653,7 +630,7 @@ __host__ void host_integer_radix_difference_check_kb(
num_radix_blocks - 1, num_radix_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &shifted_comparisons, &last_left_block, &last_right_block,
bsks, ksks, ms_noise_reduction_key, mem_ptr->signed_lut, 1,
bsks, ksks, mem_ptr->signed_lut, 1,
mem_ptr->signed_lut->params.message_modulus);
num_comparisons = num_radix_blocks;
}
@@ -662,9 +639,9 @@ __host__ void host_integer_radix_difference_check_kb(
// Reduces a vec containing radix blocks that encrypts a sign
// (inferior, equal, superior) to one single radix block containing the
// final sign
tree_sign_reduction<Torus>(
streams, lwe_array_out, comparisons, mem_ptr->diff_buffer->tree_buffer,
reduction_lut_f, bsks, ksks, ms_noise_reduction_key, num_comparisons);
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
mem_ptr->diff_buffer->tree_buffer, reduction_lut_f,
bsks, ksks, num_comparisons);
}

template <typename Torus>
@@ -680,15 +657,13 @@ __host__ uint64_t scratch_cuda_integer_radix_comparison_check_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_maxmin_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -702,45 +677,38 @@ __host__ void host_integer_radix_maxmin_kb(
// Compute the sign
host_integer_radix_difference_check_kb<Torus>(
streams, mem_ptr->tmp_lwe_array_out, lwe_array_left, lwe_array_right,
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
num_radix_blocks);
mem_ptr, mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks);

// Selector
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
are_all_comparisons_block_true<Torus>(streams, lwe_array_out, lwe_array_in,
mem_ptr, bsks, ksks, num_radix_blocks);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_radix_blocks) {
KSTorus *const *ksks, uint32_t num_radix_blocks) {

// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
is_at_least_one_comparisons_block_true<Torus>(
streams, lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
ms_noise_reduction_key, num_radix_blocks);
is_at_least_one_comparisons_block_true<Torus>(streams, lwe_array_out,
lwe_array_in, mem_ptr, bsks,
ksks, num_radix_blocks);
}
#endif

@@ -344,7 +344,7 @@ host_integer_decompress(CudaStreams streams,
execute_pbs_async<Torus, Torus>(
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,
lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
lut->lwe_indexes_in, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -359,13 +359,9 @@ host_integer_decompress(CudaStreams streams,
std::vector<Torus *> lwe_trivial_indexes_vec =
lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams.stream(0),
streams.gpu_index(0));
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
streams.gpu_index(j));
}
lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
active_streams);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
@@ -378,7 +374,7 @@ host_integer_decompress(CudaStreams streams,
execute_pbs_async<Torus, Torus>(
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
lwe_trivial_indexes_vec, d_bsks, nullptr, lut->buffer,
lwe_trivial_indexes_vec, d_bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
@@ -395,15 +391,8 @@ host_integer_decompress(CudaStreams streams,

/// Synchronize all GPUs
// other gpus record their events
for (int j = 1; j < active_streams.count(); j++) {
cuda_event_record(lut->event_scatter_out[j], active_streams.stream(j),
active_streams.gpu_index(j));
}
// GPU 0 waits for all
for (int j = 1; j < active_streams.count(); j++) {
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
streams.gpu_index(0));
}
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
active_streams);
}
} else {
static_assert(std::is_same_v<Torus, __uint128_t>,

@@ -24,14 +24,13 @@ void cuda_integer_div_rem_radix_ciphertext_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {
PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;

host_integer_div_rem_kb<uint64_t>(
CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
bsks, (uint64_t **)(ksks), ms_noise_reduction_key, mem);
host_integer_div_rem_kb<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint32_t **)(ksks), mem);
POP_RANGE()
}


@@ -4,6 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/abs.cuh"
#include "integer/cast.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
@@ -31,14 +32,455 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
return size_tracker;
}

template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint32_t *const *ksks, unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {

if (streams.count() < 4) {
PANIC("GPU count should be greater than 4 when using div_rem_2_2");
}
if (mem_ptr->params.message_modulus != 4 ||
mem_ptr->params.carry_modulus != 4) {
PANIC("Only message_modulus == 4 && carry_modulus == 4 parameters are "
"supported");
}

// alias
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;
auto remainder_gpu_0 = remainder;
auto remainder_gpu_1 = mem_ptr->remainder_gpu_1;
auto remainder_gpu_2 = mem_ptr->remainder_gpu_2;
auto remainder_gpu_3 = mem_ptr->remainder_gpu_3;
auto divisor_gpu_0 = divisor;
auto divisor_gpu_1 = mem_ptr->divisor_gpu_1;
auto divisor_gpu_2 = mem_ptr->divisor_gpu_2;

// gpu[0] -> gpu[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_gpu_0, numerator);

// gpu[0] -> gpu[1]
copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
remainder_gpu_1, numerator);
// gpu[0] -> gpu[1]
copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
divisor_gpu_1, divisor);
// gpu[0] -> gpu[2]
copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
remainder_gpu_2, numerator);
// gpu[0] -> gpu[3]
copy_radix_ciphertext_async<Torus>(streams.stream(3), streams.gpu_index(3),
remainder_gpu_3, numerator);
// gpu[0] -> gpu[2]
copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
divisor_gpu_2, divisor);

// gpu[0]
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), quotient, 0, num_blocks);
quotient->num_radix_blocks = 0;

// Copy divisor_gpu_2 into d1 gpu[2] -> gpu[2]
mem_ptr->d1->num_radix_blocks = divisor_gpu_2->num_radix_blocks;
copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
mem_ptr->d1, divisor_gpu_2);

// Computes 2*d by extending and shifting on gpu[1]
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
mem_ptr->d2, divisor_gpu_1, streams.get_ith(1));
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams.get_ith(1), mem_ptr->d2, 1, mem_ptr->shift_mem, &bsks[1],
&ksks[1], mem_ptr->d2->num_radix_blocks);

// Computes 3*d = 4*d - d using block shift and subtraction on gpu[0]
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(
mem_ptr->tmp_gpu_0, divisor_gpu_0, streams.get_ith(0));
host_radix_blocks_rotate_right<Torus>(streams.get_ith(0), mem_ptr->d3,
mem_ptr->tmp_gpu_0, 1,
mem_ptr->tmp_gpu_0->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), mem_ptr->d3, 0, 1);
host_sub_and_propagate_single_carry(streams.get_ith(0), mem_ptr->d3,
mem_ptr->tmp_gpu_0, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, &bsks[0],
&ksks[0], outputFlag::FLAG_NONE, 0);

// +-----------------+-----------------+-----------------+-----------------+
// | GPU[0] | GPU[1] | GPU[2] | GPU[3] |
// +-----------------+-----------------+-----------------+-----------------+
// | d3 | d2 | d1 | - |
// | low3 | low2 | low1 | - |
// | rem3 | rem2 | rem1 | rem0 |
// | sub_result_1 | sub_result_2 | sub_result_3 | - |
// | s_1_overflowed | s_2_overflowed | s_3_overflowed | - |
// | cmp_1 | cmp_2 | cmp_3 | - |
// | r3 | r2 | r1 | - |
// | o3 | o2 | o1 | - |
// | c3 = !o3 | c2 = !o2 + o3 | c1 = !o1 + o2 | c0 = o1 |
// | z_o_not_1_lut_1 | z_o_not_2_lut_1 | z_o_not_2_lut_2 | z_o_not_1_lut_2 |
// +-----------------+-----------------+-----------------+-----------------+
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {

uint32_t slice_len = num_blocks - block_index;

auto init_low_rem_f =
[&](CudaRadixCiphertextFFI *low, CudaRadixCiphertextFFI *xd,
CudaRadixCiphertextFFI *rem, CudaRadixCiphertextFFI *cur_remainder,
size_t gpu_index, bool init_low) {
rem->num_radix_blocks = slice_len;
if (init_low) {
low->num_radix_blocks = slice_len;
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(gpu_index), streams.gpu_index(gpu_index), low, 0,
slice_len, xd, 0, slice_len);
}
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(gpu_index), streams.gpu_index(gpu_index), rem, 0,
slice_len, cur_remainder, block_index, num_blocks);
};

init_low_rem_f(nullptr, nullptr, mem_ptr->rem0, remainder_gpu_3, 3, false);
init_low_rem_f(mem_ptr->low1, mem_ptr->d1, mem_ptr->rem1, remainder_gpu_2,
2, true);
init_low_rem_f(mem_ptr->low2, mem_ptr->d2, mem_ptr->rem2, remainder_gpu_1,
1, true);
init_low_rem_f(mem_ptr->low3, mem_ptr->d3, mem_ptr->rem3, remainder_gpu_0,
0, true);

auto sub_result_f = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low,
CudaRadixCiphertextFFI *rem, Torus *first_indexes,
Torus *second_indexes, Torus *scalar_indexes) {
uint32_t compute_overflow = 1;
uint32_t uses_input_borrow = 0;
sub_result->num_radix_blocks = low->num_radix_blocks;
overflow_sub_mem->update_lut_indexes(
streams.get_ith(gpu_index), first_indexes, second_indexes,
scalar_indexes, rem->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams.get_ith(gpu_index), sub_result, rem, low, sub_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem,
&bsks[gpu_index], &ksks[gpu_index], compute_overflow,
uses_input_borrow);
};

auto cmp_f = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *out_boolean_block,
CudaRadixCiphertextFFI *comparison_blocks,
CudaRadixCiphertextFFI *d,
int_comparison_buffer<Torus> *comparison_buffer) {
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
uint32_t slice_start = num_blocks - block_index;
uint32_t slice_end = d->num_radix_blocks;
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
if (d_msb->num_radix_blocks == 0) {
cuda_memset_async(
(Torus *)out_boolean_block->ptr, 0,
sizeof(Torus) * (out_boolean_block->lwe_dimension + 1),
streams.stream(gpu_index), streams.gpu_index(gpu_index));
} else {
host_compare_blocks_with_zero<Torus>(
streams.get_ith(gpu_index), comparison_blocks, d_msb,
comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
are_all_comparisons_block_true(
streams.get_ith(gpu_index), out_boolean_block, comparison_blocks,
comparison_buffer, &bsks[gpu_index], &ksks[gpu_index],
comparison_blocks->num_radix_blocks);

host_negation<Torus>(
streams.stream(gpu_index), streams.gpu_index(gpu_index),
(Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
radix_params.big_lwe_dimension, 1);

// we calculate encoding because this block works only for
// message_modulus = 4 and carry_modulus = 4.
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams.stream(gpu_index), streams.gpu_index(gpu_index),
(Torus *)out_boolean_block->ptr, (Torus *)out_boolean_block->ptr,
encoded_scalar, radix_params.big_lwe_dimension, 1);
}
delete d_msb;
};

for (uint j = 0; j < 3; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
}

size_t indexes_id = mem_ptr->rem3->num_radix_blocks - 1;
sub_result_f(streams, 0, mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
mem_ptr->overflow_sub_mem_1, mem_ptr->low3, mem_ptr->rem3,
mem_ptr->first_indexes_for_overflow_sub_gpu_0[indexes_id],
mem_ptr->second_indexes_for_overflow_sub_gpu_0[indexes_id],
mem_ptr->scalars_for_overflow_sub_gpu_0[indexes_id]);
sub_result_f(streams, 1, mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
mem_ptr->overflow_sub_mem_2, mem_ptr->low2, mem_ptr->rem2,
mem_ptr->first_indexes_for_overflow_sub_gpu_1[indexes_id],
mem_ptr->second_indexes_for_overflow_sub_gpu_1[indexes_id],
mem_ptr->scalars_for_overflow_sub_gpu_1[indexes_id]);
sub_result_f(streams, 2, mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
mem_ptr->overflow_sub_mem_3, mem_ptr->low1, mem_ptr->rem1,
mem_ptr->first_indexes_for_overflow_sub_gpu_2[indexes_id],
mem_ptr->second_indexes_for_overflow_sub_gpu_2[indexes_id],
mem_ptr->scalars_for_overflow_sub_gpu_2[indexes_id]);

cmp_f(mem_ptr->sub_streams_1, 0, mem_ptr->cmp_1,
mem_ptr->comparison_blocks_1, mem_ptr->d3,
mem_ptr->comparison_buffer_1);
cmp_f(mem_ptr->sub_streams_1, 1, mem_ptr->cmp_2,
mem_ptr->comparison_blocks_2, mem_ptr->d2,
mem_ptr->comparison_buffer_2);
cmp_f(mem_ptr->sub_streams_1, 2, mem_ptr->cmp_3,
mem_ptr->comparison_blocks_3, mem_ptr->d1,
mem_ptr->comparison_buffer_3);

for (uint j = 0; j < 3; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
mem_ptr->sub_streams_1.gpu_index(j));
}

auto r1 = mem_ptr->sub_result_3;
auto r2 = mem_ptr->sub_result_2;
auto r3 = mem_ptr->sub_result_1;
auto o1 = mem_ptr->sub_3_overflowed;
auto o2 = mem_ptr->sub_2_overflowed;
auto o3 = mem_ptr->sub_1_overflowed;

// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(0), o3, o3, mem_ptr->cmp_1,
mem_ptr->bitor_mem_1, &bsks[0], &ksks[0]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(1), o2, o2, mem_ptr->cmp_2,
mem_ptr->bitor_mem_2, &bsks[1], &ksks[1]);
// used as a bitor
host_integer_radix_bitop_kb(streams.get_ith(2), o1, o1, mem_ptr->cmp_3,
mem_ptr->bitor_mem_3, &bsks[2], &ksks[2]);

// cmp_1, cmp_2, cmp_3 are not needed anymore, we can reuse them as c3,
// c2, c1. c0 is allocated on gpu[3], we take it from mem_ptr.
auto c3 = mem_ptr->cmp_1;
auto c2 = mem_ptr->cmp_2;
auto c1 = mem_ptr->cmp_3;
auto c0 = mem_ptr->c0;

// move all `o` so that each gpu has required `o` for calculating `c`
auto o3_gpu_1 = mem_ptr->tmp_gpu_1;
auto o2_gpu_2 = mem_ptr->tmp_gpu_2;
auto o1_gpu_3 = mem_ptr->tmp_gpu_3;

o3_gpu_1->num_radix_blocks = o3->num_radix_blocks;
o2_gpu_2->num_radix_blocks = o2->num_radix_blocks;
o1_gpu_3->num_radix_blocks = o1->num_radix_blocks;

for (uint j = 0; j < 4; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
}

copy_radix_ciphertext_async<Torus>(streams.stream(1), streams.gpu_index(1),
o3_gpu_1, o3);
copy_radix_ciphertext_async<Torus>(streams.stream(2), streams.gpu_index(2),
o2_gpu_2, o2);
copy_radix_ciphertext_async<Torus>(streams.stream(3), streams.gpu_index(3),
o1_gpu_3, o1);

// c3 = !o3
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), c3, 0, 1, o3, 0, 1);
host_negation<Torus>(streams.stream(0), streams.gpu_index(0),
(Torus *)c3->ptr, (Torus *)c3->ptr,
radix_params.big_lwe_dimension, 1);
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams.stream(0), streams.gpu_index(0), (Torus *)c3->ptr,
(Torus *)c3->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);

// c2 = !o2 + o3
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(1), streams.gpu_index(1), c2, 0, 1, o2, 0, 1);
host_negation<Torus>(streams.stream(1), streams.gpu_index(1),
(Torus *)c2->ptr, (Torus *)c2->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams.stream(1), streams.gpu_index(1), (Torus *)c2->ptr,
(Torus *)c2->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams.stream(1), streams.gpu_index(1), c2, c2,
o3_gpu_1, 1, 4, 4);

// c1 = !o1 + o2
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(2), streams.gpu_index(2), c1, 0, 1, o1, 0, 1);
host_negation<Torus>(streams.stream(2), streams.gpu_index(2),
(Torus *)c1->ptr, (Torus *)c1->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams.stream(2), streams.gpu_index(2), (Torus *)c1->ptr,
(Torus *)c1->ptr, encoded_scalar, radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams.stream(2), streams.gpu_index(2), c1, c1,
o2_gpu_2, 1, 4, 4);

// c0 = o1 (direct copy)
copy_radix_ciphertext_slice_async<Torus>(streams.stream(3),
streams.gpu_index(3), mem_ptr->c0,
0, 1, o1_gpu_3, 0, 1);

auto conditional_update = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *cx,
CudaRadixCiphertextFFI *rx,
int_radix_lut<Torus> *lut, Torus factor) {
auto rx_list = to_lwe_ciphertext_list(rx);
host_cleartext_multiplication<Torus>(streams.stream(gpu_index),
streams.gpu_index(gpu_index),
(Torus *)rx->ptr, &rx_list, factor);
host_add_the_same_block_to_all_blocks<Torus>(streams.stream(gpu_index),
streams.gpu_index(gpu_index),
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams.get_ith(gpu_index), rx, rx, &bsks[gpu_index],
&ksks[gpu_index], lut, rx->num_radix_blocks);
};

for (uint j = 0; j < 4; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
mem_ptr->sub_streams_1.gpu_index(j));
}

conditional_update(streams, 0, c3, r3, mem_ptr->zero_out_if_not_1_lut_1, 2);
conditional_update(streams, 1, c2, r2, mem_ptr->zero_out_if_not_2_lut_1, 3);
conditional_update(streams, 2, c1, r1, mem_ptr->zero_out_if_not_2_lut_2, 3);
conditional_update(streams, 3, c0, mem_ptr->rem0,
mem_ptr->zero_out_if_not_1_lut_2, 2);

// calculate quotient bits GPU[2]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(2), mem_ptr->q1, c1, &bsks[2], &ksks[2],
mem_ptr->quotient_lut_1, 1);
// calculate quotient bits GPU[1]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(1), mem_ptr->q2, c2, &bsks[1], &ksks[1],
mem_ptr->quotient_lut_2, 1);
// calculate quotient bits GPU[0]
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1.get_ith(0), mem_ptr->q3, c3, &bsks[0], &ksks[0],
mem_ptr->quotient_lut_3, 1);

for (uint j = 0; j < 4; j++) {
cuda_synchronize_stream(streams.stream(j), streams.gpu_index(j));
cuda_synchronize_stream(mem_ptr->sub_streams_1.stream(j),
mem_ptr->sub_streams_1.gpu_index(j));
}

// We need to accumulate rem, r1, r2, and r3, but each buffer currently
// lives on a different GPU. To gather them on GPU[0], we’ll **reuse**
// buffers already allocated on GPU[0]. At this point, the contents of rem3,
// tmp_gpu_0, and low3 are no longer needed, so it’s safe to repurpose them.
// Aliases for the GPU[0] destinations:
auto r3_gpu_0 = r3; // reuse: destination for r3 on GPU[0]
auto r2_gpu_0 = mem_ptr->tmp_gpu_0; // reuse: destination for r2 on GPU[0]
auto r1_gpu_0 = mem_ptr->low3; // reuse: destination for r1 on GPU[0]
auto rem_gpu_0 = mem_ptr->rem3; // reuse: destination for rem on GPU[0]

r2_gpu_0->num_radix_blocks = r2->num_radix_blocks;
// r3 is already on GPU 0, so no need to copy it.

// Copy r2 from GPU[1] to GPU[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
r2_gpu_0, r2);

// Copy r1 from GPU[2] to GPU[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
r1_gpu_0, r1);

// Copy rem from GPU[3] to GPU[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
rem_gpu_0, mem_ptr->rem0);

// We do the same to accumulate quotient bits q1, q2 and q3. q3 is already
// on GPU[0]. To copy q1 and q2 we will reuse buffers allocated on GPU[0]:
// sub_1_overflowed and cmp_1.
auto q3_gpu_0 = mem_ptr->q3; // q3 is already on GPU[0]
auto q2_gpu_0 =
mem_ptr->sub_1_overflowed; // reuse: destination for q2 on GPU[0]
auto q1_gpu_0 = mem_ptr->cmp_1; // reuse: destination for q1 on GPU[0]
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
q2_gpu_0, mem_ptr->q2);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
q1_gpu_0, mem_ptr->q1);

host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
rem_gpu_0, r3_gpu_0, rem_gpu_0->num_radix_blocks, 4,
4);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
rem_gpu_0, r2_gpu_0, rem_gpu_0->num_radix_blocks, 4,
4);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), rem_gpu_0,
rem_gpu_0, r1_gpu_0, rem_gpu_0->num_radix_blocks, 4,
4);

host_addition<Torus>(streams.stream(0), streams.gpu_index(0), q3_gpu_0,
q3_gpu_0, q2_gpu_0, 1, 4, 4);
host_addition<Torus>(streams.stream(0), streams.gpu_index(0), q3_gpu_0,
q3_gpu_0, q1_gpu_0, 1, 4, 4);

streams.synchronize();

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, rem_gpu_0, rem_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_1, rem_gpu_0->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, q3_gpu_0, q3_gpu_0, bsks, ksks,
mem_ptr->message_extract_lut_2, 1);
streams.synchronize();
mem_ptr->sub_streams_1.synchronize();

copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), remainder_gpu_0, block_index,
remainder_gpu_0->num_radix_blocks, rem_gpu_0, 0,
rem_gpu_0->num_radix_blocks);
insert_block_in_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), q3_gpu_0, quotient, 0);

// Copy remainder_gpu_0 to all other GPUs
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_gpu_1, remainder_gpu_0);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_gpu_2, remainder_gpu_0);
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_gpu_3, remainder_gpu_0);

// non boolean blocks
for (int block_id = 0; block_id < slice_len; block_id++) {
mem_ptr->sub_result_1->degrees[block_id] =
radix_params.message_modulus - 1;
mem_ptr->rem0->degrees[block_id] = radix_params.message_modulus - 1;
}

// boolean blocks
mem_ptr->cmp_3->degrees[0] = 0;
mem_ptr->cmp_2->degrees[0] = 0;
mem_ptr->cmp_1->degrees[0] = 0;
mem_ptr->cmp_3->noise_levels[0] = 0;

streams.synchronize();
}
}

template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
uint32_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {

if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
@@ -48,6 +490,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
remainder->lwe_dimension != divisor->lwe_dimension ||
remainder->lwe_dimension != quotient->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")

if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
mem_ptr->div_rem_2_2_mem);
return;
}
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;

@@ -146,7 +596,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
interesting_divisor->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, &last_interesting_divisor_block,
&last_interesting_divisor_block, bsks, ksks, ms_noise_reduction_key,
&last_interesting_divisor_block, bsks, ksks,
mem_ptr->masking_luts_1[shifted_mask], 1);
}; // trim_last_interesting_divisor_bits

@@ -173,7 +623,7 @@ __host__ void host_unsigned_integer_div_rem_kb(

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, divisor_ms_blocks, divisor_ms_blocks, bsks, ksks,
ms_noise_reduction_key, mem_ptr->masking_luts_2[shifted_mask], 1);
mem_ptr->masking_luts_2[shifted_mask], 1);
}; // trim_first_divisor_ms_bits

// This does
@@ -195,7 +645,7 @@ __host__ void host_unsigned_integer_div_rem_kb(

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, interesting_remainder1, 1, mem_ptr->shift_mem_1, bsks, ksks,
ms_noise_reduction_key, interesting_remainder1->num_radix_blocks);
interesting_remainder1->num_radix_blocks);

reset_radix_ciphertext_blocks(mem_ptr->tmp_radix,
interesting_remainder1->num_radix_blocks);
@@ -224,7 +674,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto left_shift_interesting_remainder2 = [&](CudaStreams streams) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, interesting_remainder2, 1, mem_ptr->shift_mem_2, bsks, ksks,
ms_noise_reduction_key, interesting_remainder2->num_radix_blocks);
interesting_remainder2->num_radix_blocks);
}; // left_shift_interesting_remainder2

streams.synchronize();
@@ -297,8 +747,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
streams, new_remainder, merged_interesting_remainder,
interesting_divisor, subtraction_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, mem_ptr->overflow_sub_mem,
bsks, ksks, ms_noise_reduction_key, compute_borrow,
uses_input_borrow);
bsks, ksks, compute_borrow, uses_input_borrow);
};

// fills:
@@ -316,13 +765,12 @@ __host__ void host_unsigned_integer_div_rem_kb(
// So we can skip some stuff
host_compare_blocks_with_zero<Torus>(
streams, mem_ptr->tmp_1, trivial_blocks, mem_ptr->comparison_buffer,
bsks, ksks, ms_noise_reduction_key,
trivial_blocks->num_radix_blocks,
bsks, ksks, trivial_blocks->num_radix_blocks,
mem_ptr->comparison_buffer->eq_buffer->is_non_zero_lut);

is_at_least_one_comparisons_block_true<Torus>(
streams, at_least_one_upper_block_is_non_zero, mem_ptr->tmp_1,
mem_ptr->comparison_buffer, bsks, ksks, ms_noise_reduction_key,
mem_ptr->comparison_buffer, bsks, ksks,
mem_ptr->tmp_1->num_radix_blocks);
}
};
@@ -335,7 +783,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1,
mem_ptr->message_extract_lut_1,
cleaned_merged_interesting_remainder->num_radix_blocks);
};

@@ -373,8 +821,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, cleaned_merged_interesting_remainder,
cleaned_merged_interesting_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
ksks, mem_ptr->zero_out_if_overflow_did_not_happen[factor_lut_id],
cleaned_merged_interesting_remainder->num_radix_blocks, factor);
};

@@ -382,8 +829,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
[&](CudaStreams streams) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, new_remainder, new_remainder, overflow_sum_radix, bsks,
ksks, ms_noise_reduction_key,
mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
ksks, mem_ptr->zero_out_if_overflow_happened[factor_lut_id],
new_remainder->num_radix_blocks, factor);
};

@@ -392,7 +838,6 @@ __host__ void host_unsigned_integer_div_rem_kb(
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, mem_ptr->did_not_overflow, subtraction_overflowed,
at_least_one_upper_block_is_non_zero, bsks, ksks,
ms_noise_reduction_key,
mem_ptr->merge_overflow_flags_luts[pos_in_block], 1,
mem_ptr->merge_overflow_flags_luts[pos_in_block]
->params.message_modulus);
@@ -451,10 +896,10 @@ __host__ void host_unsigned_integer_div_rem_kb(

integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, remainder, remainder, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_1, num_blocks);
mem_ptr->message_extract_lut_1, num_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, quotient, quotient, bsks, ksks,
ms_noise_reduction_key, mem_ptr->message_extract_lut_2, num_blocks);
mem_ptr->message_extract_lut_2, num_blocks);

mem_ptr->sub_streams_1.synchronize();
mem_ptr->sub_streams_2.synchronize();
@@ -465,9 +910,7 @@ __host__ void host_integer_div_rem_kb(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_div_rem_memory<uint64_t> *int_mem_ptr) {
uint32_t *const *ksks, int_div_rem_memory<uint64_t> *int_mem_ptr) {
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
remainder->num_radix_blocks != quotient->num_radix_blocks)
@@ -492,19 +935,16 @@ __host__ void host_integer_div_rem_kb(
streams.synchronize();

host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_1, positive_numerator,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_1, true);
bsks, ksks, int_mem_ptr->abs_mem_1, true);
host_integer_abs_kb<Torus>(int_mem_ptr->sub_streams_2, positive_divisor,
bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->abs_mem_2, true);
bsks, ksks, int_mem_ptr->abs_mem_2, true);

int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();

host_unsigned_integer_div_rem_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, ms_noise_reduction_key,
int_mem_ptr->unsigned_mem);
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);

CudaRadixCiphertextFFI numerator_sign;
as_radix_ciphertext_slice<Torus>(&numerator_sign, numerator, num_blocks - 1,
@@ -514,7 +954,7 @@ __host__ void host_integer_div_rem_kb(
num_blocks);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks, ms_noise_reduction_key,
&numerator_sign, &divisor_sign, bsks, ksks,
int_mem_ptr->compare_signed_bits_lut, 1,
int_mem_ptr->compare_signed_bits_lut->params.message_modulus);

@@ -527,37 +967,36 @@ __host__ void host_integer_div_rem_kb(

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1,
int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks,
ksks, requested_flag, uses_carry);

host_integer_radix_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);

host_propagate_single_carry<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, ms_noise_reduction_key,
requested_flag, uses_carry);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_2,
int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks,
ksks, requested_flag, uses_carry);

host_integer_radix_cmux_kb<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks,
ksks, ms_noise_reduction_key);
host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different, int_mem_ptr->negated_quotient,
quotient, int_mem_ptr->cmux_quotient_mem, bsks, ksks);

host_integer_radix_cmux_kb<Torus>(
int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks, ms_noise_reduction_key);
int_mem_ptr->cmux_remainder_mem, bsks, ksks);

int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem_kb<Torus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
ms_noise_reduction_key, int_mem_ptr->unsigned_mem);
host_unsigned_integer_div_rem_kb<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
}
}


@@ -29,13 +29,12 @@ uint64_t scratch_integer_count_of_consecutive_bits_kb_64(
void cuda_integer_count_of_consecutive_bits_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
void *const *ksks) {

host_integer_count_of_consecutive_bits<uint64_t>(
host_integer_count_of_consecutive_bits<uint64_t, uint32_t>(
CudaStreams(streams), output_ct, input_ct,
(int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
(uint32_t **)ksks);
}

void cleanup_cuda_integer_count_of_consecutive_bits_kb_64(
@@ -81,13 +80,12 @@ void cuda_integer_ilog2_kb_64(
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key) {
void *const *bsks, void *const *ksks) {

host_integer_ilog2<uint64_t>(
host_integer_ilog2<uint64_t, uint32_t>(
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key);
(uint32_t **)ksks);
}

void cleanup_cuda_integer_ilog2_kb_64(CudaStreamsFFI streams,

@@ -5,18 +5,16 @@
#include "integer/integer_utilities.h"
#include "multiplication.cuh"

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_prepare_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *bsks, KSTorus *const *ksks) {

auto tmp = mem_ptr->tmp_ct;

host_apply_univariate_lut_kb<Torus>(streams, tmp, ciphertext,
mem_ptr->univ_lut_mem, ksks,
ms_noise_reduction_key, bsks);
mem_ptr->univ_lut_mem, ksks, bsks);

if (mem_ptr->direction == Leading) {
host_radix_blocks_reverse_inplace<Torus>(streams, tmp);
@@ -24,7 +22,7 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(

host_compute_prefix_sum_hillis_steele<uint64_t>(
streams, ciphertext, tmp, mem_ptr->biv_lut_mem, bsks, ksks,
ms_noise_reduction_key, ciphertext->num_radix_blocks);
ciphertext->num_radix_blocks);
}

template <typename Torus>
@@ -43,13 +41,12 @@ __host__ uint64_t scratch_integer_count_of_consecutive_bits(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
KSTorus *const *ksks) {

auto params = mem_ptr->params;
auto ct_prepared = mem_ptr->ct_prepared;
@@ -60,9 +57,8 @@ __host__ void host_integer_count_of_consecutive_bits(

// Prepare count of consecutive bits
//
host_integer_prepare_count_of_consecutive_bits(streams, ct_prepared,
mem_ptr->prepare_mem, bsks,
ksks, ms_noise_reduction_key);
host_integer_prepare_count_of_consecutive_bits(
streams, ct_prepared, mem_ptr->prepare_mem, bsks, ksks);

// Perform addition and propagation of prepared cts
//
@@ -76,12 +72,11 @@ __host__ void host_integer_count_of_consecutive_bits(
}

host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
streams, output_ct, cts, bsks, ksks, ms_noise_reduction_key,
mem_ptr->sum_mem, counter_num_blocks, ct_prepared->num_radix_blocks);
streams, output_ct, cts, bsks, ksks, mem_ptr->sum_mem, counter_num_blocks,
ct_prepared->num_radix_blocks);

host_propagate_single_carry<Torus>(streams, output_ct, nullptr, nullptr,
mem_ptr->propagate_mem, bsks, ksks,
ms_noise_reduction_key, 0, 0);
mem_ptr->propagate_mem, bsks, ksks, 0, 0);
}

template <typename Torus>
@@ -102,15 +97,15 @@ __host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_ilog2(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_2,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
|
||||
int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void
|
||||
host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
|
||||
CudaRadixCiphertextFFI const *input_ct,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_2,
|
||||
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
|
||||
int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
KSTorus *const *ksks) {
|
||||
|
||||
// Prepare the input ciphertext by computing the number of consecutive
|
||||
// leading zeros for each of its blocks.
|
||||
@@ -118,8 +113,7 @@ __host__ void host_integer_ilog2(
|
||||
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
|
||||
mem_ptr->ct_in_buffer, input_ct);
|
||||
host_integer_prepare_count_of_consecutive_bits<Torus>(
|
||||
streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks,
|
||||
ms_noise_reduction_key);
|
||||
streams, mem_ptr->ct_in_buffer, mem_ptr->prepare_mem, bsks, ksks);
|
||||
|
||||
// Build the input for the sum by taking each block's leading zero count
|
||||
// and placing it into a separate, zero-padded ct slot.
|
||||
@@ -148,17 +142,17 @@ __host__ void host_integer_ilog2(
|
||||
//
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
streams, mem_ptr->sum_output_not_propagated, mem_ptr->sum_input_cts, bsks,
|
||||
ksks, ms_noise_reduction_key, mem_ptr->sum_mem,
|
||||
mem_ptr->counter_num_blocks, mem_ptr->input_num_blocks + 1);
|
||||
ksks, mem_ptr->sum_mem, mem_ptr->counter_num_blocks,
|
||||
mem_ptr->input_num_blocks + 1);
|
||||
|
||||
// Apply luts to the partial sum.
|
||||
//
|
||||
host_apply_univariate_lut_kb<Torus>(
|
||||
streams, mem_ptr->message_blocks_not, mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_message_not, ksks, ms_noise_reduction_key, bsks);
|
||||
host_apply_univariate_lut_kb<Torus>(
|
||||
streams, mem_ptr->carry_blocks_not, mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_carry_not, ksks, ms_noise_reduction_key, bsks);
|
||||
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->message_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_message_not, ksks, bsks);
|
||||
host_apply_univariate_lut_kb<Torus>(streams, mem_ptr->carry_blocks_not,
|
||||
mem_ptr->sum_output_not_propagated,
|
||||
mem_ptr->lut_carry_not, ksks, bsks);
|
||||
|
||||
// Left-shift the bitwise-negated carry blocks by one position.
|
||||
//
|
||||
@@ -196,12 +190,12 @@ __host__ void host_integer_ilog2(
|
||||
trivial_ct_2, 0, mem_ptr->counter_num_blocks);
|
||||
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->sum_mem, mem_ptr->counter_num_blocks, 3);
|
||||
streams, output_ct, mem_ptr->sum_input_cts, bsks, ksks, mem_ptr->sum_mem,
|
||||
mem_ptr->counter_num_blocks, 3);
|
||||
|
||||
host_full_propagate_inplace<Torus>(
|
||||
streams, output_ct, mem_ptr->final_propagate_mem, ksks,
|
||||
ms_noise_reduction_key, bsks, mem_ptr->counter_num_blocks);
|
||||
host_full_propagate_inplace<Torus>(streams, output_ct,
|
||||
mem_ptr->final_propagate_mem, ksks, bsks,
|
||||
mem_ptr->counter_num_blocks);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -2,18 +2,17 @@
#include "integer/negation.cuh"
#include <linear_algebra.h>

void cuda_full_propagation_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_blocks) {
void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *input_blocks,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {

int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;

host_full_propagate_inplace<uint64_t>(
CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_blocks);
host_full_propagate_inplace<uint64_t>(CudaStreams(streams), input_blocks,
buffer, (uint32_t **)(ksks), bsks,
num_blocks);
}

uint64_t scratch_cuda_full_propagation_64(
@@ -51,8 +50,8 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -60,7 +59,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(

return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
num_blocks, params, requested_flag, allocate_gpu_memory);
}

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -69,8 +68,8 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -78,7 +77,7 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(

return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
num_blocks, params, requested_flag, allocate_gpu_memory);
}

uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
@@ -103,27 +102,24 @@ void cuda_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out, const CudaRadixCiphertextFFI *carry_in,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {

host_propagate_single_carry<uint64_t>(
CudaStreams(streams), lwe_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks),
requested_flag, uses_carry);
}

void cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {

host_add_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks),
requested_flag, uses_carry);
}

void cuda_integer_overflowing_sub_kb_64_inplace(
@@ -131,15 +127,13 @@ void cuda_integer_overflowing_sub_kb_64_inplace(
const CudaRadixCiphertextFFI *rhs_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow, int8_t *mem_ptr,
void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t compute_overflow, uint32_t uses_input_borrow) {
void *const *bsks, void *const *ksks, uint32_t compute_overflow,
uint32_t uses_input_borrow) {
PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, compute_overflow,
uses_input_borrow);
(uint32_t **)ksks, compute_overflow, uses_input_borrow);
POP_RANGE()
}

@@ -218,14 +212,11 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb_64(
void cuda_apply_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
void *const *ksks, void *const *bsks) {

host_apply_univariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks);
(int_radix_lut<uint64_t> *)mem_ptr, (uint32_t **)(ksks), bsks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
@@ -241,14 +232,13 @@ void cleanup_cuda_apply_univariate_lut_kb_64(CudaStreamsFFI streams,
void cuda_apply_many_univariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {

host_apply_many_univariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
ms_noise_reduction_key, bsks, num_many_lut, lut_stride);
(int_radix_lut<uint64_t> *)mem_ptr, (uint32_t **)(ksks), bsks,
num_many_lut, lut_stride);
}

uint64_t scratch_cuda_apply_bivariate_lut_kb_64(
@@ -275,15 +265,13 @@ void cuda_apply_bivariate_lut_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe_1,
CudaRadixCiphertextFFI const *input_radix_lwe_2, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {

host_apply_bivariate_lut_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe_1,
input_radix_lwe_2, (int_radix_lut<uint64_t> *)mem_ptr,
(uint64_t **)(ksks), ms_noise_reduction_key, bsks, num_radix_blocks,
shift);
(uint32_t **)(ksks), bsks, num_radix_blocks, shift);
}

void cleanup_cuda_apply_bivariate_lut_kb_64(CudaStreamsFFI streams,
@@ -320,14 +308,12 @@ uint64_t scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void cuda_integer_compute_prefix_sum_hillis_steele_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI *generates_or_propagates, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks, uint32_t num_radix_blocks) {
void *const *ksks, void *const *bsks, uint32_t num_radix_blocks) {

host_compute_prefix_sum_hillis_steele<uint64_t>(
CudaStreams(streams), output_radix_lwe, generates_or_propagates,
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, num_radix_blocks);
(int_radix_lut<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks),
num_radix_blocks);
}

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
@@ -399,15 +385,12 @@ uint64_t scratch_cuda_apply_noise_squashing_kb(
void cuda_apply_noise_squashing_kb(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
void *const *ksks, void *const *bsks) {

PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing_kb<uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key);
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint32_t **)ksks);
POP_RANGE()
}

@@ -242,8 +242,8 @@ __host__ void host_radix_cumulative_sum_in_groups(cudaStream_t stream,
auto lwe_size = dest->lwe_dimension + 1;
cuda_set_device(gpu_index);
// Each CUDA block is responsible for a single group
int num_blocks = (num_radix_blocks + group_size - 1) / group_size,
num_threads = 512;
int num_blocks = CEIL_DIV(num_radix_blocks, group_size);
int num_threads = 512;
device_radix_cumulative_sum_in_groups<Torus>
<<<num_blocks, num_threads, 0, stream>>>(
(Torus *)dest->ptr, (Torus *)src->ptr, num_radix_blocks, lwe_size,
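The hunk above only re-spells the grid-size computation: the hand-written rounding-up division is replaced by the backend's CEIL_DIV helper, with no change in the launched block count. A small sketch of the equivalence (CEIL_DIV is defined locally here for illustration; the real macro lives in the backend headers):

#include <cassert>

// Local stand-in for the backend macro, assumed to be the usual
// rounding-up integer division.
#define CEIL_DIV(m, n) (((m) + (n) - 1) / (n))

int main() {
  int num_radix_blocks = 10;
  int group_size = 4;
  // Old spelling: explicit round-up division, one CUDA block per group.
  int old_blocks = (num_radix_blocks + group_size - 1) / group_size;
  // New spelling: same value through the macro.
  int new_blocks = CEIL_DIV(num_radix_blocks, group_size);
  assert(old_blocks == new_blocks && new_blocks == 3);
  return 0;
}
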
@@ -503,13 +503,11 @@ __host__ void host_pack_bivariate_blocks_with_single_block(
|
||||
/// num_radix_blocks corresponds to the number of blocks on which to apply the
|
||||
/// LUT In scalar bitops we use a number of blocks that may be lower or equal to
|
||||
/// the input and output numbers of blocks
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
|
||||
KSTorus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks) {
|
||||
PUSH_RANGE("apply lut")
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
@@ -547,28 +545,23 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(), lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], (Torus *)lwe_array_in->ptr,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
|
||||
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)lwe_array_out->ptr,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_event_record(lut->event_scatter_in, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
|
||||
active_streams);
|
||||
|
||||
/// With multiple GPUs we push to the vectors on each GPU then when we
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
@@ -590,10 +583,9 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
PUSH_RANGE("gather")
|
||||
@@ -602,16 +594,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
// other gpus record their events
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
// GPU 0 waits for all
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
}
|
||||
for (uint i = 0; i < num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i];
|
||||
@@ -623,13 +607,12 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
KSTorus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
PUSH_RANGE("apply many lut")
|
||||
// apply_lookup_table
|
||||
auto params = lut->params;
|
||||
@@ -664,28 +647,24 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(), lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], (Torus *)lwe_array_in->ptr,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
|
||||
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)lwe_array_out->ptr,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
streams.get_ith(0), (Torus *)lwe_array_out->ptr, lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_event_record(lut->event_scatter_in, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
|
||||
active_streams);
|
||||
|
||||
/// With multiple GPUs we push to the vectors on each GPU then when we
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
PUSH_RANGE("scatter")
|
||||
@@ -706,10 +685,9 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
PUSH_RANGE("gather")
|
||||
@@ -719,16 +697,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
num_radix_blocks, big_lwe_dimension + 1, num_many_lut);
|
||||
POP_RANGE()
|
||||
|
||||
// other gpus record their events
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
// GPU 0 waits for all
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
}
|
||||
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i % lut->num_blocks];
|
||||
@@ -740,14 +710,13 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_1,
|
||||
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_radix_lut<Torus> *lut, uint32_t num_radix_blocks, uint32_t shift) {
|
||||
KSTorus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
PUSH_RANGE("apply bivar lut")
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
|
||||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
||||
@@ -796,27 +765,23 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(), lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], (Torus *)lwe_array_pbs_in->ptr,
|
||||
lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
ks_base_log, ks_level, num_radix_blocks);
|
||||
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
|
||||
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, ks_base_log, ks_level,
|
||||
num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)(lwe_array_out->ptr),
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->buffer, glwe_dimension,
|
||||
streams.get_ith(0), (Torus *)(lwe_array_out->ptr), lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
} else {
|
||||
cuda_event_record(lut->event_scatter_in, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
|
||||
active_streams);
|
||||
|
||||
PUSH_RANGE("scatter")
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, (Torus *)lwe_array_pbs_in->ptr,
|
||||
@@ -835,10 +800,9 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
|
||||
lut_stride);
|
||||
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
PUSH_RANGE("gather")
|
||||
@@ -847,16 +811,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
// other gpus record their events
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
// GPU 0 waits for all
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
}
|
||||
for (uint i = 0; i < num_radix_blocks; i++) {
|
||||
auto degrees_index = lut->h_lut_indexes[i];
|
||||
@@ -1313,13 +1269,11 @@ void generate_many_lut_device_accumulator(
|
||||
// block states: contains the propagation states for the different blocks
|
||||
// depending on the group it belongs to and the internal position within the
|
||||
// block.
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_compute_shifted_blocks_and_states(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
KSTorus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
|
||||
@@ -1328,7 +1282,7 @@ void host_compute_shifted_blocks_and_states(
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, shifted_blocks_and_states, lwe_array, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);
|
||||
luts_array_first_step, num_many_lut, lut_stride);
|
||||
|
||||
auto shifted_blocks = mem->shifted_blocks;
|
||||
auto block_states = mem->block_states;
|
||||
@@ -1342,14 +1296,12 @@ void host_compute_shifted_blocks_and_states(
|
||||
2 * num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_resolve_group_carries_sequentially(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *resolved_carries,
|
||||
CudaRadixCiphertextFFI *grouping_pgns, int_radix_params params,
|
||||
int_seq_group_prop_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_groups) {
|
||||
KSTorus *const *ksks, uint32_t num_groups) {
|
||||
|
||||
auto group_resolved_carries = mem->group_resolved_carries;
|
||||
if (num_groups > 1) {
|
||||
@@ -1398,8 +1350,8 @@ void host_resolve_group_carries_sequentially(
|
||||
blocks_to_solve + 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, &shifted_group_resolved_carries,
|
||||
&shifted_group_resolved_carries, bsks, ksks, ms_noise_reduction_key,
|
||||
luts_sequential, blocks_to_solve);
|
||||
&shifted_group_resolved_carries, bsks, ksks, luts_sequential,
|
||||
blocks_to_solve);
|
||||
|
||||
// Copy the result to the resolved carries array
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -1412,13 +1364,11 @@ void host_resolve_group_carries_sequentially(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_compute_prefix_sum_hillis_steele(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *step_output,
|
||||
CudaRadixCiphertextFFI *generates_or_propagates, int_radix_lut<Torus> *luts,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
void *const *bsks, KSTorus *const *ksks, uint32_t num_radix_blocks) {
|
||||
|
||||
if (step_output->lwe_dimension != generates_or_propagates->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -1440,9 +1390,8 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
int cur_total_blocks = num_radix_blocks - space;
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, luts, cur_total_blocks,
|
||||
luts->params.message_modulus);
|
||||
streams, &cur_blocks, &cur_blocks, prev_blocks, bsks, ksks, luts,
|
||||
cur_total_blocks, luts->params.message_modulus);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), generates_or_propagates, space,
|
||||
@@ -1458,13 +1407,12 @@ void host_compute_prefix_sum_hillis_steele(
|
||||
// - calculates the propagation state of each group
|
||||
// - resolves the carries between groups, either sequentially or with hillis
|
||||
// steele
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_compute_propagation_simulators_and_group_carries(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *block_states,
|
||||
int_radix_params params, int_prop_simu_group_carries_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_groups) {
|
||||
void *const *bsks, KSTorus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_groups) {
|
||||
|
||||
if (num_radix_blocks > block_states->num_radix_blocks)
|
||||
PANIC("Cuda error: input does not have enough radix blocks")
|
||||
@@ -1481,7 +1429,7 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
auto luts_array_second_step = mem->luts_array_second_step;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, propagation_cum_sums, propagation_cum_sums, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_array_second_step, num_radix_blocks);
|
||||
luts_array_second_step, num_radix_blocks);
|
||||
|
||||
host_integer_radix_scalar_addition_inplace<Torus>(
|
||||
streams, propagation_cum_sums, mem->scalar_array_cum_sum,
|
||||
@@ -1500,10 +1448,9 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
auto resolved_carries = mem->resolved_carries;
|
||||
if (mem->use_sequential_algorithm_to_resolve_group_carries) {
|
||||
// Resolve group carries sequentially
|
||||
host_resolve_group_carries_sequentially(streams, resolved_carries,
|
||||
grouping_pgns, params,
|
||||
mem->seq_group_prop_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, num_groups);
|
||||
host_resolve_group_carries_sequentially(
|
||||
streams, resolved_carries, grouping_pgns, params,
|
||||
mem->seq_group_prop_mem, bsks, ksks, num_groups);
|
||||
} else {
|
||||
// Resolve group carries with hillis steele
|
||||
auto luts_carry_propagation_sum = mem->hs_group_prop_mem->lut_hillis_steele;
|
||||
@@ -1512,8 +1459,7 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
resolved_carries, 1, num_groups);
|
||||
host_compute_prefix_sum_hillis_steele<Torus>(
|
||||
streams, &shifted_resolved_carries, grouping_pgns,
|
||||
luts_carry_propagation_sum, bsks, ksks, ms_noise_reduction_key,
|
||||
num_groups - 1);
|
||||
luts_carry_propagation_sum, bsks, ksks, num_groups - 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1523,13 +1469,11 @@ void host_compute_propagation_simulators_and_group_carries(
|
||||
// block states: contains the propagation states for the different blocks
|
||||
// depending on the group it belongs to and the internal position within the
|
||||
// block.
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_compute_shifted_blocks_and_borrow_states(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
KSTorus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
|
||||
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
|
||||
@@ -1537,7 +1481,7 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, shifted_blocks_and_borrow_states, lwe_array, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_array_first_step, num_many_lut, lut_stride);
|
||||
luts_array_first_step, num_many_lut, lut_stride);
|
||||
|
||||
auto shifted_blocks = mem->shifted_blocks;
|
||||
auto borrow_states = mem->borrow_states;
|
||||
@@ -1558,17 +1502,14 @@ void host_compute_shifted_blocks_and_borrow_states(
|
||||
* * (lwe_dimension + 1) * sizeof(Torus) big_lwe_vector: output of pbs should
|
||||
* have size = 2 * (glwe_dimension * polynomial_size + 1) * sizeof(Torus)
|
||||
*/
|
||||
template <typename Torus>
|
||||
void host_full_propagate_inplace(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *input_blocks,
|
||||
int_fullprop_buffer<Torus> *mem_ptr, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_blocks) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_full_propagate_inplace(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *input_blocks,
|
||||
int_fullprop_buffer<Torus> *mem_ptr,
|
||||
KSTorus *const *ksks, void *const *bsks,
|
||||
uint32_t num_blocks) {
|
||||
auto params = mem_ptr->lut->params;
|
||||
|
||||
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
|
||||
int small_lwe_size = (params.small_lwe_dimension + 1);
|
||||
|
||||
// In the case of extracting a single LWE this parameters are dummy
|
||||
uint32_t num_many_lut = 1;
|
||||
uint32_t lut_stride = 0;
|
||||
@@ -1578,8 +1519,7 @@ void host_full_propagate_inplace(
|
||||
|
||||
/// Since the keyswitch is done on one input only, use only 1 GPU
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(),
|
||||
(Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
|
||||
streams.get_ith(0), (Torus *)(mem_ptr->tmp_small_lwe_vector->ptr),
|
||||
mem_ptr->lut->lwe_trivial_indexes, (Torus *)cur_input_block.ptr,
|
||||
mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension,
|
||||
params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1);
|
||||
@@ -1589,12 +1529,12 @@ void host_full_propagate_inplace(
|
||||
1, 2, mem_ptr->tmp_small_lwe_vector, 0, 1);
|
||||
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
|
||||
streams.get_ith(0), (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
|
||||
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
|
||||
mem_ptr->lut->lut_indexes_vec,
|
||||
(Torus *)mem_ptr->tmp_small_lwe_vector->ptr,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, ms_noise_reduction_key,
|
||||
mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
|
||||
params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_base_log, params.pbs_level,
|
||||
params.grouping_factor, 2, params.pbs_type, num_many_lut, lut_stride);
|
||||
|
||||
@@ -1724,14 +1664,13 @@ __host__ void scalar_pack_blocks(cudaStream_t stream, uint32_t gpu_index,
|
||||
* Thus, lwe_array_out must be allocated with num_radix_blocks * bits_per_block
|
||||
* * (lwe_dimension+1) * sizeeof(Torus) bytes
|
||||
*/
|
||||
template <typename Torus>
|
||||
__host__ void extract_n_bits(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t effective_num_radix_blocks, uint32_t num_radix_blocks,
|
||||
int_bit_extract_luts_buffer<Torus> *bit_extract) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void
|
||||
extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
|
||||
KSTorus *const *ksks, uint32_t effective_num_radix_blocks,
|
||||
uint32_t num_radix_blocks,
|
||||
int_bit_extract_luts_buffer<Torus> *bit_extract) {
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lwe_array_out, 0,
|
||||
@@ -1745,19 +1684,17 @@ __host__ void extract_n_bits(
|
||||
}
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_out, bsks, ksks, ms_noise_reduction_key,
|
||||
bit_extract->lut, effective_num_radix_blocks);
|
||||
streams, lwe_array_out, lwe_array_out, bsks, ksks, bit_extract->lut,
|
||||
effective_num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void reduce_signs(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
CudaRadixCiphertextFFI *signs_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_sign_blocks) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void
|
||||
reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
|
||||
CudaRadixCiphertextFFI *signs_array_in,
|
||||
int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
KSTorus *const *ksks, uint32_t num_sign_blocks) {
|
||||
|
||||
if (signs_array_out->lwe_dimension != signs_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -1803,8 +1740,7 @@ __host__ void reduce_signs(
|
||||
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
|
||||
signs_a, num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, signs_a, signs_b, bsks, ksks, ms_noise_reduction_key, lut,
|
||||
num_sign_blocks / 2);
|
||||
streams, signs_a, signs_b, bsks, ksks, lut, num_sign_blocks / 2);
|
||||
|
||||
if (num_sign_blocks % 2 == 1)
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -1834,8 +1770,7 @@ __host__ void reduce_signs(
|
||||
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
|
||||
signs_a, num_sign_blocks, message_modulus);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, signs_array_out, signs_b, bsks, ksks, ms_noise_reduction_key,
|
||||
lut, 1);
|
||||
streams, signs_array_out, signs_b, bsks, ksks, lut, 1);
|
||||
|
||||
} else {
|
||||
|
||||
@@ -1853,8 +1788,7 @@ __host__ void reduce_signs(
|
||||
lut->broadcast_lut(lut->active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, signs_array_out, signs_a, bsks, ksks, ms_noise_reduction_key,
|
||||
lut, 1);
|
||||
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1880,17 +1814,16 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_univariate_lut_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_apply_univariate_lut_kb(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in,
|
||||
int_radix_lut<Torus> *mem, KSTorus *const *ksks,
|
||||
void *const *bsks) {
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
|
||||
mem, radix_lwe_out->num_radix_blocks);
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
|
||||
radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1916,17 +1849,16 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_apply_many_univariate_lut_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
KSTorus *const *ksks, void *const *bsks, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
integer_radix_apply_many_univariate_lookup_table_kb<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, ms_noise_reduction_key,
|
||||
mem, num_many_lut, lut_stride);
|
||||
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem, num_many_lut,
|
||||
lut_stride);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -1951,43 +1883,43 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_apply_bivariate_lut_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2, int_radix_lut<Torus> *mem,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *const *bsks, uint32_t num_radix_blocks, uint32_t shift) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_apply_bivariate_lut_kb(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_1,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_in_2,
|
||||
int_radix_lut<Torus> *mem, KSTorus *const *ksks,
|
||||
void *const *bsks, uint32_t num_radix_blocks,
|
||||
uint32_t shift) {
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks,
|
||||
ms_noise_reduction_key, mem, num_radix_blocks, shift);
|
||||
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
|
||||
num_radix_blocks, shift);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
|
||||
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
|
||||
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
|
||||
uint32_t uses_carry, bool allocate_gpu_memory) {
|
||||
bool allocate_gpu_memory) {
|
||||
PUSH_RANGE("scratch add & propagate sc")
|
||||
uint64_t size_tracker = 0;
|
||||
*mem_ptr = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
|
||||
requested_flag, uses_carry,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
requested_flag, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
POP_RANGE()
|
||||
return size_tracker;
|
||||
}
|
||||
// This function perform the three steps of Thomas' new carry propagation
|
||||
// includes the logic to extract overflow when requested
|
||||
template <typename Torus>
|
||||
void host_propagate_single_carry(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_propagate_single_carry(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries,
|
||||
int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, KSTorus *const *ksks,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
PUSH_RANGE("propagate sc")
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
@@ -2010,8 +1942,8 @@ void host_propagate_single_carry(
|
||||
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, lut_stride, num_many_lut);
|
||||
streams, lwe_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
|
||||
num_many_lut);
|
||||
auto block_states = mem->shifted_blocks_state_mem->block_states;
|
||||
|
||||
if (requested_flag == outputFlag::FLAG_CARRY) {
|
||||
@@ -2022,7 +1954,7 @@ void host_propagate_single_carry(
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
|
||||
ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);
|
||||
ksks, num_radix_blocks, mem->num_groups);
|
||||
|
||||
auto group_size = mem->prop_simu_group_carries_mem->group_size;
|
||||
|
||||
@@ -2063,7 +1995,7 @@ void host_propagate_single_carry(
|
||||
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, mem->output_flag, prepared_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);
|
||||
mem->lut_message_extract, num_radix_blocks + 1);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lwe_array, 0, num_radix_blocks,
|
||||
@@ -2074,22 +2006,21 @@ void host_propagate_single_carry(
|
||||
} else {
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
|
||||
message_extract, num_radix_blocks);
|
||||
streams, lwe_array, prepared_blocks, bsks, ksks, message_extract,
|
||||
num_radix_blocks);
|
||||
}
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
// This function perform the three steps of Thomas' new carry propagation
|
||||
// includes the logic to extract overflow when requested
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_add_and_propagate_single_carry(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
|
||||
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
|
||||
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t requested_flag, uint32_t uses_carry) {
|
||||
void *const *bsks, KSTorus *const *ksks, uint32_t requested_flag,
|
||||
uint32_t uses_carry) {
|
||||
PUSH_RANGE("add & propagate sc")
|
||||
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be the same")
|
||||
@@ -2116,9 +2047,6 @@ void host_add_and_propagate_single_carry(
|
||||
|
||||
auto num_radix_blocks = lhs_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
auto polynomial_size = params.polynomial_size;
|
||||
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
|
||||
auto lut_stride = mem->lut_stride;
|
||||
auto num_many_lut = mem->num_many_lut;
|
||||
CudaRadixCiphertextFFI output_flag;
|
||||
@@ -2145,15 +2073,14 @@ void host_add_and_propagate_single_carry(
|
||||
}
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_states<Torus>(
|
||||
streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, lut_stride, num_many_lut);
|
||||
streams, lhs_array, mem->shifted_blocks_state_mem, bsks, ksks, lut_stride,
|
||||
num_many_lut);
|
||||
auto block_states = mem->shifted_blocks_state_mem->block_states;
|
||||
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
|
||||
auto lut_overflow_prep = mem->lut_overflow_flag_prep;
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, &output_flag, mem->last_lhs, mem->last_rhs, bsks, ksks,
|
||||
ms_noise_reduction_key, lut_overflow_prep, 1,
|
||||
lut_overflow_prep->params.message_modulus);
|
||||
lut_overflow_prep, 1, lut_overflow_prep->params.message_modulus);
|
||||
} else if (requested_flag == outputFlag::FLAG_CARRY) {
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), &output_flag, 0, 1,
|
||||
@@ -2163,7 +2090,7 @@ void host_add_and_propagate_single_carry(
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, block_states, params, mem->prop_simu_group_carries_mem, bsks,
|
||||
ksks, ms_noise_reduction_key, num_radix_blocks, mem->num_groups);
|
||||
ksks, num_radix_blocks, mem->num_groups);
|
||||
|
||||
auto group_size = mem->prop_simu_group_carries_mem->group_size;
|
||||
|
||||
@@ -2216,7 +2143,7 @@ void host_add_and_propagate_single_carry(
|
||||
num_radix_blocks, num_radix_blocks + 1, &output_flag, 0, 1);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, mem->output_flag, prepared_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, mem->lut_message_extract, num_radix_blocks + 1);
|
||||
mem->lut_message_extract, num_radix_blocks + 1);
|
||||
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams.stream(0), streams.gpu_index(0), lhs_array, 0, num_radix_blocks,
|
||||
@@ -2227,7 +2154,7 @@ void host_add_and_propagate_single_carry(
|
||||
mem->output_flag, num_radix_blocks, num_radix_blocks + 1);
|
||||
} else {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lhs_array, prepared_blocks, bsks, ksks, ms_noise_reduction_key,
|
||||
streams, lhs_array, prepared_blocks, bsks, ksks,
|
||||
mem->lut_message_extract, num_radix_blocks);
|
||||
}
|
||||
POP_RANGE()
|
||||
@@ -2249,15 +2176,16 @@ uint64_t scratch_cuda_integer_overflowing_sub(
|
||||
|
||||
// This function perform the three steps of Thomas' new borrow propagation
|
||||
// includes the logic to extract overflow when requested
|
||||
template <typename Torus>
|
||||
void host_single_borrow_propagate(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow,
|
||||
int_borrow_prop_memory<Torus> *mem, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_groups, uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow) {
|
||||
template <typename Torus, typename KSTorus>
|
||||
void host_single_borrow_propagate(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *lwe_array,
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow,
|
||||
int_borrow_prop_memory<Torus> *mem,
|
||||
void *const *bsks, KSTorus *const *ksks,
|
||||
uint32_t num_groups,
|
||||
uint32_t compute_overflow,
|
||||
uint32_t uses_input_borrow) {
|
||||
auto num_radix_blocks = lwe_array->num_radix_blocks;
|
||||
auto params = mem->params;
|
||||
auto glwe_dimension = params.glwe_dimension;
|
||||
@@ -2279,7 +2207,7 @@ void host_single_borrow_propagate(
|
||||
// Step 1
|
||||
host_compute_shifted_blocks_and_borrow_states<Torus>(
|
||||
streams, lwe_array, mem->shifted_blocks_borrow_state_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, lut_stride, num_many_lut);
|
||||
lut_stride, num_many_lut);
|
||||
|
||||
auto borrow_states = mem->shifted_blocks_borrow_state_mem->borrow_states;
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
@@ -2289,7 +2217,7 @@ void host_single_borrow_propagate(
|
||||
// Step 2
|
||||
host_compute_propagation_simulators_and_group_carries<Torus>(
|
||||
streams, borrow_states, params, mem->prop_simu_group_carries_mem, bsks,
|
||||
ksks, ms_noise_reduction_key, num_radix_blocks, num_groups);
|
||||
ksks, num_radix_blocks, num_groups);
|
||||
|
||||
auto shifted_blocks =
|
||||
(Torus *)mem->shifted_blocks_borrow_state_mem->shifted_blocks->ptr;
|
||||
@@ -2343,7 +2271,7 @@ void host_single_borrow_propagate(
|
||||
auto borrow_flag = mem->lut_borrow_flag;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
|
||||
ms_noise_reduction_key, borrow_flag, 1);
|
||||
borrow_flag, 1);
|
||||
}
|
||||
for (int j = 0; j < mem->active_streams.count(); j++) {
|
||||
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1.stream(j),
|
||||
@@ -2365,7 +2293,7 @@ void host_single_borrow_propagate(
|
||||
auto message_extract = mem->lut_message_extract;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, message_extract, num_radix_blocks);
|
||||
message_extract, num_radix_blocks);
|
||||
|
||||
for (int j = 0; j < mem->active_streams.count(); j++) {
|
||||
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2.stream(j),
|
||||
@@ -2380,17 +2308,15 @@ void host_single_borrow_propagate(
|
||||
/// num_radix_blocks corresponds to the number of blocks on which to apply the
|
||||
/// LUT In scalar bitops we use a number of blocks that may be lower or equal to
|
||||
/// the input and output numbers of blocks
|
||||
template <typename InputTorus>
|
||||
template <typename InputTorus, typename KSTorus>
|
||||
__host__ void integer_radix_apply_noise_squashing_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in,
|
||||
int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
|
||||
InputTorus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
KSTorus *const *ksks) {
|
||||
|
||||
PUSH_RANGE("apply noise squashing")
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
auto big_lwe_dimension = params.big_lwe_dimension;
|
||||
auto small_lwe_dimension = params.small_lwe_dimension;
|
||||
auto ks_level = params.ks_level;
|
||||
@@ -2426,11 +2352,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
|
||||
streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<InputTorus>(
|
||||
streams.subset_first_gpu(), lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
|
||||
lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
|
||||
small_lwe_dimension, ks_base_log, ks_level,
|
||||
lwe_array_out->num_radix_blocks);
|
||||
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
|
||||
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in, ksks,
|
||||
lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
|
||||
ks_level, lwe_array_out->num_radix_blocks);
|
||||
|
||||
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
|
||||
/// dimension to a big LWE dimension
|
||||
@@ -2438,13 +2363,12 @@ __host__ void integer_radix_apply_noise_squashing_kb(
|
||||
/// int_noise_squashing_lut doesn't support a different output or lut
|
||||
/// indexing than the trivial
|
||||
execute_pbs_async<uint64_t, __uint128_t>(
|
||||
streams.subset_first_gpu(), (__uint128_t *)lwe_array_out->ptr,
|
||||
streams.get_ith(0), (__uint128_t *)lwe_array_out->ptr,
|
||||
lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec,
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
|
||||
ms_noise_reduction_key, lut->pbs_buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
|
||||
0);
|
||||
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
|
||||
params.pbs_type, 0, 0);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
||||
@@ -2468,10 +2392,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
|
||||
execute_pbs_async<uint64_t, __uint128_t>(
|
||||
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
|
||||
lut->lut_vec, lwe_trivial_indexes_vec, lwe_after_ks_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, lwe_array_out->num_radix_blocks,
|
||||
params.pbs_type, 0, 0);
|
||||
lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
|
||||
0);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
/// In apply noise squashing we always use trivial indexes
|
||||
|
||||
@@ -128,59 +128,51 @@ void cuda_integer_mult_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int8_t *mem_ptr, uint32_t polynomial_size, uint32_t num_blocks) {
|
||||
void *const *bsks, void *const *ksks, int8_t *mem_ptr,
|
||||
uint32_t polynomial_size, uint32_t num_blocks) {
|
||||
PUSH_RANGE("mul")
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<256>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 512:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<512>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 1024:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<1024>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 2048:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<2048>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 4096:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<4096>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 8192:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<8192>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
case 16384:
|
||||
host_integer_mult_radix_kb<uint64_t, AmortizedDegree<16384>>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
|
||||
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, (int_mul_memory<uint64_t> *)mem_ptr,
|
||||
num_blocks);
|
||||
radix_lwe_right, is_bool_right, bsks, (uint32_t **)(ksks),
|
||||
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
|
||||
@@ -225,8 +217,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *radix_lwe_vec, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
void *const *ksks) {
|
||||
|
||||
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
|
||||
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
|
||||
@@ -234,8 +225,7 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
|
||||
"output's number of radix blocks")
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
|
||||
CudaStreams(streams), radix_lwe_out, radix_lwe_vec, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, mem,
|
||||
radix_lwe_out->num_radix_blocks,
|
||||
(uint32_t **)(ksks), mem, radix_lwe_out->num_radix_blocks,
|
||||
radix_lwe_vec->num_radix_blocks / radix_lwe_out->num_radix_blocks);
|
||||
}
|
||||
|
||||
|
||||
@@ -290,8 +290,7 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
CudaRadixCiphertextFFI *terms, void *const *bsks, uint32_t *const *ksks,
|
||||
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
|
||||
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
|
||||
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
|
||||
@@ -398,17 +397,15 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in,
|
||||
ksks, big_lwe_dimension, small_lwe_dimension,
|
||||
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
|
||||
total_messages);
|
||||
streams.get_ith(0), (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
|
||||
(Torus *)current_blocks->ptr, d_pbs_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, total_messages);
|
||||
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
@@ -422,7 +419,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, current_blocks, current_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_message_carry, total_ciphertexts);
|
||||
luts_message_carry, total_ciphertexts);
|
||||
}
|
||||
cuda_set_device(streams.gpu_index(0));
|
||||
std::swap(d_columns, d_new_columns);
|
||||
@@ -451,16 +448,15 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
|
||||
if (active_streams.count() == 1) {
|
||||
execute_keyswitch_async<Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, (Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
|
||||
streams.get_ith(0), (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
|
||||
(Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks,
|
||||
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
|
||||
mem_ptr->params.ks_level, num_radix_blocks);
|
||||
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams.subset_first_gpu(), (Torus *)current_blocks->ptr,
|
||||
d_pbs_indexes_out, luts_message_carry->lut_vec,
|
||||
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
|
||||
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
|
||||
streams.get_ith(0), (Torus *)current_blocks->ptr, d_pbs_indexes_out,
|
||||
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
|
||||
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
|
||||
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
|
||||
polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
@@ -474,7 +470,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
active_streams, current_blocks, radix_lwe_out, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_message_carry, num_blocks_in_apply_lut);
|
||||
luts_message_carry, num_blocks_in_apply_lut);
|
||||
}
|
||||
calculate_final_degrees(radix_lwe_out->degrees, terms->degrees,
|
||||
num_radix_blocks, num_radix_in_vec, chunk_size,
|
||||
@@ -496,9 +492,8 @@ __host__ void host_integer_mult_radix_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
|
||||
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
|
||||
void *const *bsks, uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
int_mul_memory<Torus> *mem_ptr, uint32_t num_blocks) {
|
||||
void *const *bsks, uint32_t *const *ksks, int_mul_memory<Torus> *mem_ptr,
|
||||
uint32_t num_blocks) {
|
||||
|
||||
if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension ||
|
||||
radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension)
|
||||
@@ -516,14 +511,14 @@ __host__ void host_integer_mult_radix_kb(
|
||||
if (is_bool_right) {
|
||||
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_left, radix_lwe_right,
|
||||
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
|
||||
bsks, ksks, ms_noise_reduction_key, num_blocks);
|
||||
bsks, ksks, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
if (is_bool_left) {
|
||||
zero_out_if<Torus>(streams, radix_lwe_out, radix_lwe_right, radix_lwe_left,
|
||||
mem_ptr->zero_out_mem, mem_ptr->zero_out_predicate_lut,
|
||||
bsks, ksks, ms_noise_reduction_key, num_blocks);
|
||||
bsks, ksks, num_blocks);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -592,8 +587,7 @@ __host__ void host_integer_mult_radix_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, block_mul_res, block_mul_res, vector_result_sb, bsks, ksks,
|
||||
ms_noise_reduction_key, luts_array, total_block_count,
|
||||
luts_array->params.message_modulus);
|
||||
luts_array, total_block_count, luts_array->params.message_modulus);
|
||||
|
||||
vector_result_lsb = block_mul_res;
|
||||
as_radix_ciphertext_slice<Torus>(&vector_result_msb, block_mul_res,
|
||||
@@ -621,15 +615,14 @@ __host__ void host_integer_mult_radix_kb(
|
||||
}
|
||||
host_integer_partial_sum_ciphertexts_vec_kb<Torus>(
|
||||
streams, radix_lwe_out, vector_result_sb, bsks, ksks,
|
||||
ms_noise_reduction_key, mem_ptr->sum_ciphertexts_mem, num_blocks,
|
||||
2 * num_blocks);
|
||||
mem_ptr->sum_ciphertexts_mem, num_blocks, 2 * num_blocks);
|
||||
|
||||
auto scp_mem_ptr = mem_ptr->sc_prop_mem;
|
||||
uint32_t requested_flag = outputFlag::FLAG_NONE;
|
||||
uint32_t uses_carry = 0;
|
||||
host_propagate_single_carry<Torus>(
|
||||
streams, radix_lwe_out, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, requested_flag, uses_carry);
|
||||
host_propagate_single_carry<Torus>(streams, radix_lwe_out, nullptr, nullptr,
|
||||
scp_mem_ptr, bsks, ksks, requested_flag,
|
||||
uses_carry);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -126,7 +126,7 @@ __host__ uint64_t scratch_cuda_integer_overflowing_sub_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void host_integer_overflowing_sub(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI *input_left,
|
||||
@@ -134,9 +134,7 @@ __host__ void host_integer_overflowing_sub(
|
||||
CudaRadixCiphertextFFI *overflow_block,
|
||||
const CudaRadixCiphertextFFI *input_borrow,
|
||||
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t compute_overflow, uint32_t uses_input_borrow) {
|
||||
KSTorus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
|
||||
PUSH_RANGE("overflowing sub")
|
||||
if (output->num_radix_blocks != input_left->num_radix_blocks ||
|
||||
output->num_radix_blocks != input_right->num_radix_blocks)
|
||||
@@ -166,7 +164,7 @@ __host__ void host_integer_overflowing_sub(
|
||||
host_single_borrow_propagate<Torus>(
|
||||
streams, output, overflow_block, input_borrow,
|
||||
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
|
||||
ms_noise_reduction_key, num_groups, compute_overflow, uses_input_borrow);
|
||||
num_groups, compute_overflow, uses_input_borrow);
|
||||
POP_RANGE()
|
||||
}
|
||||
|
||||
|
||||
@@ -5,10 +5,9 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
|
||||
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t grouping_factor, uint32_t num_blocks_to_process,
|
||||
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, bool allocate_gpu_memory,
|
||||
uint32_t message_bits_per_block, uint32_t total_random_bits,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
|
||||
bool allocate_gpu_memory, uint32_t message_bits_per_block,
|
||||
uint32_t total_random_bits, PBS_MS_REDUCTION_T noise_reduction_type) {
|
||||
|
||||
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
|
||||
glwe_dimension * polynomial_size, lwe_dimension,
|
||||
@@ -18,20 +17,19 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
|
||||
|
||||
return scratch_cuda_integer_grouped_oprf<uint64_t>(
|
||||
CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
|
||||
params, num_blocks_to_process, num_blocks, message_bits_per_block,
|
||||
total_random_bits, allocate_gpu_memory);
|
||||
params, num_blocks_to_process, message_bits_per_block, total_random_bits,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cuda_integer_grouped_oprf_async_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input, uint32_t num_blocks_to_process, int8_t *mem,
|
||||
void *const *bsks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
void cuda_integer_grouped_oprf_async_64(CudaStreamsFFI streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const void *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process,
|
||||
int8_t *mem, void *const *bsks) {
|
||||
|
||||
host_integer_grouped_oprf<uint64_t>(
|
||||
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
|
||||
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks,
|
||||
ms_noise_reduction_key);
|
||||
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks);
|
||||
}
|
||||
|
||||
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
|
||||
|
||||
@@ -8,34 +8,34 @@ template <typename Torus>
|
||||
uint64_t scratch_cuda_integer_grouped_oprf(
|
||||
CudaStreams streams, int_grouped_oprf_memory<Torus> **mem_ptr,
|
||||
int_radix_params params, uint32_t num_blocks_to_process,
|
||||
uint32_t num_blocks, uint32_t message_bits_per_block,
|
||||
uint64_t total_random_bits, bool allocate_gpu_memory) {
|
||||
uint32_t message_bits_per_block, uint64_t total_random_bits,
|
||||
bool allocate_gpu_memory) {
|
||||
uint64_t size_tracker = 0;
|
||||
|
||||
*mem_ptr = new int_grouped_oprf_memory<Torus>(
|
||||
streams, params, num_blocks_to_process, num_blocks,
|
||||
message_bits_per_block, total_random_bits, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
streams, params, num_blocks_to_process, message_bits_per_block,
|
||||
total_random_bits, allocate_gpu_memory, size_tracker);
|
||||
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void host_integer_grouped_oprf(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const Torus *seeded_lwe_input, uint32_t num_blocks_to_process,
|
||||
int_grouped_oprf_memory<Torus> *mem_ptr, void *const *bsks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
void host_integer_grouped_oprf(CudaStreams streams,
|
||||
CudaRadixCiphertextFFI *radix_lwe_out,
|
||||
const Torus *seeded_lwe_input,
|
||||
uint32_t num_blocks_to_process,
|
||||
int_grouped_oprf_memory<Torus> *mem_ptr,
|
||||
void *const *bsks) {
|
||||
|
||||
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
|
||||
auto lut = mem_ptr->luts;
|
||||
|
||||
if (active_streams.count() == 1) {
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
streams, (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
|
||||
streams.get_ith(0), (Torus *)(radix_lwe_out->ptr), lut->lwe_indexes_out,
|
||||
lut->lut_vec, lut->lut_indexes_vec,
|
||||
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
|
||||
ms_noise_reduction_key, lut->buffer, mem_ptr->params.glwe_dimension,
|
||||
lut->buffer, mem_ptr->params.glwe_dimension,
|
||||
mem_ptr->params.small_lwe_dimension, mem_ptr->params.polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, num_blocks_to_process,
|
||||
@@ -45,48 +45,35 @@ void host_integer_grouped_oprf(
|
||||
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
|
||||
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
|
||||
|
||||
cuda_event_record(lut->event_scatter_in, streams.stream(0),
|
||||
streams.gpu_index(0));
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(j), lut->event_scatter_in,
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
|
||||
if (!lut->using_trivial_lwe_indexes) {
|
||||
PANIC("lut->using_trivial_lwe_indexes should be true");
|
||||
}
|
||||
lut->multi_gpu_scatter_barrier.local_streams_wait_for_stream_0(
|
||||
active_streams);
|
||||
|
||||
PUSH_RANGE("scatter")
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
active_streams, lwe_array_in_vec, seeded_lwe_input, lut->lwe_indexes_in,
|
||||
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
|
||||
active_streams.count(), num_blocks_to_process,
|
||||
mem_ptr->params.small_lwe_dimension + 1);
|
||||
POP_RANGE()
|
||||
|
||||
execute_pbs_async<Torus, Torus>(
|
||||
active_streams, lwe_after_pbs_vec, lwe_trivial_indexes_vec,
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_array_in_vec,
|
||||
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key, lut->buffer,
|
||||
lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
mem_ptr->params.glwe_dimension, mem_ptr->params.small_lwe_dimension,
|
||||
mem_ptr->params.polynomial_size, mem_ptr->params.pbs_base_log,
|
||||
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
|
||||
num_blocks_to_process, mem_ptr->params.pbs_type, 1, 0);
|
||||
|
||||
PUSH_RANGE("gather")
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
active_streams, (Torus *)radix_lwe_out->ptr, lwe_after_pbs_vec,
|
||||
lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
|
||||
lut->lwe_aligned_vec, num_blocks_to_process,
|
||||
mem_ptr->params.big_lwe_dimension + 1);
|
||||
|
||||
// other gpus record their events
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_event_record(lut->event_scatter_out[j], streams.stream(j),
|
||||
streams.gpu_index(j));
|
||||
}
|
||||
// GPU 0 waits for all
|
||||
for (int j = 1; j < active_streams.count(); j++) {
|
||||
cuda_stream_wait_event(streams.stream(0), lut->event_scatter_out[j],
|
||||
streams.gpu_index(0));
|
||||
}
|
||||
POP_RANGE()
|
||||
lut->multi_gpu_gather_barrier.stream_0_wait_for_local_streams(
|
||||
active_streams);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_blocks_to_process; i++) {
|
||||
|
||||
@@ -7,6 +7,13 @@
|
||||
#include "utils/helper_profile.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
inline CudaLweCiphertextListFFI
|
||||
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
|
||||
return {.ptr = radix->ptr,
|
||||
.num_radix_blocks = radix->num_radix_blocks,
|
||||
.lwe_dimension = radix->lwe_dimension};
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void create_zero_radix_ciphertext_async(cudaStream_t const stream,
|
||||
uint32_t const gpu_index,
|
||||
|
||||
@@ -4,15 +4,13 @@ void cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_input, void const *clear_blocks,
|
||||
void const *h_clear_blocks, uint32_t num_clear_blocks, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
void *const *bsks, void *const *ksks) {
|
||||
|
||||
host_integer_radix_scalar_bitop_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_input,
|
||||
static_cast<const uint64_t *>(clear_blocks),
|
||||
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key);
|
||||
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks));
|
||||
}
|
||||
|
||||
void update_degrees_after_scalar_bitand(uint64_t *output_degrees,
|
||||
|
||||
@@ -4,13 +4,12 @@
|
||||
#include "integer/bitwise_ops.cuh"
|
||||
#include <omp.h>
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void host_integer_radix_scalar_bitop_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *output,
|
||||
CudaRadixCiphertextFFI const *input, Torus const *clear_blocks,
|
||||
Torus const *h_clear_blocks, uint32_t num_clear_blocks,
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks, KSTorus *const *ksks) {
|
||||
|
||||
if (output->num_radix_blocks != input->num_radix_blocks)
|
||||
PANIC("Cuda error: input and output num radix blocks must be equal")
|
||||
@@ -50,8 +49,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
|
||||
lut->broadcast_lut(active_streams, false);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, output, input, bsks, ksks, ms_noise_reduction_key, lut,
|
||||
num_clear_blocks);
|
||||
streams, output, input, bsks, ksks, lut, num_clear_blocks);
|
||||
memcpy(output->degrees, degrees, num_clear_blocks * sizeof(uint64_t));
|
||||
|
||||
if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) {
|
||||
|
||||
@@ -35,9 +35,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, void const *scalar_blocks,
|
||||
void const *h_scalar_blocks, int8_t *mem_ptr, void *const *bsks,
|
||||
void *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_scalar_blocks) {
|
||||
void *const *ksks, uint32_t num_scalar_blocks) {
|
||||
|
||||
// The output ciphertext might be a boolean block or a radix ciphertext
|
||||
// depending on the case (eq/gt vs max/min) so the amount of blocks to
|
||||
@@ -51,8 +49,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
host_integer_radix_scalar_equality_check_kb<uint64_t>(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks), buffer, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
(uint32_t **)(ksks), num_radix_blocks, num_scalar_blocks);
|
||||
break;
|
||||
case GT:
|
||||
case GE:
|
||||
@@ -65,8 +62,8 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer,
|
||||
buffer->diff_buffer->operator_f, bsks, (uint64_t **)(ksks),
|
||||
ms_noise_reduction_key, num_radix_blocks, num_scalar_blocks);
|
||||
buffer->diff_buffer->operator_f, bsks, (uint32_t **)(ksks),
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
break;
|
||||
case MAX:
|
||||
case MIN:
|
||||
@@ -77,8 +74,7 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
|
||||
CudaStreams(streams), lwe_array_out, lwe_array_in,
|
||||
static_cast<const uint64_t *>(scalar_blocks),
|
||||
static_cast<const uint64_t *>(h_scalar_blocks), buffer, bsks,
|
||||
(uint64_t **)(ksks), ms_noise_reduction_key, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
(uint32_t **)(ksks), num_radix_blocks, num_scalar_blocks);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
|
||||
@@ -24,14 +24,12 @@ Torus is_x_less_than_y_given_input_borrow(Torus last_x_block,
|
||||
return output_sign_bit ^ overflow_flag;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void scalar_compare_radix_blocks_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks) {
|
||||
KSTorus *const *ksks, uint32_t num_radix_blocks) {
|
||||
|
||||
if (num_radix_blocks == 0)
|
||||
return;
|
||||
@@ -71,8 +69,8 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
// Apply LUT to compare to 0
|
||||
auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, subtracted_blocks, bsks, ksks,
|
||||
ms_noise_reduction_key, sign_lut, num_radix_blocks);
|
||||
streams, lwe_array_out, subtracted_blocks, bsks, ksks, sign_lut,
|
||||
num_radix_blocks);
|
||||
|
||||
// FIXME: without this sync signed scalar eq tests fail, I don't understand
|
||||
// the reason
|
||||
@@ -84,15 +82,13 @@ __host__ void scalar_compare_radix_blocks_kb(
|
||||
streams, lwe_array_out, message_modulus, carry_modulus);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
KSTorus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
if (lwe_array_in->num_radix_blocks < num_radix_blocks)
|
||||
@@ -132,11 +128,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
// means scalar is zero
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, mem_ptr->tmp_lwe_array_out, lwe_array_in, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_lwe_array_out,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->tmp_lwe_array_out->num_radix_blocks);
|
||||
mem_ptr, bsks, ksks, mem_ptr->tmp_lwe_array_out->num_radix_blocks);
|
||||
|
||||
auto scalar_last_leaf_lut_f = [sign_handler_f](Torus x) -> Torus {
|
||||
x = (x == 1 ? IS_EQUAL : IS_SUPERIOR);
|
||||
@@ -154,8 +149,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, bsks, ksks, lut, 1);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
// We have to handle both part of the work described above
|
||||
@@ -207,7 +201,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -215,15 +209,15 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer,
|
||||
mem_ptr->identity_lut_f, bsks, ksks,
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, &lwe_array_msb_out, &msb, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, &lwe_array_msb_out, &lwe_array_msb_out, mem_ptr, bsks,
|
||||
ksks, ms_noise_reduction_key, lwe_array_msb_out.num_radix_blocks);
|
||||
ksks, lwe_array_msb_out.num_radix_blocks);
|
||||
lsb_streams.synchronize();
|
||||
msb_streams.synchronize();
|
||||
|
||||
@@ -250,7 +244,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_lsb_out, &lwe_array_msb_out, bsks,
|
||||
ksks, ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
|
||||
ksks, lut, 1, lut->params.message_modulus);
|
||||
|
||||
} else {
|
||||
if (num_radix_blocks == 1) {
|
||||
@@ -283,8 +277,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
one_block_lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks,
|
||||
ms_noise_reduction_key, one_block_lut, 1);
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
|
||||
one_block_lut->release(streams);
|
||||
delete one_block_lut;
|
||||
} else {
|
||||
@@ -314,7 +307,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
auto comparisons = mem_ptr->tmp_lwe_array_out;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -322,20 +315,18 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(streams, lwe_array_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer,
|
||||
sign_handler_f, bsks, ksks,
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
num_lsb_radix_blocks);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
KSTorus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -376,10 +367,10 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto are_all_msb_zeros = mem_ptr->tmp_lwe_array_out;
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, are_all_msb_zeros, lwe_array_in, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
num_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, are_all_msb_zeros, are_all_msb_zeros, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, are_all_msb_zeros->num_radix_blocks);
|
||||
are_all_msb_zeros->num_radix_blocks);
|
||||
CudaRadixCiphertextFFI sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(&sign_block, lwe_array_in,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
@@ -430,8 +421,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1, lut->params.message_modulus);
|
||||
streams, lwe_array_out, are_all_msb_zeros, &sign_block, bsks, ksks, lut,
|
||||
1, lut->params.message_modulus);
|
||||
|
||||
} else if (num_scalar_blocks < num_radix_blocks) {
|
||||
// We have to handle both part of the work described above
|
||||
@@ -477,7 +468,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
auto comparisons = mem_ptr->tmp_block_comparisons;
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, comparisons, diff_buffer->tmp_packed, (Torus *)rhs.ptr,
|
||||
mem_ptr, bsks, ksks, ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
|
||||
// Reduces a vec containing radix blocks that encrypts a sign
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
@@ -485,17 +476,17 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
tree_sign_reduction<Torus>(lsb_streams, lwe_array_lsb_out, comparisons,
|
||||
mem_ptr->diff_buffer->tree_buffer,
|
||||
mem_ptr->identity_lut_f, bsks, ksks,
|
||||
ms_noise_reduction_key, num_lsb_radix_blocks);
|
||||
num_lsb_radix_blocks);
|
||||
//////////////
|
||||
// msb
|
||||
// We remove the last block (which is the sign)
|
||||
auto are_all_msb_zeros = lwe_array_msb_out;
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, &are_all_msb_zeros, &msb, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
num_msb_radix_blocks, mem_ptr->is_zero_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, &are_all_msb_zeros, &are_all_msb_zeros, mem_ptr, bsks,
|
||||
ksks, ms_noise_reduction_key, are_all_msb_zeros.num_radix_blocks);
|
||||
ksks, are_all_msb_zeros.num_radix_blocks);
|
||||
|
||||
auto sign_bit_pos = (int)log2(message_modulus) - 1;
|
||||
|
||||
@@ -536,15 +527,14 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
&sign_block, &msb, num_msb_radix_blocks - 1, num_msb_radix_blocks);
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, &lwe_array_msb_out, &sign_block, &are_all_msb_zeros, bsks,
|
||||
ksks, ms_noise_reduction_key, signed_msb_lut, 1,
|
||||
signed_msb_lut->params.message_modulus);
|
||||
ksks, signed_msb_lut, 1, signed_msb_lut->params.message_modulus);
|
||||
lsb_streams.synchronize();
|
||||
msb_streams.synchronize();
|
||||
|
||||
//////////////
|
||||
// Reduce the two blocks into one final
|
||||
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_lsb_out, mem_ptr,
|
||||
sign_handler_f, bsks, ksks, ms_noise_reduction_key, 2);
|
||||
sign_handler_f, bsks, ksks, 2);
|
||||
|
||||
} else {
|
||||
if (num_radix_blocks == 1) {
|
||||
@@ -579,8 +569,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
one_block_lut->broadcast_lut(active_streams);
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks,
|
||||
ms_noise_reduction_key, one_block_lut, 1);
|
||||
streams, lwe_array_out, lwe_array_in, bsks, ksks, one_block_lut, 1);
|
||||
one_block_lut->release(streams);
|
||||
delete one_block_lut;
|
||||
} else {
|
||||
@@ -619,8 +608,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// - 2 if lhs > rhs
|
||||
scalar_compare_radix_blocks_kb<Torus>(
|
||||
lsb_streams, lwe_array_ct_out, diff_buffer->tmp_packed,
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, ms_noise_reduction_key,
|
||||
num_lsb_radix_blocks);
|
||||
(Torus *)rhs.ptr, mem_ptr, bsks, ksks, num_lsb_radix_blocks);
|
||||
CudaRadixCiphertextFFI encrypted_sign_block;
|
||||
as_radix_ciphertext_slice<Torus>(&encrypted_sign_block, lwe_array_in,
|
||||
num_radix_blocks - 1, num_radix_blocks);
|
||||
@@ -636,8 +624,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
msb_streams, &lwe_array_sign_out, &encrypted_sign_block,
|
||||
trivial_sign_block, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->signed_lut, 1, mem_ptr->signed_lut->params.message_modulus);
|
||||
trivial_sign_block, bsks, ksks, mem_ptr->signed_lut, 1,
|
||||
mem_ptr->signed_lut->params.message_modulus);
|
||||
lsb_streams.synchronize();
|
||||
msb_streams.synchronize();
|
||||
|
||||
@@ -645,21 +633,18 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
|
||||
// (inferior, equal, superior) to one single radix block containing the
|
||||
// final sign
|
||||
reduce_signs<Torus>(streams, lwe_array_out, lwe_array_ct_out, mem_ptr,
|
||||
sign_handler_f, bsks, ksks, ms_noise_reduction_key,
|
||||
num_lsb_radix_blocks + 1);
|
||||
sign_handler_f, bsks, ksks, num_lsb_radix_blocks + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
KSTorus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input lwe dimensions must be the same")
|
||||
@@ -671,24 +656,23 @@ __host__ void host_integer_radix_scalar_difference_check_kb(
|
||||
// is signed and scalar is positive
|
||||
integer_radix_signed_scalar_difference_check_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
|
||||
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
} else {
|
||||
integer_radix_unsigned_scalar_difference_check_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_in, scalar_blocks, h_scalar_blocks,
|
||||
mem_ptr, sign_handler_f, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
mem_ptr, sign_handler_f, bsks, ksks, num_radix_blocks,
|
||||
num_scalar_blocks);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
|
||||
void *const *bsks, Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
void *const *bsks, KSTorus *const *ksks, uint32_t num_radix_blocks,
|
||||
uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -706,8 +690,7 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
auto sign = mem_ptr->tmp_lwe_array_out;
|
||||
host_integer_radix_scalar_difference_check_kb<Torus>(
|
||||
streams, sign, lwe_array_in, scalar_blocks, h_scalar_blocks, mem_ptr,
|
||||
mem_ptr->identity_lut_f, bsks, ksks, ms_noise_reduction_key,
|
||||
num_radix_blocks, num_scalar_blocks);
|
||||
mem_ptr->identity_lut_f, bsks, ksks, num_radix_blocks, num_scalar_blocks);
|
||||
|
||||
// There is no optimized CMUX for scalars, so we convert to a trivial
|
||||
// ciphertext
|
||||
@@ -721,20 +704,17 @@ __host__ void host_integer_radix_scalar_maxmin_kb(
|
||||
|
||||
// Selector
|
||||
// CMUX for Max or Min
|
||||
host_integer_radix_cmux_kb<Torus>(streams, lwe_array_out,
|
||||
mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks,
|
||||
ksks, ms_noise_reduction_key);
|
||||
host_integer_radix_cmux_kb<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, lwe_array_left,
|
||||
lwe_array_right, mem_ptr->cmux_buffer, bsks, ksks);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename Torus, typename KSTorus>
|
||||
__host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
|
||||
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
|
||||
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
|
||||
Torus *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
KSTorus *const *ksks, uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
|
||||
|
||||
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimensions must be the same")
|
||||
@@ -807,8 +787,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
lsb_streams, mem_ptr->tmp_lwe_array_out, mem_ptr->tmp_packed_input,
|
||||
bsks, ksks, ms_noise_reduction_key, scalar_comparison_luts,
|
||||
num_halved_lsb_radix_blocks);
|
||||
bsks, ksks, scalar_comparison_luts, num_halved_lsb_radix_blocks);
|
||||
}
|
||||
//////////////
|
||||
// msb_in
|
||||
@@ -825,12 +804,12 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
PANIC("Cuda error: integer operation not supported")
|
||||
}
|
||||
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
msb_streams, &msb_out, &msb_in, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, num_msb_radix_blocks, msb_lut);
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
msb_streams, &msb_out, &msb_out, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key, msb_out.num_radix_blocks);
|
||||
host_compare_blocks_with_zero<Torus>(msb_streams, &msb_out, &msb_in,
|
||||
mem_ptr, bsks, ksks,
|
||||
num_msb_radix_blocks, msb_lut);
|
||||
are_all_comparisons_block_true<Torus>(msb_streams, &msb_out, &msb_out,
|
||||
mem_ptr, bsks, ksks,
|
||||
msb_out.num_radix_blocks);
|
||||
}
|
||||
|
||||
lsb_streams.synchronize();
|
||||
@@ -840,13 +819,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
|
||||
case COMPARISON_TYPE::EQ:
|
||||
are_all_comparisons_block_true<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
case COMPARISON_TYPE::NE:
|
||||
is_at_least_one_comparisons_block_true<Torus>(
|
||||
streams, lwe_array_out, mem_ptr->tmp_lwe_array_out, mem_ptr, bsks, ksks,
|
||||
ms_noise_reduction_key,
|
||||
num_halved_scalar_blocks + (num_msb_radix_blocks > 0));
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -24,13 +24,12 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
void cuda_integer_unsigned_scalar_div_radix_kb_64(
|
||||
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
|
||||
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
|
||||
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
|
||||
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
|
||||
|
||||
host_integer_unsigned_scalar_div_radix<uint64_t>(
|
||||
CudaStreams(streams), numerator_ct,
|
||||
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
|
||||
ms_noise_reduction_key, scalar_divisor_ffi);
|
||||
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint32_t **)ksks,
scalar_divisor_ffi);
}

void cleanup_cuda_integer_unsigned_scalar_div_radix_kb_64(

@@ -69,13 +68,12 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_kb_64(
void cuda_integer_signed_scalar_div_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *numerator_ct,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {

host_integer_signed_scalar_div_radix_kb<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint32_t **)ksks,
scalar_divisor_ffi, numerator_bits);
}

void cleanup_cuda_integer_signed_scalar_div_radix_kb_64(CudaStreamsFFI streams,

@@ -115,9 +113,7 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_kb_64(
void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
const CudaModulusSwitchNoiseReductionKeyFFI *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
void const *clear_blocks, void const *h_clear_blocks,

@@ -126,9 +122,9 @@ void cuda_integer_unsigned_scalar_div_rem_radix_kb_64(
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
(uint64_t *)clear_blocks, (uint64_t *)h_clear_blocks, num_clear_blocks);
(uint32_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, (uint64_t *)clear_blocks,
(uint64_t *)h_clear_blocks, num_clear_blocks);
}

void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_kb_64(

@@ -168,9 +164,7 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_kb_64(
void cuda_integer_signed_scalar_div_rem_radix_kb_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {

@@ -178,9 +172,8 @@ void cuda_integer_signed_scalar_div_rem_radix_kb_64(
host_integer_signed_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, ms_noise_reduction_key, scalar_divisor_ffi,
divisor_has_at_least_one_set, decomposed_divisor, num_scalars_divisor,
numerator_bits);
(uint32_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, numerator_bits);
}

void cleanup_cuda_integer_signed_scalar_div_rem_radix_kb_64(
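The pattern in the hunks above repeats across the whole comparison: the 64-bit FFI wrappers now cast the key-switching keys to uint32_t ** instead of uint64_t **, the ms_noise_reduction_key argument disappears from the integer entry points, and the host templates gain a second KSTorus type parameter so the key-switching key width is independent of the ciphertext Torus width. A minimal sketch of that template shape follows; the names are hypothetical and it is not code from this branch.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Sketch: one host routine usable with either a 32-bit or a 64-bit
// key-switching key while the ciphertext Torus stays 64-bit.
template <typename Torus, typename KSTorus>
void host_op(const Torus *ct, std::size_t blocks, KSTorus *const *ksks) {
  // A real implementation would key-switch `ct` with `ksks`; here we only
  // show that both key widths instantiate the same template.
  std::printf("blocks=%zu ks_word_bytes=%zu\n", blocks, sizeof(KSTorus));
}

int main() {
  uint64_t ct[4] = {};
  uint32_t *ks32[1] = {nullptr};
  uint64_t *ks64[1] = {nullptr};
  host_op<uint64_t, uint32_t>(ct, 4, ks32); // the new 32-bit keyswitch path
  host_op<uint64_t, uint64_t>(ct, 4, ks64); // the previous 64-bit path
  return 0;
}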
@@ -23,13 +23,11 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_unsigned_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {

if (scalar_divisor_ffi->is_abs_divisor_one) {
return;

@@ -38,7 +36,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
if (scalar_divisor_ffi->is_divisor_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
return;
}

@@ -65,26 +63,24 @@ __host__ void host_integer_unsigned_scalar_div_radix(
numerator_cpy, numerator_ct);

host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
streams, numerator_cpy, mem_ptr->scalar_mul_high_mem, ksks, bsks,
scalar_divisor_ffi);

host_sub_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, ms_noise_reduction_key, numerator_ct->num_radix_blocks);
bsks, ksks, numerator_ct->num_radix_blocks);

host_add_and_propagate_single_carry<Torus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ms_noise_reduction_key, FLAG_NONE,
(uint32_t)0);
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post - (uint32_t)1,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);

return;

@@ -92,16 +88,16 @@ __host__ void host_integer_unsigned_scalar_div_radix(

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_pre,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);

host_integer_radix_scalar_mul_high_kb<Torus>(
streams, numerator_ct, mem_ptr->scalar_mul_high_mem, ksks,
ms_noise_reduction_key, bsks, scalar_divisor_ffi);
host_integer_radix_scalar_mul_high_kb<Torus>(streams, numerator_ct,
mem_ptr->scalar_mul_high_mem,
ksks, bsks, scalar_divisor_ffi);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, numerator_ct, scalar_divisor_ffi->shift_post,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
numerator_ct->num_radix_blocks);
}

@@ -121,13 +117,12 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_signed_scalar_div_radix_kb(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t numerator_bits) {

if (scalar_divisor_ffi->is_abs_divisor_one) {
if (scalar_divisor_ffi->is_divisor_negative) {

@@ -158,23 +153,20 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp,
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
tmp->num_radix_blocks);
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);

host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
ksks, FLAG_NONE, (uint32_t)0);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

} else if (!scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),

@@ -182,12 +174,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
bsks);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),

@@ -195,12 +186,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

host_sub_and_propagate_single_carry<Torus>(
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
bsks, ksks, FLAG_NONE, (uint32_t)0);

} else {

@@ -209,16 +199,15 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

host_integer_radix_signed_scalar_mul_high_kb<Torus>(
streams, tmp, mem_ptr->scalar_mul_high_mem, ksks, scalar_divisor_ffi,
ms_noise_reduction_key, bsks);
bsks);

host_add_and_propagate_single_carry<Torus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
ksks, FLAG_NONE, (uint32_t)0);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, tmp, scalar_divisor_ffi->shift_post,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

CudaRadixCiphertextFFI *xsign = mem_ptr->xsign_ffi;
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),

@@ -226,12 +215,11 @@ __host__ void host_integer_signed_scalar_div_radix_kb(

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, xsign, numerator_bits - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks,
ms_noise_reduction_key);
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks);

host_sub_and_propagate_single_carry<Torus>(
streams, tmp, xsign, nullptr, nullptr, mem_ptr->sub_and_propagate_mem,
bsks, ksks, ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
bsks, ksks, FLAG_NONE, (uint32_t)0);
}

if (scalar_divisor_ffi->is_divisor_negative) {

@@ -258,14 +246,12 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
Torus const *clear_blocks, Torus const *h_clear_blocks,

@@ -275,18 +261,17 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);

host_integer_unsigned_scalar_div_radix(
streams, quotient_ct, mem_ptr->unsigned_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi);
host_integer_unsigned_scalar_div_radix(streams, quotient_ct,
mem_ptr->unsigned_div_mem, bsks, ksks,
scalar_divisor_ffi);

if (scalar_divisor_ffi->is_divisor_pow2) {

copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
host_integer_radix_scalar_bitop_kb(streams, remainder_ct, remainder_ct,
clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem,
bsks, ksks, ms_noise_reduction_key);
host_integer_radix_scalar_bitop_kb(
streams, remainder_ct, remainder_ct, clear_blocks, h_clear_blocks,
num_clear_blocks, mem_ptr->bitop_mem, bsks, ksks);

} else {
if (!scalar_divisor_ffi->is_divisor_zero) {

@@ -299,15 +284,13 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
host_integer_scalar_mul_radix<Torus>(
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
mem_ptr->params.message_modulus, num_scalars_divisor);
}
}

host_sub_and_propagate_single_carry(
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);

copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);

@@ -330,14 +313,12 @@ __host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_signed_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {

@@ -346,13 +327,13 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
numerator_ct, quotient_ct);

host_integer_signed_scalar_div_radix_kb(
streams, quotient_ct, mem_ptr->signed_div_mem, bsks, ksks,
ms_noise_reduction_key, scalar_divisor_ffi, numerator_bits);
host_integer_signed_scalar_div_radix_kb(streams, quotient_ct,
mem_ptr->signed_div_mem, bsks, ksks,
scalar_divisor_ffi, numerator_bits);

host_propagate_single_carry<Torus>(
streams, quotient_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks, ksks,
ms_noise_reduction_key, FLAG_NONE, (uint32_t)0);
host_propagate_single_carry<Torus>(streams, quotient_ct, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE,
(uint32_t)0);

if (!scalar_divisor_ffi->is_divisor_negative &&
scalar_divisor_ffi->is_divisor_pow2) {

@@ -361,7 +342,7 @@ __host__ void host_integer_signed_scalar_div_rem_radix(

host_integer_radix_logical_scalar_shift_kb_inplace(
streams, remainder_ct, scalar_divisor_ffi->ilog2_divisor,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->logical_scalar_shift_mem, bsks, ksks,
remainder_ct->num_radix_blocks);

} else if (!scalar_divisor_ffi->is_divisor_zero) {

@@ -375,15 +356,13 @@ __host__ void host_integer_signed_scalar_div_rem_radix(
host_integer_scalar_mul_radix<Torus>(
streams, remainder_ct, decomposed_divisor,
divisor_has_at_least_one_set, mem_ptr->scalar_mul_mem, bsks, ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
num_scalars_divisor);
mem_ptr->params.message_modulus, num_scalars_divisor);
}
}

host_sub_and_propagate_single_carry(
streams, numerator_ct, remainder_ct, nullptr, nullptr,
mem_ptr->sub_and_propagate_mem, bsks, ksks, ms_noise_reduction_key,
FLAG_NONE, (uint32_t)0);
mem_ptr->sub_and_propagate_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);

copy_radix_ciphertext_async<Torus>(streams.stream(0), streams.gpu_index(0),
remainder_ct, numerator_ct);
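The host division above is the encrypted counterpart of the classical divide-by-a-constant-via-multiply-high trick: pre-shift, multiply by a precomputed chosen multiplier keeping only the high word, then post-shift, with an extra subtract, halve and add-back round when the multiplier does not fit in the word. A cleartext sketch of that last branch for 32-bit numerators; the constants are illustrative and are not taken from the CudaScalarDivisorFFI fields computed on the Rust side.

#include <cassert>
#include <cstdint>

// Divide a 32-bit value by 7 without a division instruction, mirroring the
// "chosen multiplier >= 2^numerator_bits" branch above: mul-high, subtract,
// halve, add back, final shift.
static uint32_t div7(uint32_t n) {
  const uint64_t m_low = 613566757; // ceil(2^35 / 7) - 2^32
  uint32_t mul_hi = (uint32_t)((n * m_low) >> 32);
  uint32_t q = (((n - mul_hi) >> 1) + mul_hi) >> 2; // shift_post - 1 == 2
  return q;
}

int main() {
  for (uint32_t n = 0; n < 1000000; ++n)
    assert(div7(n) == n / 7);
  assert(div7(0xFFFFFFFFu) == 0xFFFFFFFFu / 7);
  return 0;
}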
@@ -22,15 +22,13 @@ uint64_t scratch_cuda_integer_scalar_mul_kb_64(
void cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
uint64_t const *decomposed_scalar, uint64_t const *has_at_least_one_set,
int8_t *mem, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t num_scalars) {
int8_t *mem, void *const *bsks, void *const *ksks, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t num_scalars) {

host_integer_scalar_mul_radix<uint64_t>(
CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, message_modulus,
num_scalars);
(uint32_t **)(ksks), message_modulus, num_scalars);
}

void cleanup_cuda_integer_radix_scalar_mul(CudaStreamsFFI streams,
@@ -41,12 +41,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_mul_kb(
return size_tracker;
}

template <typename T>
template <typename T, typename KSTorus>
__host__ void host_integer_scalar_mul_radix(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
T const *decomposed_scalar, T const *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *const *bsks, T *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_scalar_mul_buffer<T> *mem, void *const *bsks, KSTorus *const *ksks,
uint32_t message_modulus, uint32_t num_scalars) {

auto num_radix_blocks = lwe_array->num_radix_blocks;

@@ -69,7 +68,7 @@ __host__ void host_integer_scalar_mul_radix(
num_radix_blocks, lwe_array, 0, num_radix_blocks);
host_integer_radix_logical_scalar_shift_kb_inplace<T>(
streams, &shift_input, shift_amount, mem->logical_scalar_shift_buffer,
bsks, ksks, ms_noise_reduction_key, num_radix_blocks);
bsks, ksks, num_radix_blocks);
} else {
// create trivial assign for value = 0
set_zero_radix_ciphertext_slice_async<T>(

@@ -113,15 +112,14 @@ __host__ void host_integer_scalar_mul_radix(
} else {
host_integer_partial_sum_ciphertexts_vec_kb<T>(
streams, lwe_array, all_shifted_buffer, bsks, ksks,
ms_noise_reduction_key, mem->sum_ciphertexts_vec_mem, num_radix_blocks,
j);
mem->sum_ciphertexts_vec_mem, num_radix_blocks, j);

auto scp_mem_ptr = mem->sc_prop_mem;
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<T>(
streams, lwe_array, nullptr, nullptr, scp_mem_ptr, bsks, ksks,
ms_noise_reduction_key, requested_flag, uses_carry);
host_propagate_single_carry<T>(streams, lwe_array, nullptr, nullptr,
scp_mem_ptr, bsks, ksks, requested_flag,
uses_carry);
}
}

@@ -166,11 +164,10 @@ __host__ void host_integer_small_scalar_mul_radix(
}
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_scalar_mul_high_buffer<Torus> *mem_ptr, KSTorus *const *ksks,
void *const *bsks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {

if (scalar_divisor_ffi->is_chosen_multiplier_zero) {

@@ -190,30 +187,27 @@ __host__ void host_integer_radix_scalar_mul_high_kb(
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
mem_ptr->logical_scalar_shift_mem, bsks, (uint32_t **)ksks,
tmp_ffi->num_radix_blocks);

} else {

host_integer_scalar_mul_radix<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
mem_ptr->scalar_mul_mem, bsks, (uint32_t **)ksks,
mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
}
}

host_trim_radix_blocks_lsb<Torus>(ct, tmp_ffi, streams);
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_signed_scalar_mul_high_kb(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, Torus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
void *const *bsks) {
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, KSTorus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {

if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(

@@ -225,7 +219,7 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(

host_extend_radix_with_sign_msb<Torus>(
streams, tmp_ffi, ct, mem_ptr->extend_radix_mem, ct->num_radix_blocks,
bsks, (uint64_t **)ksks, ms_noise_reduction_key);
bsks, (uint32_t **)ksks);

if (scalar_divisor_ffi->active_bits != (uint32_t)0 &&
!scalar_divisor_ffi->is_abs_chosen_multiplier_one &&

@@ -234,15 +228,14 @@ __host__ void host_integer_radix_signed_scalar_mul_high_kb(
if (scalar_divisor_ffi->is_chosen_multiplier_pow2) {
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->ilog2_chosen_multiplier,
mem_ptr->logical_scalar_shift_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, tmp_ffi->num_radix_blocks);
mem_ptr->logical_scalar_shift_mem, bsks, (uint32_t **)ksks,
tmp_ffi->num_radix_blocks);
} else {
host_integer_scalar_mul_radix<Torus>(
streams, tmp_ffi, scalar_divisor_ffi->decomposed_chosen_multiplier,
scalar_divisor_ffi->chosen_multiplier_has_at_least_one_set,
mem_ptr->scalar_mul_mem, bsks, (uint64_t **)ksks,
ms_noise_reduction_key, mem_ptr->params.message_modulus,
scalar_divisor_ffi->num_scalars);
mem_ptr->scalar_mul_mem, bsks, (uint32_t **)ksks,
mem_ptr->params.message_modulus, scalar_divisor_ffi->num_scalars);
}
}
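host_integer_scalar_mul_radix follows the textbook shift-and-add decomposition: one logically shifted copy of the ciphertext per set bit of the scalar (the all_shifted_buffer), a partial-sum pass, then a single carry propagation. The cleartext analogue below is a sketch, not library code.

#include <cassert>
#include <cstdint>
#include <vector>

// One shifted copy per set scalar bit, then a single summation pass; the
// block-wise carry propagation is what host_propagate_single_carry does on
// the encrypted side.
static uint64_t scalar_mul(uint64_t x, uint64_t scalar) {
  std::vector<uint64_t> shifted; // plays the role of all_shifted_buffer
  for (unsigned bit = 0; bit < 64; ++bit)
    if ((scalar >> bit) & 1)
      shifted.push_back(x << bit); // logical scalar shift of the radix value
  uint64_t acc = 0;
  for (uint64_t v : shifted) acc += v; // partial sum + carry propagation
  return acc;
}

int main() {
  assert(scalar_mul(123, 37) == 123u * 37u);
  return 0;
}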
@@ -22,13 +22,12 @@ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb_64(

void cuda_integer_radix_scalar_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {

host_integer_radix_scalar_rotate_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint32_t **)(ksks));
}

void cleanup_cuda_integer_radix_scalar_rotate(CudaStreamsFFI streams,
@@ -24,12 +24,11 @@ __host__ uint64_t scratch_cuda_integer_radix_scalar_rotate_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_scalar_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
KSTorus *const *ksks) {

auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;

@@ -74,8 +73,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);

} else {
// rotate left as the blocks are from LSB to MSB

@@ -99,8 +97,7 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, lwe_array, receiver_blocks, giver_blocks, bsks, ksks,
ms_noise_reduction_key, lut_bivariate, num_blocks,
lut_bivariate->params.message_modulus);
lut_bivariate, num_blocks, lut_bivariate->params.message_modulus);
}
}
@@ -26,13 +26,12 @@ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
/// rotations - 1 The remaining blocks are padded with zeros
void cuda_integer_radix_logical_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {

host_integer_radix_logical_scalar_shift_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key, lwe_array->num_radix_blocks);
(uint32_t **)(ksks), lwe_array->num_radix_blocks);
}

uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(

@@ -64,13 +63,12 @@ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb_64(
/// zeros as would be done in the logical shift.
void cuda_integer_radix_arithmetic_scalar_shift_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
int8_t *mem_ptr, void *const *bsks, void *const *ksks) {

host_integer_radix_arithmetic_scalar_shift_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint32_t **)(ksks));
}

void cleanup_cuda_integer_radix_logical_scalar_shift(CudaStreamsFFI streams,
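The two 64-bit entry points above differ only in what they feed into the vacated high blocks: the logical variant pads with zeros, the arithmetic variant replicates the sign block. On clear machine integers that is the usual unsigned-versus-signed right shift, sketched below for orientation.

#include <cassert>
#include <cstdint>

int main() {
  int64_t s = -64;
  uint64_t u = static_cast<uint64_t>(s);
  // Right shift of a negative signed value is arithmetic on mainstream
  // compilers (and defined as such since C++20).
  assert((s >> 3) == -8);                       // sign bits shifted in
  assert((u >> 3) == (UINT64_C(1) << 61) - 8);  // zeros shifted in
  return 0;
}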
@@ -24,13 +24,11 @@ __host__ uint64_t scratch_cuda_integer_radix_logical_scalar_shift_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t num_blocks) {
KSTorus *const *ksks, uint32_t num_blocks) {

if (lwe_array->num_radix_blocks < num_blocks)
PANIC("Cuda error: input does not have enough blocks")

@@ -81,9 +79,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, &partial_current_blocks, &partial_current_blocks,
&partial_previous_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
&partial_previous_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);

} else {
// right shift

@@ -113,8 +110,8 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
&partial_next_blocks, bsks, ksks, lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
}
}

@@ -131,12 +128,11 @@ __host__ uint64_t scratch_cuda_integer_radix_arithmetic_scalar_shift_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
KSTorus *const *ksks) {

auto num_blocks = lwe_array->num_radix_blocks;
auto params = mem->params;

@@ -205,9 +201,8 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, partial_current_blocks, partial_current_blocks,
&partial_next_blocks, bsks, ksks, ms_noise_reduction_key,
lut_bivariate, partial_block_count,
lut_bivariate->params.message_modulus);
&partial_next_blocks, bsks, ksks, lut_bivariate,
partial_block_count, lut_bivariate->params.message_modulus);
}
// Since our CPU threads will be working on different streams we shall
// Ensure the work in the main stream is completed

@@ -216,7 +211,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_1, &padding_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_padding_block, 1);
lut_univariate_padding_block, 1);
// Replace blocks 'pulled' from the left with the correct padding
// block
for (uint i = 0; i < rotations; i++) {

@@ -230,7 +225,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem->local_streams_2, &last_block, &last_block_copy, bsks, ksks,
ms_noise_reduction_key, lut_univariate_shift_last_block, 1);
lut_univariate_shift_last_block, 1);
}

mem->local_streams_1.synchronize();
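The logical shift host code above works block-wise: whole-block rotations first, then a bivariate lookup that stitches each block together with its neighbour to realise the remaining intra-block shift. The same arithmetic on clear digits is sketched below, assuming base-4 blocks (2 message bits per block); the helper is hypothetical.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Cleartext analogue of the block-wise logical left shift: rotate whole
// blocks, then combine (current, previous) pairs, which is what the
// bivariate LUT does on encrypted blocks.
static std::vector<uint64_t> shift_left(const std::vector<uint64_t> &blocks,
                                        uint32_t shift, uint32_t bits_per_block) {
  const uint64_t mask = (1u << bits_per_block) - 1;
  const uint32_t rotations = shift / bits_per_block;
  const uint32_t within = shift % bits_per_block;
  std::vector<uint64_t> out(blocks.size(), 0);
  for (std::size_t i = 0; i < blocks.size(); ++i) {
    if (i < rotations) continue; // low blocks are padded with zeros
    uint64_t cur = blocks[i - rotations] << within;
    uint64_t prev = (within && i >= rotations + 1)
                        ? blocks[i - rotations - 1] >> (bits_per_block - within)
                        : 0;
    out[i] = (cur | prev) & mask;
  }
  return out;
}

int main() {
  // 27 stored as base-4 digits, least significant first; 27 << 3 == 216
  std::vector<uint64_t> x = {3, 2, 1, 0, 0};
  auto y = shift_left(x, 3, 2);
  uint64_t v = 0;
  for (std::size_t i = 0; i < y.size(); ++i) v |= y[i] << (2 * i);
  assert(v == (27u << 3) % (1u << 10));
  return 0;
}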
@@ -22,13 +22,12 @@ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb_64(
void cuda_integer_radix_shift_and_rotate_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
void *const *ksks) {

host_integer_radix_shift_and_rotate_kb_inplace<uint64_t>(
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), ms_noise_reduction_key);
(uint32_t **)(ksks));
}

void cleanup_cuda_integer_radix_shift_and_rotate(CudaStreamsFFI streams,
@@ -24,13 +24,12 @@ __host__ uint64_t scratch_cuda_integer_radix_shift_and_rotate_kb(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_radix_shift_and_rotate_kb_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem, void *const *bsks,
Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
KSTorus *const *ksks) {
cuda_set_device(streams.gpu_index(0));

if (lwe_array->num_radix_blocks != lwe_shift->num_radix_blocks)

@@ -57,7 +56,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, bits, lwe_array, bsks, ksks,
ms_noise_reduction_key,
num_radix_blocks * bits_per_block, num_radix_blocks,
mem->bit_extract_luts);

@@ -79,8 +77,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// so that it is already aligned to the correct position of the cmux input
// and we reduce noise growth
extract_n_bits<Torus>(streams, shift_bits, lwe_shift, bsks, ksks,
ms_noise_reduction_key, max_num_bits_that_tell_shift,
num_radix_blocks, mem->bit_extract_luts_with_offset_2);
max_num_bits_that_tell_shift, num_radix_blocks,
mem->bit_extract_luts_with_offset_2);

// If signed, do an "arithmetic shift" by padding with the sign bit
CudaRadixCiphertextFFI last_bit;

@@ -163,8 +161,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// we have
// control_bit|b|a
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, input_bits_a, mux_inputs, bsks, ksks, ms_noise_reduction_key,
mux_lut, total_nb_bits);
streams, input_bits_a, mux_inputs, bsks, ksks, mux_lut, total_nb_bits);
}

// Initializes the output

@@ -196,8 +193,8 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
// To give back a clean ciphertext
auto cleaning_lut = mem->cleaning_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, lwe_array, lwe_array, bsks, ksks, ms_noise_reduction_key,
cleaning_lut, num_radix_blocks);
streams, lwe_array, lwe_array, bsks, ksks, cleaning_lut,
num_radix_blocks);
}
}
#endif
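The shift-and-rotate path above first bit-extracts both the value and the shift amount, then runs a ladder of encrypted multiplexers: bit k of the shift selects between the running value and the value shifted by 2^k, i.e. a barrel shifter. A cleartext sketch, not code from this change:

#include <cassert>
#include <cstdint>

static uint64_t barrel_shift_left(uint64_t x, uint32_t shift, uint32_t max_bits) {
  for (uint32_t b = 0; b < max_bits; ++b) {
    uint64_t shifted = x << (1u << b);
    bool bit = (shift >> b) & 1; // extract_n_bits on the encrypted side
    x = bit ? shifted : x;       // the mux_lut applied to control_bit|b|a
  }
  return x;
}

int main() {
  assert(barrel_shift_left(3, 5, 6) == 3ull << 5);
  return 0;
}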
@@ -23,14 +23,12 @@ void cuda_sub_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *carry_in, int8_t *mem_ptr, void *const *bsks,
void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
uint32_t requested_flag, uint32_t uses_carry) {
void *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("sub")
host_sub_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
ms_noise_reduction_key, requested_flag, uses_carry);
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint32_t **)(ksks),
requested_flag, uses_carry);
POP_RANGE()
}
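As the wrapper above shows, encrypted subtraction is not a separate primitive: the right-hand side is negated block-wise and then fed to the same add-and-propagate-single-carry routine. On clear integers this is the familiar two's-complement identity, sketched below.

#include <cassert>
#include <cstdint>

// a - b == a + (~b + 1) on wrapping unsigned words; the CUDA path does the
// same thing with a radix-block negation followed by carry propagation.
int main() {
  uint64_t a = 123456789, b = 987654321;
  uint64_t neg_b = ~b + 1; // host_integer_radix_negation analogue
  assert(a + neg_b == a - b);
  return 0;
}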
@@ -27,13 +27,12 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry(
return size_tracker;
}

template <typename Torus>
template <typename Torus, typename KSTorus>
void host_sub_and_propagate_single_carry(
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, Torus *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
int_sub_and_propagate<Torus> *mem, void *const *bsks, KSTorus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {

host_integer_radix_negation<Torus>(

@@ -42,8 +41,7 @@ void host_sub_and_propagate_single_carry(

host_add_and_propagate_single_carry<Torus>(
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
mem->sc_prop_mem, bsks, ksks, ms_noise_reduction_key, requested_flag,
uses_carry);
mem->sc_prop_mem, bsks, ksks, requested_flag, uses_carry);
}

template <typename Torus>
@@ -204,20 +204,20 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
}

template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
CudaStreams streams, const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {
void execute_pbs_async(CudaStreams streams,
const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
std::vector<int8_t *> pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {

if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits

@@ -310,17 +310,13 @@ void execute_pbs_async(
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], ms_noise_reduction_key, zeros, pbs_buffer[i],
lwe_dimension, glwe_dimension, polynomial_size, base_log,
level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
num_many_lut, lut_stride);
}
break;
default:

@@ -374,16 +370,11 @@ void execute_pbs_async(
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);

void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
lut_vec[i], current_lwe_array_in, bootstrapping_keys[i],
ms_noise_reduction_key, zeros, pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
num_inputs_on_gpu);
pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
break;
default:
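execute_pbs_async keeps a single templated entry point and branches at compile time on the output Torus width (the `if constexpr (std::is_same_v<OutputTorus, uint32_t>)` test visible above), so the 32-bit and 64-bit bootstraps share one call site. A stripped-down sketch of that dispatch style with hypothetical function names:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <type_traits>

template <typename OutputTorus>
void run_pbs(std::size_t count) {
  if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
    std::printf("32-bit bootstrap on %zu inputs\n", count); // the ks32 path
  } else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
    std::printf("64-bit bootstrap on %zu inputs\n", count);
  } else {
    std::printf("other torus width on %zu inputs\n", count);
  }
}

int main() {
  run_pbs<uint32_t>(8);
  run_pbs<uint64_t>(8);
  return 0;
}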
@@ -80,9 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];

const Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
@@ -650,33 +650,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void *ms_drift_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain drift noise reduction key, then apply it
|
||||
if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_drift_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(ms_drift_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
|
||||
log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<uint64_t *>(static_cast<const uint64_t *>(lwe_array_in));
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
@@ -687,7 +669,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
@@ -702,7 +684,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
@@ -714,7 +696,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
|
||||
@@ -56,9 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
|
||||
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
|
||||
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
|
||||
@@ -36,7 +36,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128(
|
||||
template <typename InputTorus>
|
||||
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -83,7 +83,7 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
template <typename InputTorus>
|
||||
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -132,36 +132,17 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
__uint128_t const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_drift_noise_reduction_ptr,
|
||||
void const *ms_noise_reduction_ptr,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
// If the parameters contain drift noise reduction key, then apply it
|
||||
if (buffer->noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_drift_modulus_switch<InputTorus>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
static_cast<InputTorus const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(buffer->trivial_indexes),
|
||||
static_cast<const InputTorus *>(ms_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_drift_noise_reduction_ptr->num_zeros,
|
||||
ms_drift_noise_reduction_ptr->ms_input_variance,
|
||||
ms_drift_noise_reduction_ptr->ms_r_sigma,
|
||||
ms_drift_noise_reduction_ptr->ms_bound, log_modulus);
|
||||
} else {
|
||||
buffer->temp_lwe_array_in =
|
||||
const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
|
||||
}
|
||||
switch (buffer->pbs_variant) {
|
||||
case DEFAULT:
|
||||
executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
lut_vector, static_cast<InputTorus const *>(lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
@@ -169,7 +150,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
|
||||
InputTorus>(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
|
||||
lut_vector, static_cast<InputTorus const *>(lwe_array_in),
|
||||
static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples);
|
||||
break;
|
||||
@@ -234,9 +215,7 @@ void host_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *streams, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lut_vector, void const *lwe_array_in,
|
||||
void const *bootstrapping_key,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
|
||||
@@ -245,9 +224,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
|
||||
streams, gpu_index, lwe_array_out,
|
||||
static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
|
||||
bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
|
||||
num_samples);
|
||||
bootstrapping_key, buffer, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count, num_samples);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -668,7 +668,7 @@ uint64_t scratch_cuda_programmable_bootstrap_128_vector(
|
||||
template <typename InputTorus, class params, bool first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
InputTorus *lwe_array_in, double const *bootstrapping_key,
|
||||
InputTorus const *lwe_array_in, double const *bootstrapping_key,
|
||||
__uint128_t *global_accumulator, double *global_join_buffer,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
@@ -752,7 +752,7 @@ __host__ void execute_step_two_128(
|
||||
template <typename InputTorus, class params>
|
||||
__host__ void host_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
|
||||
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
|
||||
double const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
|
||||
@@ -84,9 +84,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
(noise_reduction_type == PBS_MS_REDUCTION_T::DRIFT)
|
||||
? &lwe_array_in[blockIdx.x * (lwe_dimension + 1)]
|
||||
: &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
|
||||
|
||||
@@ -43,8 +43,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
|
||||
void cuda_expand_without_verification_64(
|
||||
CudaStreamsFFI streams, void *lwe_array_out,
|
||||
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
|
||||
void *const *bsks, void *const *computing_ksks, void *const *casting_keys,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
void *const *bsks, void *const *computing_ksks, void *const *casting_keys) {
|
||||
|
||||
auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);
|
||||
|
||||
@@ -54,49 +53,49 @@ void cuda_expand_without_verification_64(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 512:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 1024:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 2048:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 4096:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 8192:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
case 16384:
|
||||
host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
|
||||
streams, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
|
||||
expand_buffer, (uint64_t **)casting_keys, bsks,
|
||||
(uint64_t **)(computing_ksks), ms_noise_reduction_key);
|
||||
(uint64_t **)(computing_ksks));
|
||||
break;
|
||||
default:
|
||||
PANIC("CUDA error: lwe_dimension not supported."
|
||||
|
||||
@@ -19,8 +19,7 @@ template <typename Torus, class params>
|
||||
__host__ void host_expand_without_verification(
|
||||
CudaStreams streams, Torus *lwe_array_out,
|
||||
const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
|
||||
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
|
||||
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks) {
|
||||
// Expand
|
||||
auto casting_key_type = mem_ptr->casting_key_type;
|
||||
auto expanded_lwes = mem_ptr->tmp_expanded_lwes;
|
||||
@@ -77,7 +76,7 @@ __host__ void host_expand_without_verification(

// apply keyswitch to BIG
execute_keyswitch_async<Torus>(
streams.subset_first_gpu(), ksed_small_to_big_expanded_lwes,
streams.get_ith(0), ksed_small_to_big_expanded_lwes,
lwe_trivial_indexes_vec[0], expanded_lwes, lwe_trivial_indexes_vec[0],
casting_keys, casting_input_dimension, casting_output_dimension,
casting_ks_base_log, casting_ks_level, num_lwes);
@@ -96,8 +95,8 @@ __host__ void host_expand_without_verification(
auto input = new CudaRadixCiphertextFFI;
into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, output, input, bsks, ksks, ms_noise_reduction_key,
message_and_carry_extract_luts, 2 * num_lwes);
streams, output, input, bsks, ksks, message_and_carry_extract_luts,
2 * num_lwes);
}

template <typename Torus>

@@ -191,9 +191,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, nullptr, nullptr,
pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, number_of_inputs, num_many_lut, lut_stride);
(void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, number_of_inputs, num_many_lut, lut_stride);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *

@@ -98,37 +98,8 @@ pub const PBS_TYPE_MULTI_BIT: PBS_TYPE = 0;
pub const PBS_TYPE_CLASSICAL: PBS_TYPE = 1;
pub type PBS_TYPE = ffi::c_uint;
pub const PBS_MS_REDUCTION_T_NO_REDUCTION: PBS_MS_REDUCTION_T = 0;
pub const PBS_MS_REDUCTION_T_DRIFT: PBS_MS_REDUCTION_T = 1;
pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 2;
pub const PBS_MS_REDUCTION_T_CENTERED: PBS_MS_REDUCTION_T = 1;
pub type PBS_MS_REDUCTION_T = ffi::c_uint;
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaModulusSwitchNoiseReductionKeyFFI {
pub ptr: *const *mut ffi::c_void,
pub num_zeros: u32,
pub ms_bound: f64,
pub ms_r_sigma: f64,
pub ms_input_variance: f64,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaModulusSwitchNoiseReductionKeyFFI"]
[::std::mem::size_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 40usize];
["Alignment of CudaModulusSwitchNoiseReductionKeyFFI"]
[::std::mem::align_of::<CudaModulusSwitchNoiseReductionKeyFFI>() - 8usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ptr"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ptr) - 0usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::num_zeros"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, num_zeros) - 8usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_bound"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_bound) - 16usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_r_sigma"]
[::std::mem::offset_of!(CudaModulusSwitchNoiseReductionKeyFFI, ms_r_sigma) - 24usize];
["Offset of field: CudaModulusSwitchNoiseReductionKeyFFI::ms_input_variance"][::std::mem::offset_of!(
CudaModulusSwitchNoiseReductionKeyFFI,
ms_input_variance
) - 32usize];
};
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
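Aside (not part of the diff): the `const _: () = { ["Size of ..."][size_of::<...>() - N]; }` blocks in this file are bindgen's compile-time layout checks. A minimal self-contained sketch of the pattern, using a hypothetical `Example` struct whose layout is target-independent:

    // Illustration only: if an expected value is wrong, the array index is
    // non-zero and the out-of-bounds access fails at compile time.
    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct Example {
        pub a: u32,
        pub b: u32,
    }

    #[allow(clippy::unnecessary_operation, clippy::identity_op)]
    const _: () = {
        ["Size of Example"][::std::mem::size_of::<Example>() - 8usize];
        ["Alignment of Example"][::std::mem::align_of::<Example>() - 4usize];
        ["Offset of field: Example::b"][::std::mem::offset_of!(Example, b) - 4usize];
    };

    fn main() {
        let _ = Example { a: 1, b: 2 };
    }

Because the check runs during constant evaluation, any ABI drift between the C headers and the generated Rust bindings is caught at build time rather than by a runtime test.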
@@ -281,6 +252,55 @@ const _: () = {
divisor_has_more_bits_than_numerator
) - 60usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_apply_univariate_lut_kb_64(
streams: CudaStreamsFFI,
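Aside (not part of the diff): the `CudaLweCiphertextListFFI` added above is a plain `#[repr(C)]` descriptor, so the caller owns the backing buffer and only passes a pointer plus metadata across the FFI boundary. A hedged sketch of filling one (the struct mirrors the hunk above; the buffer layout, the sample dimensions, and the use of host memory instead of a device pointer are assumptions for illustration):

    use std::ffi::c_void;

    #[repr(C)]
    #[derive(Debug, Copy, Clone)]
    pub struct CudaLweCiphertextListFFI {
        pub ptr: *mut c_void,
        pub num_radix_blocks: u32,
        pub lwe_dimension: u32,
    }

    fn main() {
        // Assumed sizes, for illustration only.
        let lwe_dimension: u32 = 1024;
        let num_radix_blocks: u32 = 4;
        // Assumption: one (lwe_dimension + 1)-element u64 row per radix block;
        // the real backend may expect a different layout and a GPU pointer.
        let mut buffer =
            vec![0u64; (lwe_dimension as usize + 1) * num_radix_blocks as usize];

        let descriptor = CudaLweCiphertextListFFI {
            ptr: buffer.as_mut_ptr() as *mut c_void,
            num_radix_blocks,
            lwe_dimension,
        };
        // The descriptor would be passed by pointer to an `unsafe extern "C"`
        // entry point while `buffer` stays alive; here it is only printed.
        println!("{descriptor:?}");
    }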
@@ -333,7 +353,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
@@ -373,7 +392,6 @@ unsafe extern "C" {
input_radix_lwe_2: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_radix_blocks: u32,
shift: u32,
@@ -392,7 +410,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_luts: u32,
lut_stride: u32,
@@ -423,7 +440,6 @@ unsafe extern "C" {
input_blocks: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_blocks: u32,
);
@@ -463,7 +479,6 @@ unsafe extern "C" {
is_bool_right: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
mem_ptr: *mut i8,
polynomial_size: u32,
num_blocks: u32,
@@ -523,7 +538,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -556,7 +570,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -602,7 +615,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -643,7 +655,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -656,7 +667,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_scalar_blocks: u32,
);
}
@@ -694,7 +704,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -708,7 +717,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -745,7 +753,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -781,7 +788,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -808,7 +814,6 @@ unsafe extern "C" {
carry_modulus: u32,
pbs_type: PBS_TYPE,
requested_flag: u32,
uses_carry: u32,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
@@ -831,7 +836,6 @@ unsafe extern "C" {
carry_modulus: u32,
pbs_type: PBS_TYPE,
requested_flag: u32,
uses_carry: u32,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
@@ -845,7 +849,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -860,7 +863,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -906,7 +908,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
compute_overflow: u32,
uses_input_borrow: u32,
);
@@ -947,7 +948,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -986,7 +986,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
polynomial_size: u32,
message_modulus: u32,
num_scalars: u32,
@@ -1031,7 +1030,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1066,7 +1064,6 @@ unsafe extern "C" {
generates_or_propagates: *mut CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
num_blocks: u32,
);
@@ -1113,7 +1110,6 @@ unsafe extern "C" {
is_signed: bool,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1148,7 +1144,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_radix_blocks: u32,
);
}
@@ -1187,7 +1182,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
num_radix_blocks: u32,
);
}
@@ -1241,7 +1235,6 @@ unsafe extern "C" {
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
@@ -1283,7 +1276,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
requested_flag: u32,
uses_carry: u32,
);
@@ -1322,7 +1314,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
);
}
@@ -1362,7 +1353,6 @@ unsafe extern "C" {
num_additional_blocks: u32,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1399,7 +1389,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
numerator_bits: u32,
);
@@ -1440,7 +1429,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
divisor_has_at_least_one_set: *const u64,
decomposed_divisor: *const u64,
@@ -1486,7 +1474,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
scalar_divisor_ffi: *const CudaScalarDivisorFFI,
divisor_has_at_least_one_set: *const u64,
decomposed_divisor: *const u64,
@@ -1531,7 +1518,6 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1553,7 +1539,6 @@ unsafe extern "C" {
pbs_base_log: u32,
grouping_factor: u32,
num_blocks_to_process: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
@@ -1571,7 +1556,6 @@ unsafe extern "C" {
num_blocks_to_process: u32,
mem: *mut i8,
bsks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -1613,61 +1597,11 @@ unsafe extern "C" {
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_ilog2_kb_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: CudaStreamsFFI,
@@ -1795,6 +1729,78 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_aes_encrypt_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
num_aes_inputs: u32,
sbox_parallelism: u32,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_aes_ctr_encrypt_64(
streams: CudaStreamsFFI,
output: *mut CudaRadixCiphertextFFI,
iv: *const CudaRadixCiphertextFFI,
round_keys: *const CudaRadixCiphertextFFI,
counter_bits_le_all_blocks: *const u64,
num_aes_inputs: u32,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_aes_encrypt_64(streams: CudaStreamsFFI, mem_ptr_void: *mut *mut i8);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
expanded_keys: *mut CudaRadixCiphertextFFI,
key: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_key_expansion_64(
streams: CudaStreamsFFI,
mem_ptr_void: *mut *mut i8,
);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;
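Aside (not part of the diff): `KS_TYPE` is exposed as a bare `c_uint` plus two constants, so any typed wrapper on the Rust side has to round-trip through the raw values. A sketch with a hypothetical `KsType` enum (the constants match the bindings above; `u32` stands in for `ffi::c_uint`):

    // The constant values mirror KS_TYPE_BIG_TO_SMALL / KS_TYPE_SMALL_TO_BIG
    // from the generated bindings; the enum and helpers are illustrative only.
    pub const KS_TYPE_BIG_TO_SMALL: u32 = 0;
    pub const KS_TYPE_SMALL_TO_BIG: u32 = 1;

    #[derive(Debug, Copy, Clone, PartialEq, Eq)]
    pub enum KsType {
        BigToSmall,
        SmallToBig,
    }

    impl KsType {
        // Reject unknown raw values instead of silently mapping them.
        pub fn from_raw(raw: u32) -> Option<Self> {
            match raw {
                KS_TYPE_BIG_TO_SMALL => Some(Self::BigToSmall),
                KS_TYPE_SMALL_TO_BIG => Some(Self::SmallToBig),
                _ => None,
            }
        }

        pub fn as_raw(self) -> u32 {
            match self {
                Self::BigToSmall => KS_TYPE_BIG_TO_SMALL,
                Self::SmallToBig => KS_TYPE_SMALL_TO_BIG,
            }
        }
    }

    fn main() {
        assert_eq!(KsType::from_raw(1), Some(KsType::SmallToBig));
        assert_eq!(KsType::BigToSmall.as_raw(), 0);
    }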
@@ -1835,7 +1841,6 @@ unsafe extern "C" {
bsks: *const *mut ffi::c_void,
computing_ksks: *const *mut ffi::c_void,
casting_keys: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
);
}
unsafe extern "C" {
@@ -2303,8 +2308,6 @@ unsafe extern "C" {
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *mut ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
@@ -2324,8 +2327,6 @@ unsafe extern "C" {
lut_vector: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,

@@ -2,6 +2,7 @@
#include "cuda/include/ciphertext.h"
#include "cuda/include/integer/compression/compression.h"
#include "cuda/include/integer/integer.h"
#include "cuda/include/aes/aes.h"
#include "cuda/include/zk/zk.h"
#include "cuda/include/keyswitch/keyswitch.h"
#include "cuda/include/keyswitch/ks_enums.h"

Some files were not shown because too many files have changed in this diff.