Compare commits

...

43 Commits

Author SHA1 Message Date
tmontaigu
51372d9860 feat(hlapi): add flip operation 2025-09-05 16:09:47 +02:00
tmontaigu
dae7aff3b7 feat(integer): add flip operation
Add the flip(condition: BooleanBlock, a: T, b: T) -> (T, T)
operation that homomorphically flips/swaps two values if the
given encrypted boolean encrypts true.
2025-09-05 16:09:47 +02:00
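[Editor's note: a minimal clear-text Rust sketch of the flip semantics described in the commit message above. This is an illustration only, not the actual TFHE-rs integer or HLAPI entry point; in the homomorphic version the condition is an encrypted BooleanBlock and the selection happens without revealing which branch was taken.]

// Clear-text semantics of flip: the pair is returned unchanged when the
// condition is false, and swapped when it is true.
fn flip_clear<T>(condition: bool, a: T, b: T) -> (T, T) {
    if condition {
        (b, a) // condition encrypts true: the two values are swapped
    } else {
        (a, b) // condition encrypts false: the pair is returned unchanged
    }
}

fn main() {
    assert_eq!(flip_clear(false, 1u8, 2u8), (1, 2));
    assert_eq!(flip_clear(true, 1u8, 2u8), (2, 1));
}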
Nicolas Sarlin
adcf9bc1f3 fix(zk): handle limit cases in the four_squares algorithm 2025-09-05 15:34:44 +02:00
pgardratzama
0a1651adf3 fix(hpu): update firmware in bitstream to allow SIMD operations 2025-09-05 10:42:36 +02:00
pgardratzama
11b540c456 chore(hpu): adds cost of hpu setups 2025-09-05 10:42:36 +02:00
pgardratzama
bd7df4a03b chore(hpu): enable hpu hlapi workflow and throughput bench in integer workflow 2025-09-05 10:42:36 +02:00
pgardratzama
2279d0deb8 chore(hpu): update hpu firmware (fix 2 bits operations issue) 2025-09-05 10:42:36 +02:00
pgardratzama
6fe24c6ab3 chore(hpu): update hpu integer bench scalar op names 2025-09-05 10:42:36 +02:00
pgardratzama
46c6adb0dc feat(hpu): create a new workflow to launch HLAPI benches for HPU 2025-09-05 10:42:36 +02:00
pgardratzama
c6aa1adbe7 chore(hpu): update benches to run new operations 2025-09-05 10:42:36 +02:00
Helder Campos
d3a867ecfe feat(hpu): High bandwidth HPU 2025-09-05 10:42:36 +02:00
Helder Campos
a83c92f28f feat(hpu): Soft Reset Support and fix some runtime registers 2025-09-05 10:42:36 +02:00
Helder Campos
3b48ef301e feat(hpu): Made two SIMD IOPs, ADD and ERC20. 2025-09-05 10:42:36 +02:00
Helder Campos
827a6e912c feat(hpu): Adding a massively parallel multiplier operation 2025-09-05 10:42:36 +02:00
Guillermo Oyarzun
eeccace7b3 fix(gpu): add missing syncs when releasing scalar ops and returning to old lut release 2025-09-05 09:53:00 +02:00
dependabot[bot]
01d1fa96d7 chore(deps): bump on-headers and serve in /tfhe/web_wasm_parallel_tests
Bumps [on-headers](https://github.com/jshttp/on-headers) to 1.1.0 and updates ancestor dependency [serve](https://github.com/vercel/serve). These dependencies need to be updated together.


Updates `on-headers` from 1.0.2 to 1.1.0
- [Release notes](https://github.com/jshttp/on-headers/releases)
- [Changelog](https://github.com/jshttp/on-headers/blob/master/HISTORY.md)
- [Commits](https://github.com/jshttp/on-headers/compare/v1.0.2...v1.1.0)

Updates `serve` from 14.2.3 to 14.2.5
- [Release notes](https://github.com/vercel/serve/releases)
- [Commits](https://github.com/vercel/serve/compare/14.2.3...v14.2.5)

---
updated-dependencies:
- dependency-name: on-headers
  dependency-version: 1.1.0
  dependency-type: indirect
- dependency-name: serve
  dependency-version: 14.2.5
  dependency-type: direct:development
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-05 09:14:55 +02:00
Arthur Meyre
10789ba3d1 chore(ci): configure tfhe-ntt tests to have an avx512 + IFMA instance
- ubuntu-latest is replaced by m6i.4xlarge to make sure all code is tested
in the tfhe-ntt crate
2025-09-05 09:14:12 +02:00
David Testé
4a0658389e chore(bench): make bits to prove customizable in zk benchmarks
Some applications, such as blockchains, may want to prove fewer bits
than the CRS size allows.
2025-09-05 09:03:24 +02:00
David Testé
97574bdae8 chore(bench): add noise squash benchmark with compressions
This new benchmark is extracted from a use case.
Starting from a compressed ciphertext, it measures the decompression, then the noise squashing, and finally the re-compression of the result.
2025-09-04 15:13:08 +02:00
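[Editor's note: a minimal sketch of the measured pipeline. The functions decompress, squash_noise, and compress are hypothetical placeholders, not the real TFHE-rs calls, and plain std::time timing stands in for the Criterion harness used by the actual benchmark.]

use std::time::Instant;

// Hypothetical placeholder stages standing in for the real TFHE-rs operations.
fn decompress(compressed: &[u8]) -> Vec<u8> { compressed.to_vec() }
fn squash_noise(ct: &[u8]) -> Vec<u8> { ct.to_vec() }
fn compress(ct: &[u8]) -> Vec<u8> { ct.to_vec() }

fn main() {
    let compressed_input = vec![0u8; 1024];

    // Stage 1: decompression of the input ciphertext.
    let t0 = Instant::now();
    let ct = decompress(&compressed_input);
    println!("decompression: {:?}", t0.elapsed());

    // Stage 2: noise squashing of the decompressed ciphertext.
    let t1 = Instant::now();
    let squashed = squash_noise(&ct);
    println!("noise squashing: {:?}", t1.elapsed());

    // Stage 3: re-compression of the squashed result.
    let t2 = Instant::now();
    let _recompressed = compress(&squashed);
    println!("re-compression: {:?}", t2.elapsed());
}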
Guillermo Oyarzun
60d137de6e feat(gpu): use mempools to optimize mem reuse 2025-09-04 13:23:18 +02:00
Guillermo Oyarzun
c2e816a86c fix(gpu): change minimum number of elements in benches 2025-09-04 11:03:27 +02:00
Pedro Alves
b42ba79145 feat(gpu): implement support for 128-bit compression on the HL API 2025-09-03 14:33:08 -03:00
Agnes Leroy
69b055c03f chore(gpu): update parameters for classical pbs128 2025-09-03 17:22:52 +02:00
Nicolas Sarlin
e2c7359057 chore(csprng): use getrandom as random source for unix seeder 2025-09-03 17:21:22 +02:00
Guillermo Oyarzun
baad6a6b49 feat(gpu): change broadcast lut to communicate the minimum possible 2025-09-03 15:20:58 +02:00
Guillermo Oyarzun
88c3df8331 feat(gpu): improve communication scheme 2025-09-03 15:20:58 +02:00
Nicolas Sarlin
e3686ed4ba chore(fft): remove dead store in stockham dif16 2025-09-02 16:48:56 +02:00
Nicolas Sarlin
b8a9a15883 doc: explain how to run first example 2025-09-02 16:48:33 +02:00
Nicolas Sarlin
a7d931449a doc(core): remove warning about glwe polynomial size of 1 2025-09-02 15:49:15 +02:00
Nicolas Sarlin
099bccd85f chore(safe_ser): check serialization header version 2025-09-01 17:29:47 +02:00
Nicolas Sarlin
b9d75c9f8f fix: remove references to 2^-64 pfail for GPU 2025-09-01 17:01:15 +02:00
Nicolas Sarlin
543517cea5 chore(core): use checked_mul for container indexing 2025-09-01 15:36:44 +02:00
Nicolas Sarlin
fed5c1db1e fix(core): potential overflow for glwe encrypt on 32b platforms 2025-09-01 15:36:06 +02:00
Nicolas Sarlin
c9249fe991 chore(core): size checks in Fourier128GgswCiphertext::from_container 2025-09-01 15:35:58 +02:00
Nicolas Sarlin
d308305eb1 doc(core): add some "panics" comments 2025-09-01 15:35:41 +02:00
Nicolas Sarlin
f66730deb6 chore(core)!: add ExactSizeIterator to izip macro, renamed izip_eq 2025-09-01 15:35:41 +02:00
dependabot[bot]
cd92146c38 chore(deps): bump actions/cache from 4.2.0 to 4.2.4
Bumps [actions/cache](https://github.com/actions/cache) from 4.2.0 to 4.2.4.
- [Release notes](https://github.com/actions/cache/releases)
- [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md)
- [Commits](https://github.com/actions/cache/compare/v4.2.0...0400d5f644dc74513175e3cd8d07132dd4860809)

---
updated-dependencies:
- dependency-name: actions/cache
  dependency-version: 4.2.4
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:26:00 +02:00
dependabot[bot]
568f77f5f6 chore(deps): bump actions/setup-node from 4.0.2 to 4.4.0
Bumps [actions/setup-node](https://github.com/actions/setup-node) from 4.0.2 to 4.4.0.
- [Release notes](https://github.com/actions/setup-node/releases)
- [Commits](60edb5dd54...49933ea528)

---
updated-dependencies:
- dependency-name: actions/setup-node
  dependency-version: 4.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:51 +02:00
dependabot[bot]
f610712e97 chore(deps): bump foundry-rs/foundry-toolchain from 1.3.1 to 1.4.0
Bumps [foundry-rs/foundry-toolchain](https://github.com/foundry-rs/foundry-toolchain) from 1.3.1 to 1.4.0.
- [Release notes](https://github.com/foundry-rs/foundry-toolchain/releases)
- [Changelog](https://github.com/foundry-rs/foundry-toolchain/blob/master/RELEASE.md)
- [Commits](de808b1eea...82dee4ba65)

---
updated-dependencies:
- dependency-name: foundry-rs/foundry-toolchain
  dependency-version: 1.4.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:42 +02:00
dependabot[bot]
5d8f0b8532 chore(deps): bump actions/checkout from 4.1.7 to 5.0.0
Bumps [actions/checkout](https://github.com/actions/checkout) from 4.1.7 to 5.0.0.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v4.1.7...08c6903cd8c0fde910a37f88322edcfb5dd907a8)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 5.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:35 +02:00
dependabot[bot]
11c04d0cc9 chore(deps): bump docker/login-action from 3.3.0 to 3.5.0
Bumps [docker/login-action](https://github.com/docker/login-action) from 3.3.0 to 3.5.0.
- [Release notes](https://github.com/docker/login-action/releases)
- [Commits](9780b0c442...184bdaa072)

---
updated-dependencies:
- dependency-name: docker/login-action
  dependency-version: 3.5.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-09-01 11:25:27 +02:00
Pedro Alves
57ea3e3e88 chore(gpu): refactor the entry points for PBS in the backend 2025-08-29 16:46:27 -03:00
Pedro Alves
cad4070ebe fix(gpu): fix the decompression function signature in the backend 2025-08-29 21:09:40 +02:00
132 changed files with 4939 additions and 1471 deletions

View File

@@ -0,0 +1,98 @@
# Run all HLAPI benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Hlapi Benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
permissions: {}
jobs:
hlapi-benchmarks-hpu:
name: Execute HLAPI benchmarks for HPU backend
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
lfs: true
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict "${SHA}");
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=${COMMIT_DATE}";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
env:
SHA: ${{ github.sha }}
- name: Install rust
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make pull_hpu_files
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make bench_hlapi_erc20_hpu
make bench_hlapi_hpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "hpu_x1" \
--backend hpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_hlapi_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \
--slab-url "${SLAB_URL}"
env:
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_URL: ${{ secrets.SLAB_URL }}

View File

@@ -3,6 +3,15 @@ name: Hpu Integer Benchmarks
on:
workflow_dispatch:
inputs:
bench_type:
description: "Benchmarks type"
type: choice
default: both
options:
- latency
- throughput
- both
env:
CARGO_TERM_COLOR: always
@@ -14,13 +23,46 @@ env:
permissions: {}
jobs:
prepare-matrix:
name: Prepare operations matrix
runs-on: v80-desktop
outputs:
bench_type: ${{ steps.set_bench_type.outputs.bench_type }}
steps:
- name: Set benchmark types
if: github.event_name == 'workflow_dispatch'
run: |
if [[ -z $INPUTS_BENCH_TYPE || "${INPUTS_BENCH_TYPE}" == "both" ]]; then
echo "BENCH_TYPE=[\"latency\", \"throughput\"]" >> "${GITHUB_ENV}"
else
echo "BENCH_TYPE=[\"${INPUTS_BENCH_TYPE}\"]" >> "${GITHUB_ENV}"
fi
env:
INPUTS_BENCH_TYPE: ${{ inputs.bench_type }}
- name: Default benchmark type
if: github.event_name != 'workflow_dispatch'
run: |
echo "BENCH_TYPE=[\"latency\"]" >> "${GITHUB_ENV}"
- name: Set benchmark types output
id: set_bench_type
run: | # zizmor: ignore[template-injection] this env variable is safe
echo "bench_type=${{ toJSON(env.BENCH_TYPE) }}" >> "${GITHUB_OUTPUT}"
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
needs: prepare-matrix
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
strategy:
max-parallel: 1
matrix:
bench_type: ${{ fromJSON(needs.prepare-matrix.outputs.bench_type) }}
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
@@ -63,8 +105,11 @@ jobs:
- name: Run benchmarks
run: |
make pull_hpu_files
make bench_integer_hpu
make bench_hlapi_erc20_hpu
export V80_SERIAL_NUMBER=XFL12E4XJXWK
source /opt/xilinx/Vivado/2024.2/settings64.sh
make BENCH_TYPE="${BENCH_TYPE}" bench_integer_hpu
env:
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Parse results
run: |
@@ -76,14 +121,16 @@ jobs:
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
--walk-subdirs \
--bench-type "${BENCH_TYPE}"
env:
REF_NAME: ${{ github.ref_name }}
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_integer_benchmarks
name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab

View File

@@ -11,6 +11,7 @@ env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}${{ github.ref == 'refs/heads/main' && github.sha || '' }}
@@ -31,7 +32,7 @@ jobs:
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
fetch-depth: 0
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
@@ -45,18 +46,46 @@ jobs:
- tfhe-ntt/**
- '.github/workflows/cargo_test_ntt.yml'
cargo-tests-ntt:
setup-instance:
needs: should-run
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ubuntu-latest
outputs:
matrix_os: ${{ steps.set-os-matrix.outputs.matrix_os }}
runner-name: ${{ steps.start-remote-instance.outputs.label }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: cpu-small
- name: Set os matrix
id: set-os-matrix
env:
SLAB_INSTANCE: ${{ steps.start-remote-instance.outputs.label }}
run: |
INSTANCE_TO_USE="${SLAB_INSTANCE:-ubuntu-latest}"
echo "matrix_os=[\"${INSTANCE_TO_USE}\", \"macos-latest\", \"windows-latest\"]" >> "$GITHUB_OUTPUT"
cargo-tests-ntt:
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -72,16 +101,16 @@ jobs:
run: make test_ntt_no_std
cargo-tests-ntt-nightly:
needs: should-run
needs: [should-run, setup-instance]
if: needs.should-run.outputs.ntt_test == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
steps:
- uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
persist-credentials: 'false'
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
@@ -97,7 +126,7 @@ jobs:
run: make test_ntt_no_std_nightly
cargo-tests-ntt-successful:
needs: [ should-run, cargo-tests-ntt, cargo-tests-ntt-nightly ]
needs: [should-run, cargo-tests-ntt, cargo-tests-ntt-nightly]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:
@@ -120,3 +149,28 @@ jobs:
run: |
echo "Some tfhe-ntt tests failed"
exit 1
teardown-instance:
name: Teardown instance (cargo-tests-ntt-successful)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [setup-instance, cargo-tests-ntt-successful]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cargo-tests-ntt) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -100,7 +100,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
path: tfhe-rs
persist-credentials: false
@@ -111,7 +111,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -162,10 +162,10 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
uses: foundry-rs/foundry-toolchain@de808b1eea699e761c404bda44ba8f21aba30b2c
uses: foundry-rs/foundry-toolchain@82dee4ba654bd2146511f85f0d013af94670c4de
- name: Cache cargo
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
uses: actions/cache@0400d5f644dc74513175e3cd8d07132dd4860809 # v4.2.4
with:
path: |
~/.cargo/registry
@@ -175,7 +175,7 @@ jobs:
restore-keys: ${{ runner.os }}-cargo-
- name: Login to GitHub Container Registry
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0
with:
registry: ghcr.io
username: ${{ github.actor }}
@@ -186,7 +186,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/coprocessor
- name: Use Node.js
uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: 20.x
@@ -257,7 +257,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
with:
repository: zama-ai/slab
path: slab

View File

@@ -33,6 +33,7 @@ rand = "0.8"
rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.100"
getrandom = "0.2.8"
[profile.bench]
lto = "fat"

View File

@@ -1312,11 +1312,11 @@ bench_signed_integer_gpu: install_rs_check_toolchain
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
@@ -1497,11 +1497,13 @@ bench_hlapi_gpu: install_rs_check_toolchain
--bench hlapi \
--features=integer,gpu,internal-keycache,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_hpu # Run benchmarks for integer operations on HPU
.PHONY: bench_hlapi_hpu # Run benchmarks for HLAPI operations on HPU
bench_hlapi_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi \
--features=integer,hpu,hpu-v80,internal-keycache,nightly-avx512 -p tfhe-benchmark --
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20 # Run benchmarks for ERC20 operations
bench_hlapi_erc20: install_rs_check_toolchain
@@ -1529,11 +1531,11 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
source ./setup_hpu.sh --config $(HPU_CONFIG) -p ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark --
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain

View File

@@ -201,11 +201,9 @@ When a new update is published in the Lattice Estimator, we update parameters ac
### Security model
By default, the parameter sets used in the High-Level API with the x86 CPU backend have a failure probability $\le 2^{128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
By default, the parameter sets used in the High-Level API have a failure probability $\le 2^{-128}$ to securely work in the IND-CPA^D model using the algorithmic techniques provided in our code base [1].
If you want to work within the IND-CPA security model, which is less strict than the IND-CPA-D model, the parameter sets can easily be changed and would have slightly better performance. More details can be found in the [TFHE-rs documentation](https://docs.zama.ai/tfhe-rs).
The default parameters used in the High-Level API with the GPU backend are chosen considering the IND-CPA security model, and are selected with a bootstrapping failure probability fixed at $p_{error} \le 2^{-128}$. In particular, it is assumed that the results of decrypted computations are not shared by the secret key owner with any third parties, as such an action can lead to leakage of the secret encryption key. If you are designing an application where decryptions must be shared, you will need to craft custom encryption parameters which are chosen in consideration of the IND-CPA^D security model [2].
[1] Bernard, Olivier, et al. "Drifting Towards Better Error Probabilities in Fully Homomorphic Encryption Schemes". https://eprint.iacr.org/2024/1718.pdf
[2] Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022. https://eprint.iacr.org/2022/816.pdf

View File

@@ -119,6 +119,8 @@ void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
int cuda_get_number_of_gpus();
int cuda_get_number_of_sms();
void cuda_synchronize_device(uint32_t gpu_index);
void cuda_drop(void *ptr, uint32_t gpu_index);

View File

@@ -16,24 +16,27 @@ int32_t cuda_setup_multi_gpu(int device_0_id);
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<Torus *>>(variant)) { \
return std::get<std::vector<Torus *>>(variant)[index]; \
} else { \
return std::get<Torus *>(variant); \
} \
}()
// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
[&] { \
if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
return std::get<std::vector<uint64_t *>>(variant)[index]; \
} else { \
return std::get<uint64_t *>(variant); \
} \
}()
/// get_variant_element() resolves access when the input may be either a single
/// pointer or a vector of pointers. If the variant holds a single pointer, the
/// index is ignored and that pointer is returned; if it holds a vector, the
/// element at `index` is returned.
///
/// This function replaces the previous macro:
/// - Easier to debug and read than a macro
/// - Deduces the pointer type from the variant (no need to name a Torus type
/// explicitly)
/// - Defined in a header, so it's eligible for inlining by the optimizer
template <typename Torus>
inline Torus
get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
size_t index) {
if (std::holds_alternative<std::vector<Torus>>(variant)) {
return std::get<std::vector<Torus>>(variant)[index];
} else {
return std::get<Torus>(variant);
}
}
int get_active_gpu_count(int num_inputs, int gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

View File

@@ -75,7 +75,7 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory, bool allocate_ms_array);
bool allocate_gpu_memory);
void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -115,8 +115,10 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

View File

@@ -6,7 +6,6 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
@@ -321,10 +320,15 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
std::vector<Torus *> lwe_aligned_vec;
uint32_t *gpu_indexes;
bool gpu_memory_allocated;
cudaEvent_t event_scatter_in;
cudaEvent_t *event_scatter_out;
cudaEvent_t event_broadcast;
int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
@@ -343,7 +347,6 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -360,10 +363,21 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -382,8 +396,6 @@ template <typename Torus> struct int_radix_lut {
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -499,11 +511,8 @@ template <typename Torus> struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -560,7 +569,6 @@ template <typename Torus> struct int_radix_lut {
///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;
@@ -577,10 +585,19 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside
// this constructor
@@ -596,11 +613,8 @@ template <typename Torus> struct int_radix_lut {
cuda_memset_with_size_tracking_async(lut_indexes, 0, lut_indexes_size,
streams[i], gpu_indexes[i],
allocate_gpu_memory);
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
@@ -651,11 +665,9 @@ template <typename Torus> struct int_radix_lut {
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks,
size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, allocate_gpu_memory);
multi_gpu_copy_array_from_cpu_async(
streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
h_lwe_indexes_in, num_radix_blocks, allocate_gpu_memory);
// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -712,29 +724,87 @@ template <typename Torus> struct int_radix_lut {
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
int active_device = cuda_get_device();
// We only do broadcast if there are more than 1 active GPU
if (active_gpu_count > 1) {
int active_device = cuda_get_device();
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t new_active_gpu_count,
bool broadcast_lut_values = true) {
// We only do broadcast if there are more than 1 active GPU
if (new_active_gpu_count > 1) {
int active_device = cuda_get_device();
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
if (active_gpu_count > 1)
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < new_active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}
void allocate_lwe_vector_for_non_trivial_indexes(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t active_gpu_count, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
// We need to create the auxiliary array only in GPU 0
lwe_aligned_vec.resize(active_gpu_count);
for (uint i = 0; i < active_gpu_count; i++) {
uint64_t size_tracker_on_array_i = 0;
auto inputs_on_gpu = std::max(
THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(max_num_radix_blocks, i, active_gpu_count));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker_on_array_i,
allocate_gpu_memory);
lwe_aligned_vec[i] = d_array;
size_tracker += size_tracker_on_array_i;
}
// Ensure the device set at the end of this method is the same as it was set
// at the beginning
cuda_set_device(active_device);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -746,7 +816,6 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],
@@ -793,6 +862,23 @@ template <typename Torus> struct int_radix_lut {
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
if (active_gpu_count > 1) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
free(event_scatter_out);
}
if (lwe_aligned_vec.size() > 0) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lwe_aligned_vec[i], streams[0],
gpu_indexes[0],
gpu_memory_allocated);
}
lwe_aligned_vec.clear();
}
}
free(h_lut_indexes);
free(degrees);
@@ -841,6 +927,8 @@ template <typename InputTorus> struct int_noise_squashing_lut {
bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
std::vector<InputTorus *> lwe_aligned_scatter_vec;
std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,
@@ -876,11 +964,11 @@ template <typename InputTorus> struct int_noise_squashing_lut {
get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count));
int8_t *gpu_pbs_buffer;
uint64_t size = 0;
execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
params.small_lwe_dimension, params.glwe_dimension,
params.polynomial_size, params.pbs_level,
num_radix_blocks_on_gpu, allocate_gpu_memory,
params.noise_reduction_type, size);
execute_scratch_pbs<__uint128_t>(
streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
params.grouping_factor, num_radix_blocks_on_gpu, params.pbs_type,
allocate_gpu_memory, params.noise_reduction_type, size);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
if (i == 0) {
size_tracker += size;
@@ -982,7 +1070,10 @@ template <typename InputTorus> struct int_noise_squashing_lut {
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
if (lwe_aligned_gather_vec.size() > 0) {
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_gather_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_scatter_vec);
}
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
@@ -1045,7 +1136,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1071,6 +1165,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);
lut->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count,
num_radix_blocks * bits_per_block, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes_in);
@@ -1212,7 +1309,9 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
mux_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_mux =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
mux_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
@@ -1222,7 +1321,10 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
cleaning_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_cleaning =
get_active_gpu_count(num_radix_blocks, gpu_count);
cleaning_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cleaning);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1311,8 +1413,8 @@ template <typename Torus> struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(2, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -1447,9 +1549,11 @@ template <typename Torus> struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
luts_array->broadcast_lut(streams, gpu_indexes);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
message_acc->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_acc->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1558,9 +1662,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (!mem_reuse) {
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
@@ -1568,6 +1671,11 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
auto active_gpu_count =
get_active_gpu_count(this->max_total_blocks_in_vec, gpu_count);
luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, true);
}
}
if (allocated_luts_message_carry) {
@@ -1595,7 +1703,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_mc = get_active_gpu_count(pbs_count, gpu_count);
luts_message_carry->broadcast_lut(streams, gpu_indexes,
active_gpu_count_mc);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1615,6 +1725,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
setup_index_buffers(streams, gpu_indexes, size_tracker);
// because we setup_lut in host function for sum_ciphertexts to save memory
// the size_tracker is topped up here to have a max bound on the used memory
@@ -1662,6 +1773,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, allocate_gpu_memory);
setup_index_buffers(streams, gpu_indexes, size_tracker);
}
@@ -1745,8 +1859,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_seq_luts, gpu_count);
lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
free(h_seq_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -1801,8 +1916,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_hillis_steele, gpu_memory_allocated);
lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_groups, gpu_count);
lut_hillis_steele->broadcast_lut(streams, gpu_indexes, active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -1978,8 +2093,9 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2240,7 +2356,9 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
luts_array_second_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -2259,14 +2377,17 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
Torus *new_scalars, uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, Torus *new_scalars,
uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
luts_array_second_step->broadcast_lut(streams, gpu_indexes);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
@@ -2431,7 +2552,9 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
// For the final cleanup in case of overflow or carry (it seems that I can)
@@ -2500,7 +2623,9 @@ template <typename Torus> struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
}
lut_message_extract->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_radix_blocks + 1, gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -2695,19 +2820,23 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
};
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2806,8 +2935,9 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes);
lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
if (compute_overflow) {
lut_borrow_flag = new int_radix_lut<Torus>(
@@ -2823,8 +2953,7 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
lut_borrow_flag->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
@@ -2852,15 +2981,15 @@ template <typename Torus> struct int_borrow_prop_memory {
// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes_for_div, new_num_blocks);
streams, gpu_indexes, gpu_count, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
streams, gpu_indexes, second_indexes_for_div, scalars_for_div,
new_num_blocks);
streams, gpu_indexes, gpu_count, second_indexes_for_div,
scalars_for_div, new_num_blocks);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
@@ -2991,7 +3120,10 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3064,8 +3196,8 @@ template <typename Torus> struct int_mul_memory {
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
luts_array->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(total_block_count, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
@@ -3197,7 +3329,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -3281,13 +3414,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (auto &buffer : lut_buffers_bivariate) {
buffer->release(streams, gpu_indexes, gpu_count);
delete buffer;
@@ -3385,7 +3520,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -3410,7 +3547,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -3449,7 +3588,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}
@@ -3457,6 +3598,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]);
cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]);
@@ -3563,9 +3705,13 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
predicate_lut->broadcast_lut(streams, gpu_indexes);
message_extract_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_pred =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
predicate_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_pred);
auto active_gpu_count_msg =
get_active_gpu_count(num_radix_blocks, gpu_count);
message_extract_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_msg);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3637,7 +3783,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(max_chunks, gpu_count);
is_max_value->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3697,7 +3844,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
operator_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// f(x) -> x == 0
Torus total_modulus = params.message_modulus * params.carry_modulus;
@@ -3715,7 +3863,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
is_non_zero_lut->broadcast_lut(streams, gpu_indexes);
is_non_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -3734,8 +3882,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
scalar_comparison_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3806,8 +3954,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -3994,8 +4142,7 @@ template <typename Torus> struct int_comparison_buffer {
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(streams, gpu_indexes);
identity_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -4012,7 +4159,7 @@ template <typename Torus> struct int_comparison_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(streams, gpu_indexes);
is_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -4094,8 +4241,8 @@ template <typename Torus> struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
signed_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
}
@@ -4144,6 +4291,7 @@ template <typename Torus> struct int_comparison_buffer {
delete signed_msb_lut;
delete tmp_trivial_sign_block;
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint j = 0; j < active_gpu_count; j++) {
cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]);
cuda_destroy_stream(msb_streams[j], gpu_indexes[j]);
@@ -4310,17 +4458,23 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_1[i]->get_lut(0, 0),
masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count1 = get_active_gpu_count(1, gpu_count);
masking_luts_1[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count1);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_masking,
gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_2[i]->get_lut(0, 0),
masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count2 = get_active_gpu_count(num_blocks, gpu_count);
masking_luts_2[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count2);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -4340,13 +4494,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
luts[j]->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
// Give name to closures to improve readability
@@ -4382,7 +4537,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
@@ -4391,7 +4547,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -4418,7 +4575,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_happened[1]->get_lut(0, 0),
@@ -4427,10 +4585,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
auto active_gpu_count_for_bits = get_active_gpu_count(1, gpu_count);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -4447,7 +4607,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes,
active_gpu_count_for_bits);
}
}
@@ -4703,6 +4864,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
delete[] merge_overflow_flags_luts;
// release sub streams
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -4764,7 +4926,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
switch (op) {
case BITAND:
case BITOR:
@@ -4791,7 +4953,7 @@ template <typename Torus> struct int_bitop_buffer {
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
break;
default:
@@ -4821,7 +4983,7 @@ template <typename Torus> struct int_bitop_buffer {
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
}
}
@@ -5106,7 +5268,10 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_cmp =
get_active_gpu_count(1, gpu_count); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cmp);
}
}
@@ -5148,6 +5313,7 @@ template <typename Torus> struct int_div_rem_memory {
delete compare_signed_bits_lut;
// release sub streams
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < gpu_count; i++) {
cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]);
cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]);
@@ -5776,7 +5942,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
this->univ_lut_mem = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -5815,7 +5981,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
univ_lut_mem->broadcast_lut(streams, gpu_indexes);
univ_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
auto generate_bi_lut_lambda =
@@ -5834,7 +6000,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
if (allocate_gpu_memory) {
biv_lut_mem->broadcast_lut(streams, gpu_indexes);
biv_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
this->tmp_ct = new CudaRadixCiphertextFFI;
@@ -6052,7 +6218,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
luts->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
luts->broadcast_lut(streams, gpu_indexes, active_gpu_count);
free(h_corrections);
}
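The change that repeats throughout this file is that broadcast_lut now receives the number of GPUs that will actually take part, derived from how many radix blocks the LUT is applied to, instead of implicitly targeting every GPU. A minimal sketch of the idea with a hypothetical helper standing in for get_active_gpu_count (whose real implementation may differ):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in: never use more GPUs than there are blocks to process.
inline uint32_t active_gpus_for(uint32_t num_blocks, uint32_t gpu_count) {
  return std::min(num_blocks, gpu_count);
}

int main() {
  uint32_t gpu_count = 8;
  // A LUT applied to a single block (sign, padding, ...) only needs to live on
  // one GPU, while a LUT applied to every radix block may need all of them.
  std::printf("%u\n", active_gpus_for(1, gpu_count));  // 1
  std::printf("%u\n", active_gpus_for(32, gpu_count)); // 8
  return 0;
}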

View File

@@ -232,8 +232,13 @@ template <typename Torus> struct zk_expand_mem {
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),

View File

@@ -157,12 +157,12 @@ void execute_keyswitch_async(cudaStream_t const *streams,
for (uint i = 0; i < gpu_count; i++) {
int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_array_out = get_variant_element(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
get_variant_element(lwe_output_indexes, i);
Torus *current_lwe_array_in = get_variant_element(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
get_variant_element(lwe_input_indexes, i);
// Compute Keyswitch
host_keyswitch_lwe_ciphertext_vector<Torus>(

View File

@@ -202,9 +202,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
auto stride_KSK_buffer = glwe_accumulator_size * level_count;
// Shared memory requirement is 4096, 8192, and 16384 bytes respectively for
// 32, 64, and 128-bit Torus elements We want to keep this as a sanity check
uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
// Shared memory requirement is 4096, 8192, and 16384 bytes respectively for
// 32, 64, and 128-bit Torus elements
// Sanity check: the shared memory size is a constant defined by the algorithm
GPU_ASSERT(shared_mem_size <= 1024 * sizeof(Torus),
"GEMM kernel error: shared memory required might be too large");

View File

@@ -1,15 +1,88 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
return static_cast<uint32_t>(device);
}
std::mutex pool_mutex;
bool mem_pools_enabled = false;
// We use memory pools to reduce some of the overhead of memory allocations
// caused by our scratch/release pattern. This function is the simplest way of
// using mempools: it modifies the default memory pool to use a threshold of 5%
// of the total memory:
// - Enable opportunistic reuse to maximize reuse in malloc/free patterns
// - Prevent memory from being released back to the OS too soon if it is within
// our threshold
// - Warm up the pool by allocating and freeing a large block of memory
// This function is called only once, the first time a GPU is set, and it
// configures all the GPUs available.
// We have measured an improvement of around 10% in our integer operations,
// especially the ones involving many allocations.
// We tested more complex configurations of mempools, but they did not yield
// better results.
void cuda_setup_mempool(uint32_t caller_gpu_index) {
if (!mem_pools_enabled) {
pool_mutex.lock();
if (mem_pools_enabled) {
pool_mutex.unlock(); // already enabled by another thread, nothing more to do
return;
}
// We do it only once for all GPUs
mem_pools_enabled = true;
uint32_t num_gpus = cuda_get_number_of_gpus();
for (uint32_t gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cuda_set_device(gpu_index);
size_t total_mem, free_mem;
check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem));
// If we have more than 5% of free memory, we can set up the mempool
uint64_t mem_pool_threshold = total_mem / 20; // 5% of total memory
mem_pool_threshold =
mem_pool_threshold - (mem_pool_threshold % 1024); // Align to 1KB
if (mem_pool_threshold < free_mem) {
// Get default memory pool
cudaMemPool_t default_pool;
check_cuda_error(cudaDeviceGetDefaultMemPool(&default_pool, gpu_index));
// Enable opportunistic reuse
int reuse = 1;
check_cuda_error(cudaMemPoolSetAttribute(
default_pool, cudaMemPoolReuseAllowOpportunistic, &reuse));
// Prevent memory from being released back to the OS too soon
check_cuda_error(cudaMemPoolSetAttribute(
default_pool, cudaMemPoolAttrReleaseThreshold,
&mem_pool_threshold));
// Warm up the pool by allocating and freeing a large block
cudaStream_t stream;
stream = cuda_create_stream(gpu_index);
void *warmup_ptr = nullptr;
warmup_ptr = cuda_malloc_async(mem_pool_threshold, stream, gpu_index);
cuda_drop_async(warmup_ptr, stream, gpu_index);
// Sync to ensure pool is grown
cuda_synchronize_stream(stream, gpu_index);
// Clean up
cuda_destroy_stream(stream, gpu_index);
}
}
// We return to the original gpu_index
cuda_set_device(caller_gpu_index);
pool_mutex.unlock();
}
}
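For reference, the configuration applied above can be reproduced with nothing but the CUDA runtime API. A minimal sketch under the same assumptions as the comment above (release threshold of 5% of total memory, opportunistic reuse, one warm-up allocation); error checking, the loop over all GPUs and the tfhe-rs wrappers (cuda_malloc_async, cuda_drop_async, ...) are left out:

#include <cstdint>
#include <cuda_runtime.h>

// Configure the default memory pool of a single device, mirroring the settings
// described in the comment above. Illustrative sketch only.
void configure_default_pool(int device) {
  cudaSetDevice(device);
  size_t free_mem = 0, total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);
  uint64_t threshold = total_mem / 20; // 5% of total memory
  threshold -= threshold % 1024;       // align to 1KB
  cudaMemPool_t pool;
  cudaDeviceGetDefaultMemPool(&pool, device);
  int reuse = 1; // allow reusing freed blocks for later allocations
  cudaMemPoolSetAttribute(pool, cudaMemPoolReuseAllowOpportunistic, &reuse);
  // keep up to `threshold` bytes in the pool instead of returning them to the OS
  cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
  // warm up: grow the pool once so later allocations are served from it
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  void *warmup = nullptr;
  cudaMallocAsync(&warmup, threshold, stream);
  cudaFreeAsync(warmup, stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}

std::call_once would be an alternative to the hand-rolled flag-and-mutex guard used above; the explicit flag has the advantage that the common path never takes the lock.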
void cuda_set_device(uint32_t gpu_index) {
check_cuda_error(cudaSetDevice(gpu_index));
// Mempools are initialized only once, for all the GPUs available
cuda_setup_mempool(gpu_index);
}
cudaEvent_t cuda_create_event(uint32_t gpu_index) {
@@ -329,6 +402,13 @@ int cuda_get_number_of_gpus() {
return num_gpus;
}
int cuda_get_number_of_sms() {
int num_sms = 0;
check_cuda_error(
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0));
return num_sms;
}
/// Drop a cuda array
void cuda_drop(void *ptr, uint32_t gpu_index) {
cuda_set_device(gpu_index);

View File

@@ -148,7 +148,8 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
}
lut = is_max_value_lut;
}
@@ -167,7 +168,10 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count_is_max =
get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {
@@ -499,7 +503,9 @@ __host__ void tree_sign_reduction(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true);
last_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(

View File

@@ -100,7 +100,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus,
carry_modulus, allocate_gpu_memory);
carry_modulus, PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_compress_integer_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
@@ -112,14 +112,15 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory, bool allocate_ms_array) {
bool allocate_gpu_memory) {
// 128-bit decompression doesn't run PBSs, so we don't need encryption_params
int_radix_params compression_params(
PBS_TYPE::CLASSICAL, compression_glwe_dimension,
compression_polynomial_size,
compression_glwe_dimension * compression_polynomial_size, lwe_dimension,
0, 0, 0, 0, 0, message_modulus, carry_modulus, allocate_ms_array);
0, 0, 0, 0, 0, message_modulus, carry_modulus,
PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_integer_decompress_radix_ciphertext<__uint128_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,

View File

@@ -344,7 +344,7 @@ host_integer_decompress(cudaStream_t const *streams,
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
extracted_lwe, lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
@@ -363,18 +363,21 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in,
gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_blocks_to_decompress,
extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, d_bsks, nullptr,
@@ -388,13 +391,20 @@ host_integer_decompress(cudaStream_t const *streams,
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
lwe_after_pbs_vec, lut->lwe_indexes_out,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j],
gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
} else {

View File

@@ -311,8 +311,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
scalar_indexes, merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,

View File

@@ -558,7 +558,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -567,16 +567,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -586,7 +590,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -595,15 +599,20 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -665,7 +674,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -674,16 +683,19 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -693,7 +705,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -702,15 +714,22 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
PUSH_RANGE("gather")
multi_gpu_gather_many_lut_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks, big_lwe_dimension + 1,
num_many_lut);
POP_RANGE()
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
@@ -787,7 +806,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)(lwe_array_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
@@ -795,13 +814,17 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
PUSH_RANGE("scatter")
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_after_ks_vec, lwe_trivial_indexes_vec,
@@ -811,7 +834,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -820,15 +843,20 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
num_many_lut, lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
(Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes,
num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
PUSH_RANGE("gather")
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
for (uint i = 0; i < num_radix_blocks; i++) {
@@ -1000,7 +1028,6 @@ void generate_device_accumulator_no_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1104,8 +1131,7 @@ void generate_device_accumulator_bivariate_with_factor(
h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f,
factor);
cuda_synchronize_stream(stream, gpu_index);
// copy host lut and lut_indexes_vec to device
cuda_memcpy_with_size_tracking_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream, gpu_index,
@@ -1137,7 +1163,6 @@ void generate_device_accumulator_with_encoding(
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream, gpu_index, gpu_memory_allocated);
cuda_synchronize_stream(stream, gpu_index);
free(h_lut);
}
@@ -1486,7 +1511,7 @@ void host_full_propagate_inplace(
streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, 1, 2,
mem_ptr->tmp_small_lwe_vector, 0, 1);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)mem_ptr->tmp_big_lwe_vector->ptr,
mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec,
mem_ptr->lut->lut_indexes_vec,
@@ -1668,6 +1693,7 @@ __host__ void reduce_signs(
"than the number of blocks to operate on")
auto diff_buffer = mem_ptr->diff_buffer;
auto active_gpu_count = mem_ptr->active_gpu_count;
auto params = mem_ptr->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1697,7 +1723,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1728,7 +1754,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
@@ -1748,7 +1774,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1774,7 +1800,8 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -1811,7 +1838,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -1848,7 +1876,8 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
POP_RANGE()
return size_tracker;
}
@@ -2344,11 +2373,17 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_128_async<__uint128_t>(
///
/// int_noise_squashing_lut doesn't support output or lut indexing other than
/// the trivial one
execute_pbs_async<uint64_t, __uint128_t>(
streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
lwe_trivial_indexes_vec[0], lut->lut_vec, lwe_trivial_indexes_vec,
lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks,
ms_noise_reduction_key, lut->pbs_buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
grouping_factor, lwe_array_out->num_radix_blocks, params.pbs_type, 0,
0);
} else {
/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -2357,9 +2392,10 @@ __host__ void integer_radix_apply_noise_squashing_kb(
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->active_gpu_count,
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec,
lut->active_gpu_count, lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
@@ -2367,19 +2403,23 @@ __host__ void integer_radix_apply_noise_squashing_kb(
ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
ks_level, lwe_array_out->num_radix_blocks);
execute_pbs_128_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, lwe_array_out->num_radix_blocks);
/// int_noise_squashing_lut doesn't support output or lut indexing other than
/// the trivial one
execute_pbs_async<uint64_t, __uint128_t>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lwe_trivial_indexes_vec,
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
lut->pbs_buffer, glwe_dimension, small_lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, grouping_factor,
lwe_array_out->num_radix_blocks, params.pbs_type, 0, 0);
/// Copy data back to GPU 0 and release vecs
/// In apply noise squashing we always use trivial indexes
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec, nullptr,
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {

View File

@@ -404,7 +404,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level,
total_messages);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
@@ -415,31 +415,10 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut,
lut_stride);
} else {
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
total_ciphertexts * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
total_ciphertexts * sizeof(Torus));
for (uint32_t i = 0; i < total_ciphertexts; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
total_ciphertexts * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < total_ciphertexts; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -479,7 +458,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
@@ -491,31 +470,9 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
Torus *h_lwe_indexes_in_pinned;
Torus *h_lwe_indexes_out_pinned;
cudaMallocHost((void **)&h_lwe_indexes_in_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
cudaMallocHost((void **)&h_lwe_indexes_out_pinned,
num_blocks_in_apply_lut * sizeof(Torus));
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
h_lwe_indexes_in_pinned[i] = luts_message_carry->h_lwe_indexes_in[i];
h_lwe_indexes_out_pinned[i] = luts_message_carry->h_lwe_indexes_out[i];
}
cuda_memcpy_async_to_cpu(
h_lwe_indexes_in_pinned, luts_message_carry->lwe_indexes_in,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_cpu(
h_lwe_indexes_out_pinned, luts_message_carry->lwe_indexes_out,
num_blocks_in_apply_lut * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint32_t i = 0; i < num_blocks_in_apply_lut; i++) {
luts_message_carry->h_lwe_indexes_in[i] = h_lwe_indexes_in_pinned[i];
luts_message_carry->h_lwe_indexes_out[i] = h_lwe_indexes_out_pinned[i];
}
cudaFreeHost(h_lwe_indexes_in_pinned);
cudaFreeHost(h_lwe_indexes_out_pinned);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->using_trivial_lwe_indexes = false;
integer_radix_apply_univariate_lookup_table_kb<Torus>(

View File

@@ -34,7 +34,7 @@ void host_integer_grouped_oprf(
auto lut = mem_ptr->luts;
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, (uint32_t)1, (Torus *)(radix_lwe_out->ptr),
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
const_cast<Torus *>(seeded_lwe_input), lut->lwe_indexes_in, bsks,
@@ -48,7 +48,10 @@ void host_integer_grouped_oprf(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
if (!lut->using_trivial_lwe_indexes) {
PANIC("lut->using_trivial_lwe_indexes should be true");
@@ -56,11 +59,11 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
active_gpu_count, num_blocks_to_process,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);
execute_pbs_async<Torus>(
execute_pbs_async<Torus, Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
@@ -72,12 +75,18 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_blocks_to_process,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
for (uint32_t i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
}
}
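The synchronization change repeated in the multi-GPU branches above replaces blocking per-stream synchronizations with a fork/join built on CUDA events: GPU 0 records an event that the worker streams wait on before consuming the scattered data, and after the gather each worker records an event that GPU 0's stream waits on. A standalone sketch of that pattern with the raw runtime API (the diff's cuda_event_record / cuda_stream_wait_event helpers wrap these calls; launch_work is a placeholder):

#include <cuda_runtime.h>
#include <vector>

// Fork/join between stream 0 and n-1 worker streams using events rather than
// host-side stream synchronizations. Error checking and event destruction are
// omitted in this sketch.
void fork_join(std::vector<cudaStream_t> &streams,
               const std::vector<int> &gpu_indexes) {
  size_t n = streams.size();
  std::vector<cudaEvent_t> done(n);
  cudaEvent_t input_ready;

  cudaSetDevice(gpu_indexes[0]);
  cudaEventCreateWithFlags(&input_ready, cudaEventDisableTiming);
  // GPU 0 signals that the data it prepared is ready...
  cudaEventRecord(input_ready, streams[0]);

  for (size_t j = 1; j < n; j++) {
    cudaSetDevice(gpu_indexes[j]);
    cudaEventCreateWithFlags(&done[j], cudaEventDisableTiming);
    // ...and every worker stream waits for it before touching that data.
    cudaStreamWaitEvent(streams[j], input_ready, 0);
    // launch_work(streams[j]); // placeholder for scatter + keyswitch + PBS
    cudaEventRecord(done[j], streams[j]);
  }

  // GPU 0 waits for all workers before using the gathered result.
  cudaSetDevice(gpu_indexes[0]);
  for (size_t j = 1; j < n; j++)
    cudaStreamWaitEvent(streams[0], done[j], 0);
}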

View File

@@ -47,7 +47,8 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,

View File

@@ -154,7 +154,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -253,7 +254,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -286,8 +288,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
one_block_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -434,7 +436,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -540,7 +543,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
signed_msb_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -588,8 +592,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
one_block_lut->broadcast_lut(streams, gpu_indexes);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -819,7 +823,11 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);
auto active_gpu_count =
get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
// We pass false because we only need to broadcast the indexes
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
active_gpu_count, false);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,

View File

@@ -7,6 +7,7 @@
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "helper_multi_gpu.h"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/polynomial_math.cuh"
@@ -202,15 +203,15 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_2_2_params(
// the buffer in registers to avoid synchronizations and shared memory usage
}
template <typename Torus>
template <typename InputTorus, typename OutputTorus>
void execute_pbs_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<Torus> &lwe_array_out,
const LweArrayVariant<Torus> &lwe_output_indexes,
const std::vector<Torus *> lut_vec,
const std::vector<Torus *> lut_indexes_vec,
const LweArrayVariant<Torus> &lwe_array_in,
const LweArrayVariant<Torus> &lwe_input_indexes,
uint32_t gpu_count, const LweArrayVariant<OutputTorus> &lwe_array_out,
const LweArrayVariant<InputTorus> &lwe_output_indexes,
const std::vector<OutputTorus *> lut_vec,
const std::vector<InputTorus *> lut_indexes_vec,
const LweArrayVariant<InputTorus> &lwe_array_in,
const LweArrayVariant<InputTorus> &lwe_input_indexes,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
@@ -219,8 +220,7 @@ void execute_pbs_async(
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
uint32_t num_many_lut, uint32_t lut_stride) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -238,12 +238,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams[i], gpu_indexes[i], current_lwe_array_out,
@@ -257,8 +257,7 @@ void execute_pbs_async(
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
case sizeof(uint64_t):
} else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -271,12 +270,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
@@ -300,12 +299,12 @@ void execute_pbs_async(
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
Torus *current_lwe_output_indexes =
GET_VARIANT_ELEMENT(lwe_output_indexes, i);
Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i);
Torus *current_lwe_input_indexes =
GET_VARIANT_ELEMENT(lwe_input_indexes, i);
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
@@ -328,10 +327,81 @@ void execute_pbs_async(
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.")
} else if constexpr (std::is_same_v<OutputTorus, __uint128_t>) {
// 128 bits
switch (pbs_type) {
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
// Use the helper to get the correct elements for the current iteration.
// It handles both the case where the inputs/outputs are scattered across
// different GPUs and the case where they are not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
}
break;
case CLASSICAL:
for (uint i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu =
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count);
// Use the helper to get the correct elements for the current iteration.
// Handles the case where the input/output are scattered across
// different gpus and the case where they are not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr &&
ms_noise_reduction_key->ptr != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vec[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu);
}
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
} else {
static_assert(
std::is_same_v<OutputTorus, uint32_t> ||
std::is_same_v<OutputTorus, uint64_t> ||
std::is_same_v<OutputTorus, __uint128_t>,
"Cuda error: unsupported modulus size: only 32, 64, or 128-bit integer "
"moduli are supported.");
}
}
@@ -344,8 +414,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type,
uint64_t &size_tracker) {
switch (sizeof(Torus)) {
case sizeof(uint32_t):
if constexpr (std::is_same_v<Torus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -359,8 +428,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
case sizeof(uint64_t):
} else if constexpr (std::is_same_v<Torus, uint64_t>) {
// 64 bits
switch (pbs_type) {
case MULTI_BIT:
@@ -379,10 +447,32 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
default:
PANIC("Error: unsupported cuda PBS type.")
}
break;
default:
PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer "
"moduli are supported.")
} else if constexpr (std::is_same_v<Torus, __uint128_t>) {
// 128 bits
switch (pbs_type) {
case MULTI_BIT:
if (grouping_factor == 0)
PANIC("Multi-bit PBS error: grouping factor should be > 0.")
size_tracker =
scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
break;
case CLASSICAL:
size_tracker = scratch_cuda_programmable_bootstrap_128(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, noise_reduction_type);
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
} else {
static_assert(
std::is_same_v<Torus, uint32_t> || std::is_same_v<Torus, uint64_t> ||
std::is_same_v<Torus, __uint128_t>,
"Cuda error: unsupported modulus size: only 32, 64, or 128-bit integer "
"moduli are supported.");
}
}

View File

@@ -1,45 +0,0 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"
static void execute_scratch_pbs_128(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type,
uint64_t &size_tracker_on_gpu) {
// The squash noise function receives as input 64-bit integers
size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, noise_reduction_type);
}
template <typename Torus>
static void execute_pbs_128_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
const std::vector<Torus *> lut_vector,
const LweArrayVariant<uint64_t> &lwe_array_in,
void *const *bootstrapping_keys,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples) {
for (uint32_t i = 0; i < gpu_count; i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);
Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
void *zeros = nullptr;
if (ms_noise_reduction_key != nullptr)
zeros = ms_noise_reduction_key->ptr[i];
cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, num_inputs_on_gpu);
}
}
#endif

View File

@@ -38,6 +38,19 @@ void multi_gpu_copy_array_async(cudaStream_t const *streams,
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Copy an array residing on one CPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_from_cpu_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
uint32_t elements_per_gpu, bool gpu_memory_allocated) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
cuda_memcpy_with_size_tracking_async_to_gpu(
dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
@@ -93,6 +106,35 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}
// This function reads LWEs using the indexes and places them in a single aligned
// array. It is needed before communication to perform a single
// contiguous data movement. Each block handles one LWE.
template <typename Torus>
__global__ void align_with_indexes(Torus *d_packed_vector,
Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {
int output_offset = blockIdx.x * lwe_size;
int input_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_packed_vector[ind + output_offset] = d_vector[ind + input_offset];
}
}
// This function takes the aligned array after communication and places it back
// at the corresponding indexes. Each block handles one LWE.
template <typename Torus>
__global__ void realign_with_indexes(Torus *d_vector,
Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {
int input_offset = blockIdx.x * lwe_size;
int output_offset = d_indexes[blockIdx.x] * lwe_size;
for (int ind = threadIdx.x; ind < lwe_size; ind += blockDim.x) {
d_vector[ind + output_offset] = d_packed_vector[ind + input_offset];
}
}
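The two kernels above only rearrange data by index; a CPU-side sketch of the same packing and unpacking arithmetic (hypothetical Rust helpers, not part of this patch) may make the index math easier to follow. Packing first lets the later device-to-device transfer move inputs_on_gpu * lwe_size elements in one contiguous copy instead of one copy per LWE.

// Hypothetical CPU analogue of the align/realign kernels above, for illustration only.
fn cpu_align_with_indexes(packed: &mut [u64], vector: &[u64], indexes: &[usize], lwe_size: usize) {
    for (block, &idx) in indexes.iter().enumerate() {
        // block-th packed slot <- LWE selected by indexes[block]
        packed[block * lwe_size..(block + 1) * lwe_size]
            .copy_from_slice(&vector[idx * lwe_size..(idx + 1) * lwe_size]);
    }
}

fn cpu_realign_with_indexes(vector: &mut [u64], packed: &[u64], indexes: &[usize], lwe_size: usize) {
    for (block, &idx) in indexes.iter().enumerate() {
        // LWE selected by indexes[block] <- block-th packed slot
        vector[idx * lwe_size..(idx + 1) * lwe_size]
            .copy_from_slice(&packed[block * lwe_size..(block + 1) * lwe_size]);
    }
}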
/// Load an array residing on one GPU to all active gpus
/// and split the array among them.
/// The input indexing logic is given by an index array.
@@ -102,15 +144,15 @@ template <typename Torus>
void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *h_src_indexes,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {
if (max_active_gpu_count < gpu_count)
PANIC("Cuda error: number of gpus in scatter should be <= number of gpus "
"used to create the lut")
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count);
@@ -127,18 +169,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);
} else {
if (h_src_indexes == nullptr)
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
auto src_indexes = h_src_indexes + gpu_offset;
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest[i] + j * lwe_size;
auto d_src = src + src_indexes[j] * lwe_size;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
}
}
}
@@ -150,7 +202,8 @@ template <typename Torus>
void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *h_dest_indexes, bool is_trivial_index,
Torus *d_dest_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t num_inputs, uint32_t lwe_size) {
for (uint i = 0; i < gpu_count; i++) {
@@ -168,19 +221,27 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
if (h_dest_indexes == nullptr)
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");
auto dest_indexes = h_dest_indexes + gpu_offset;
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
for (uint j = 0; j < inputs_on_gpu; j++) {
auto d_dest = dest + dest_indexes[j] * lwe_size;
auto d_src = src[i] + j * lwe_size;
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
d_dest, d_src, lwe_size * sizeof(Torus), streams[i], gpu_indexes[i],
true);
}
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);
cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
realign_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);
check_cuda_error(cudaGetLastError());
}
}
}

View File

@@ -289,7 +289,6 @@ unsafe extern "C" {
message_modulus: u32,
carry_modulus: u32,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {

View File

@@ -88,6 +88,8 @@ extern "C" {
pub fn cuda_get_number_of_gpus() -> i32;
pub fn cuda_get_number_of_sms() -> i32;
pub fn cuda_synchronize_device(gpu_index: u32);
pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);

View File

@@ -15,7 +15,7 @@
[rtl]
bpip_use = true
bpip_use_opportunism = true
bpip_use_opportunism = false
bpip_timeout = 100_000
[board]
@@ -35,13 +35,21 @@
bsk_pc = [
{Hbm={pc=8}},
{Hbm={pc=10}},
{Hbm={pc=12}},
{Hbm={pc=14}},
{Hbm={pc=24}},
{Hbm={pc=26}},
{Hbm={pc=28}},
{Hbm={pc=30}},
{Hbm={pc=40}},
{Hbm={pc=42}},
{Hbm={pc=44}},
{Hbm={pc=46}},
{Hbm={pc=56}},
{Hbm={pc=60}}
{Hbm={pc=58}},
{Hbm={pc=60}},
{Hbm={pc=62}}
]
ksk_pc = [
@@ -70,7 +78,7 @@
#implementation = "Ilp"
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
min_batch_size = 11
min_batch_size = 9
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
@@ -87,8 +95,8 @@
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
fill_batch_fifo = true
min_batch_size = false
fill_batch_fifo = false
min_batch_size = true
use_tiers = false
flush_behaviour = "Patient"
flush = true

View File

@@ -49,3 +49,15 @@ offset= 0x10
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
[section.hpu_reset]
description="Used to control the HPU soft reset"
offset= 0x100
[section.hpu_reset.register.trigger]
description="A soft reset for the whole HPU reconfigurable logic"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1d1afb554756df4d8b39bee33ded2dda19c23a6f9d8e2b242092efd35cf1cc19
size 83281321
oid sha256:f077c9cebbd56ba83c93ed0fdb4dea4f431dd6ee59be436ffbd8225e3ce82f49
size 84230351

View File

@@ -31,14 +31,22 @@ class LD(BaseInstruction):
self.__dict__ = d
def args(self):
return f'R{self.rid} @{hex(self.slot["Addr"])}'
try:
return f'R{self.rid} @{hex(self.slot["Addr"])}'
except:
# It can happen that an IOP is not translated by the FW
return f'R{self.rid} @{self.slot}'
class ST(BaseInstruction):
def __init__(self, d):
self.__dict__ = d
def args(self):
return f'@{hex(self.slot["Addr"])} R{self.rid}'
try:
return f'@{hex(self.slot["Addr"])} R{self.rid}'
except:
# It can happen that an IOP is not translated by the FW
return f'@{self.slot} R{self.rid}'
class MAC(BaseInstruction):
def __init__(self, d):

View File

@@ -176,6 +176,18 @@ pub const IOP_2CT_F_CT_SCALAR: ConstIOpProto<2, 1> = ConstIOpProto {
imm: 1,
};
pub const SIMD_N: usize = 9; //TODO: We need to come up with a way to make this dynamic
pub const IOP_NCT_F_2NCT: ConstIOpProto<{ SIMD_N }, { 2 * SIMD_N }> = ConstIOpProto {
dst: [VarMode::Native; SIMD_N],
src: [VarMode::Native; 2 * SIMD_N],
imm: 0,
};
pub const IOP_2NCT_F_3NCT: ConstIOpProto<{ 2 * SIMD_N }, { 3 * SIMD_N }> = ConstIOpProto {
dst: [VarMode::Native; 2 * SIMD_N],
src: [VarMode::Native; 3 * SIMD_N],
imm: 0,
};
use crate::iop;
use arg::IOpFormat;
use lazy_static::lazy_static;
@@ -227,4 +239,6 @@ iop!(
[IOP_CT_F_CT -> "LEAD1", opcode::LEAD1],
[IOP_CT_F_CT -> "TRAIL0", opcode::TRAIL0],
[IOP_CT_F_CT -> "TRAIL1", opcode::TRAIL1],
[IOP_NCT_F_2NCT -> "ADD_SIMD", opcode::ADD_SIMD],
[IOP_2NCT_F_3NCT -> "ERC_20_SIMD", opcode::ERC_20_SIMD],
);

View File

@@ -87,6 +87,10 @@ pub const LEAD1: u8 = 0x85;
pub const TRAIL0: u8 = 0x86;
pub const TRAIL1: u8 = 0x87;
// SIMD for maximum throughput
pub const ADD_SIMD: u8 = 0xF0;
pub const ERC_20_SIMD: u8 = 0xF1;
//
// Utility operations
// Used to handle real clone of ciphertext already uploaded in the Hpu memory
pub const MEMCPY: u8 = 0xFF;

View File

@@ -72,6 +72,9 @@ crate::impl_fw!("Ilp" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
#[instrument(level = "trace", skip(prog))]

View File

@@ -57,16 +57,16 @@ crate::impl_fw!("Llt" [
OVF_SSUB => fw_impl::ilp::iop_overflow_ssub;
OVF_MULS => fw_impl::ilp::iop_overflow_muls;
BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});
CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});
IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;
@@ -81,6 +81,10 @@ crate::impl_fw!("Llt" [
LEAD1 => fw_impl::ilp_log::iop_lead1;
TRAIL0 => fw_impl::ilp_log::iop_trail0;
TRAIL1 => fw_impl::ilp_log::iop_trail1;
// SIMD Implementations
ADD_SIMD => fw_impl::llt::iop_add_simd;
ERC_20_SIMD => fw_impl::llt::iop_erc_20_simd;
]);
// ----------------------------------------------------------------------------
@@ -102,6 +106,17 @@ pub fn iop_add(prog: &mut Program) {
iop_addx(prog, dst, src_a, src_b);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ADD_SIMD Operand::Dst Operand::Src Operand::Src".to_string());
simd(
prog,
crate::asm::iop::SIMD_N,
fw_impl::llt::iop_add_ripple_rtl,
);
}
pub fn iop_adds(prog: &mut Program) {
// Allocate metavariables:
// Dest -> Operand
@@ -189,7 +204,7 @@ pub fn iop_mul(prog: &mut Program) {
// Add Comment header
prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string());
// Deferred implementation to generic mulx function
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
@@ -205,29 +220,50 @@ pub fn iop_muls(prog: &mut Program) {
// Add Comment header
prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string());
// Deferred implementation to generic mulx function
iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
iop_erc_20_rtl(prog, 0).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
}
// ----------------------------------------------------------------------------
// Helper Functions
// ----------------------------------------------------------------------------
/// Implement erc_20 fund xfer
/// The targeted algorithm is as follows:
/// 1. Check that from has enough funds
/// 2. Compute real_amount to xfer (i.e. amount or 0)
/// 3. Compute the new amounts (from - real_amount, to + real_amount)
///
/// The input operands are:
/// (from[0], to[0], amount[0], ..., from[N-1], to[N-1], amount[N-1])
/// The output operands are:
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20(prog: &mut Program) {
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
// Allocate metavariables:
// Dest -> Operand
let dst_from = prog.iop_template_var(OperandKind::Dst, 0);
let dst_to = prog.iop_template_var(OperandKind::Dst, 1);
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
let dst_to = prog.iop_template_var(OperandKind::Dst, 2 * batch_index + 1);
// Src -> Operand
let src_from = prog.iop_template_var(OperandKind::Src, 0);
let src_to = prog.iop_template_var(OperandKind::Src, 1);
let src_from = prog.iop_template_var(OperandKind::Src, 3 * batch_index);
let src_to = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 1);
// Src Amount -> Operand
let src_amount = prog.iop_template_var(OperandKind::Src, 2);
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
@@ -236,7 +272,7 @@ pub fn iop_erc_20(prog: &mut Program) {
let kogge_blk_w = 10;
let ripple = true;
let tree = {
{
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let lut = pbs_by_name!("IfFalseZeroed");
@@ -273,13 +309,26 @@ pub fn iop_erc_20(prog: &mut Program) {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
};
tree.add_to_prog(prog);
}
}
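For reference, the batched operand layout described in the doc comment above boils down to a fixed index mapping per batch element; a small standalone sketch (hypothetical helper, not firmware code) spells it out:

// Hypothetical illustration of the SIMD ERC-20 operand indexing: batch element i
// writes destinations (2*i, 2*i + 1) and reads sources (3*i, 3*i + 1, 3*i + 2).
fn erc20_simd_operand_indexes(i: usize) -> ([usize; 2], [usize; 3]) {
    ([2 * i, 2 * i + 1], [3 * i, 3 * i + 1, 3 * i + 2])
}

fn main() {
    // Batch element 2 writes (dst_from, dst_to) = operands 4, 5
    // and reads (from, to, amount) = operands 6, 7, 8.
    assert_eq!(erc20_simd_operand_indexes(2), ([4, 5], [6, 7, 8]));
}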
/// A SIMD implementation of add for maximum throughput
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
// Allocate metavariables:
let dst = prog.iop_template_var(OperandKind::Dst, i);
let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
let src_b = prog.iop_template_var(OperandKind::Src, 2 * i + 1);
// Convert MetaVarCell in VarCell for Rtl analysis
let a = VarCell::from_vec(src_a);
let b = VarCell::from_vec(src_b);
let d = VarCell::from_vec(dst);
// Do a + b with the ripple carry adder
kogge::ripple_add(d, a, b, None)
}
// ----------------------------------------------------------------------------
// Helper Functions
// ----------------------------------------------------------------------------
fn iop_addx(
prog: &mut Program,
dst: Vec<MetaVarCell>,
@@ -313,11 +362,181 @@ fn iop_subx(
.add_to_prog(prog);
}
/// Generic mul operation for massively parallel HPUs
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx_par(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
src_b: Vec<metavar::MetaVarCell>,
) -> Rtl {
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
let blk_w = props.blk_w();
// Transform metavars into RTL vars
let mut dst = VarCell::from_vec(dst);
let src_a = VarCell::from_vec(src_a);
let src_b = VarCell::from_vec(src_b);
let max_deg = VarDeg {
deg: props.max_val(),
nu: props.nu,
};
let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb");
let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb");
let max_carry = (props.max_msg() * props.max_msg()) >> props.msg_w;
let max_msg = props.max_msg();
let mut mul_map: HashMap<usize, Vec<VarCellDeg>> = HashMap::new();
itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| {
let pp = src_a[i].mac(tfhe_params.msg_range(), &src_b[j]);
let lsb = pp.single_pbs(&pbs_mul_lsb);
let msb = pp.single_pbs(&pbs_mul_msb);
mul_map
.entry(i + j)
.or_default()
.push(VarCellDeg::new(max_msg, lsb));
mul_map
.entry(i + j + 1)
.or_default()
.push(VarCellDeg::new(max_carry, msb));
});
let mut pp: Vec<VecVarCellDeg> = (0..dst.len())
.map(|i| mul_map.remove(&i).unwrap().into())
.collect();
// Reduce the partial products, Dadda-tree style
while pp.iter().any(|x| x.len() > 1) {
trace!(
target: "llt::mul",
"pp length: {:?}",
pp.iter().map(|x| x.len()).collect::<Vec<_>>()
);
for c in (0..dst.len()).rev() {
let mut col_len = pp[c].len();
let mut reduced = Vec::new();
let mut chunks = pp[c].deg_chunks(&max_deg).peekable();
let max_col = if c == (dst.len() - 1) {
0
} else {
dst.len() - 1
};
while chunks.peek().is_some() && col_len > pp[max_col].len() {
let mut chunk = chunks.next().unwrap();
let chunk_len = chunk.len();
col_len -= chunk.len();
// sum the chunk
while chunk.len() > 1 {
chunk = chunk
.chunks(2)
.map(|chunk| match chunk.len() {
1 => chunk[0].clone(),
2 => &chunk[0] + &chunk[1],
_ => panic!("Invalid chunk size"),
})
.collect()
}
// And bootstrap if needed
let element = chunk
.into_iter()
.next()
.map(|sum| {
assert!(sum.deg.nu <= props.nu);
if sum.deg == max_deg || chunk_len == 1 {
let (data, carry) = sum.bootstrap(&props);
if let (Some(carry), Some(elm)) = (carry, pp.get_mut(c + 1)) {
elm.push(carry);
}
data
} else {
sum
}
})
.unwrap();
reduced.push(element);
}
pp[c] = reduced
.into_iter()
.chain(chunks.flatten())
.collect::<Vec<_>>()
.into();
}
}
trace!(
target: "llt::mul",
"final pp: {:?}", pp
);
// Extract carry and message and do carry propagation
let mut a: Vec<Option<VarCell>> = (0..dst.len() + 1).map(|_| None).collect();
let mut b: Vec<Option<VarCell>> = (0..dst.len() + 1).map(|_| None).collect();
pp.into_iter().enumerate().for_each(|(i, pp)| {
assert!(pp.len() == 1);
let vardeg = pp.first().unwrap();
let (msg, carry) = vardeg.bootstrap(&props);
a[i] = Some(msg.var);
if let Some(carry) = carry {
b[i + 1] = Some(carry.var);
}
});
let cs: Vec<_> = a
.into_iter()
.take(dst.len())
.zip(b.into_iter())
.map(|(a, b)| match (a, b) {
(Some(a), Some(b)) => &a + &b,
(Some(a), None) => a,
(None, Some(b)) => b,
_ => panic!("Fix your code"),
})
.collect();
// Do fully parallel carry propagation
kogge::propagate_carry(prog, dst.as_mut_slice(), cs.as_slice(), &None);
Rtl::from(dst)
}
/// multiplier wrapper, to choose between parallel and serial implementations
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
src_b: Vec<metavar::MetaVarCell>,
) -> Rtl {
// When the batch size is enough to do a full stage in parallel, do parallel
// mul.
// Note: The break-even point might not be exactly this one, but choosing the
// right point is unimportant since the number of batches will grow immensely
// from FPGA to ASIC.
let parallel = prog
.op_cfg()
.parallel
.unwrap_or_else(|| prog.params().pbs_batch_w >= dst.len());
if parallel {
iop_mulx_par(prog, dst, src_a, src_b)
} else {
iop_mulx_ser(prog, dst, src_a, src_b)
}
}
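To make the dispatch above concrete, here is a standalone restatement of the condition with assumed block counts for a 2_2 parameter set (the numbers are illustrative, not taken from the patch):

// Hypothetical restatement of the parallel/serial multiplier dispatch above.
fn choose_parallel(cfg_override: Option<bool>, pbs_batch_w: usize, num_dst_blocks: usize) -> bool {
    cfg_override.unwrap_or(pbs_batch_w >= num_dst_blocks)
}

fn main() {
    // Assuming a PBS batch width of 16:
    assert!(choose_parallel(None, 16, 4)); // small radix (e.g. 8 bits, 4 blocks): parallel
    assert!(!choose_parallel(None, 16, 32)); // 64-bit radix (32 blocks): serial
    assert!(choose_parallel(Some(true), 16, 32)); // explicit op_cfg override wins
}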
/// Generic mul operation
/// One destination and two sources operation
/// Source could be Operand or Immediat
#[instrument(level = "trace", skip(prog))]
pub fn iop_mulx(
pub fn iop_mulx_ser(
prog: &mut Program,
dst: Vec<metavar::MetaVarCell>,
src_a: Vec<metavar::MetaVarCell>,
@@ -367,7 +586,10 @@ pub fn iop_mulx(
sum.var.single_pbs(&pbs_carry),
));
}
VarCellDeg::new(props.max_msg(), sum.var.single_pbs(&pbs_msg))
VarCellDeg::new(
sum.deg.deg.min(props.max_msg()),
sum.var.single_pbs(&pbs_msg),
)
};
while to_sum.len() > 1 {
@@ -540,3 +762,23 @@ fn bw_inv(prog: &mut Program, b: Vec<VarCell>) -> Vec<VarCell> {
})
.collect::<Vec<_>>()
}
/// Creates a SIMD version of the closure.
/// Make sure that the closure is a PBS-optimized version of the operation.
/// The closure receives the program and the batch index as inputs.
/// How the ASM operands are actually organized is defined by the closure
/// itself.
///
/// Maybe this should go into a SIMD firmware implementation... At some point we
/// would need a mechanism to choose between implementations on the fly to make
/// really good use of all of this.
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F)
where
F: Fn(&mut Program, u8) -> Rtl,
{
(0..batch_size)
.map(|i| i as u8)
.map(|i| rtl_closure(prog, i))
.sum::<Rtl>()
.add_to_prog(prog);
}

View File

@@ -1,4 +1,6 @@
use super::rtl::VarCell;
use super::*;
use crate::pbs_by_name;
use tracing::trace;
#[derive(Clone, Eq, Default, Debug)]
@@ -48,6 +50,48 @@ pub struct VarCellDeg {
pub deg: VarDeg,
}
impl VarCellDeg {
pub fn bootstrap(&self, props: &FwParameters) -> (VarCellDeg, Option<VarCellDeg>) {
trace!(target: "vardeg::VarCellDeg::bootstrap", "bootstrap: {:?}", self);
let pbs_many_carry = pbs_by_name!("ManyCarryMsg");
let pbs_carry = pbs_by_name!("CarryInMsg");
let pbs_msg = pbs_by_name!("MsgOnly");
if self.deg.deg <= props.max_msg() {
match self.deg.nu {
1 => (self.clone(), None),
_ => (
VarCellDeg::new(self.deg.deg, self.var.single_pbs(&pbs_msg)),
None,
),
}
// If we still have a bit available to do manyLUT
} else if self.deg.deg > props.max_msg() && self.deg.deg <= (props.max_val() >> 1) {
let mut pbs = self.var.pbs(&pbs_many_carry).into_iter();
(
VarCellDeg::new(props.max_msg().min(self.deg.deg), pbs.next().unwrap()),
Some(VarCellDeg::new(
self.deg.deg >> props.carry_w,
pbs.next().unwrap(),
)),
)
//Otherwise, we'll have to use two independent PBSs
} else {
(
VarCellDeg::new(
self.deg.deg.min(props.max_msg()),
self.var.single_pbs(&pbs_msg),
),
Some(VarCellDeg::new(
self.deg.deg >> props.carry_w,
self.var.single_pbs(&pbs_carry),
)),
)
}
}
}
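The three branches of bootstrap() pick the cheapest PBS plan from the tracked degree; the cutoffs for an assumed 2-bit message / 2-bit carry parameter set (max_msg = 3, max_val = 15, values assumed for illustration) work out as in this standalone sketch:

// Hypothetical mirror of the degree-based PBS selection above.
#[derive(Debug, PartialEq)]
enum BootstrapKind {
    MsgOnly, // deg <= max_msg: at most one message-only PBS (none if nu == 1)
    ManyLut, // max_msg < deg <= max_val / 2: one many-LUT PBS yields both msg and carry
    TwoPbs,  // otherwise: two independent PBSs, one for msg and one for carry
}

fn classify(deg: u64, max_msg: u64, max_val: u64) -> BootstrapKind {
    if deg <= max_msg {
        BootstrapKind::MsgOnly
    } else if deg <= max_val >> 1 {
        BootstrapKind::ManyLut
    } else {
        BootstrapKind::TwoPbs
    }
}

fn main() {
    let (max_msg, max_val) = (3, 15); // assumed 2_2 parameters
    assert_eq!(classify(3, max_msg, max_val), BootstrapKind::MsgOnly);
    assert_eq!(classify(5, max_msg, max_val), BootstrapKind::ManyLut);
    assert_eq!(classify(9, max_msg, max_val), BootstrapKind::TwoPbs);
}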
impl PartialOrd for VarCellDeg {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
@@ -105,25 +149,23 @@ impl std::fmt::Debug for VarCellDeg {
}
impl VecVarCellDeg {
pub fn deg_chunks(
mut self,
max_deg: &VarDeg,
) -> <Vec<Vec<VarCellDeg>> as IntoIterator>::IntoIter {
pub fn deg_chunks(&self, max_deg: &VarDeg) -> <Vec<Vec<VarCellDeg>> as IntoIterator>::IntoIter {
trace!(target: "llt:deg_chunks", "len: {:?}, {:?}", self.len(), self.0);
let mut res: Vec<Vec<VarCellDeg>> = Vec::new();
let mut acc: VarDeg = VarDeg::default();
let mut chunk: Vec<VarCellDeg> = Vec::new();
let mut copy = self.0.clone();
// There are many ways to combine the whole vector in chunks up to
// max_deg. We'll be greedy and sum up the elements by maximum degree
// first.
self.0.sort();
copy.sort();
while !self.is_empty() {
let sum = &acc + &self.0.last().unwrap().deg;
while !copy.is_empty() {
let sum = &acc + &copy.last().unwrap().deg;
if sum <= *max_deg {
chunk.push(self.0.pop().unwrap());
chunk.push(copy.pop().unwrap());
acc = sum;
} else {
res.push(chunk);
@@ -131,7 +173,7 @@ impl VecVarCellDeg {
chunk = Vec::new();
}
trace!(target: "llt:deg_chunks:loop", "len: {:?}, {:?}, chunk: {:?}, acc: {:?}",
self.len(), self.0, chunk, acc);
self.len(), copy, chunk, acc);
}
// Any remaining chunk is appended
@@ -159,4 +201,8 @@ impl VecVarCellDeg {
pub fn is_empty(&self) -> bool {
self.0.len() == 0
}
pub fn push(&mut self, item: VarCellDeg) {
self.0.push(item)
}
}

View File

@@ -12,6 +12,8 @@ pub struct OpCfg {
pub flush: bool,
/// Whether to use latency tiers when scheduling
pub use_tiers: bool,
/// Whether to use a massively parallel implementation
pub parallel: Option<bool>,
}
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]

View File

@@ -1500,7 +1500,7 @@ impl Arch {
}
}
#[derive(Clone, Debug)]
#[derive(Default, Clone, Debug)]
pub struct Rtl(Vec<VarCell>);
impl Rtl {
@@ -1623,6 +1623,12 @@ impl std::ops::Add<Rtl> for Rtl {
}
}
impl std::iter::Sum<Rtl> for Rtl {
fn sum<I: Iterator<Item = Rtl>>(iter: I) -> Self {
iter.fold(Rtl::default(), |acc, x| acc + x)
}
}
impl Drop for Rtl {
fn drop(&mut self) {
self.unload();

View File

@@ -354,7 +354,7 @@ impl InfoPePbs {
}
pub fn update_load_bsk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
(1..16).for_each(|i| {
(0..16).for_each(|i| {
let reg_name = format!("runtime_3in3::pep_load_bsk_rcp_dur_pc{i}");
let reg = regmap
.register()
@@ -364,7 +364,7 @@ impl InfoPePbs {
});
}
pub fn update_load_ksk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
(1..16).for_each(|i| {
(0..16).for_each(|i| {
let reg_name = format!("runtime_1in3::pep_load_ksk_rcp_dur_pc{i}");
let reg = regmap
.register()

View File

@@ -64,13 +64,13 @@ pub struct IscPoolState {
pub(super) vld: bool,
pub(super) wr_lock: u32,
pub(super) rd_lock: u32,
//pub(super) issue_lock: u32,
pub(super) issue_lock: u32,
pub(super) sync_id: u32,
}
impl Len for IscPoolState {
fn len() -> usize {
21
28
}
}
@@ -85,8 +85,8 @@ where
vld: *(slice.get(2).ok_or(NoMoreBits)?),
wr_lock: slice.get(3..10).ok_or(NoMoreBits)?.load::<u32>(),
rd_lock: slice.get(10..17).ok_or(NoMoreBits)?.load::<u32>(),
//issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
sync_id: slice.get(17..21).ok_or(NoMoreBits)?.load::<u32>(),
issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::<u32>(),
sync_id: slice.get(24..28).ok_or(NoMoreBits)?.load::<u32>(),
})
}
}
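The len() change from 21 to 28 follows directly from re-enabling the issue_lock field: the decoded slices are vld (bit 2), wr_lock (3..10), rd_lock (10..17), issue_lock (17..24) and sync_id (24..28), so the last field now ends at bit 28. A standalone consistency check, not part of the source:

// Hypothetical check that the bit ranges decoded above are contiguous and end at 28.
fn main() {
    let ranges = [(2, 3), (3, 10), (10, 17), (17, 24), (24, 28)]; // vld .. sync_id
    assert!(ranges.windows(2).all(|w| w[0].1 == w[1].0));
    assert_eq!(ranges.last().unwrap().1, 28); // matches the new Len::len()
}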

View File

@@ -79,6 +79,9 @@ pub enum Command {
#[arg(short, long, default_value_t = String::from("trace.json"))]
file: String,
},
#[clap(about = "Resets all HPU processing logic")]
SoftReset {},
}
#[derive(Clone, Debug, ValueEnum)]
@@ -286,5 +289,26 @@ fn main() {
serde_json::to_writer_pretty(file.make_writer(), &parsed)
.expect("Could not write trace dump");
}
Command::SoftReset {} => {
let soft_reset = regmap
.register()
.get("hpu_reset::trigger")
.expect("The current HPU does not support soft reset.");
let soft_reset_addr = *soft_reset.offset() as u64;
for reset in [true, false].into_iter() {
hpu_hw.write_reg(soft_reset_addr, reset as u32);
loop {
let done = {
let val = hpu_hw.read_reg(soft_reset_addr);
let fields = soft_reset.as_field(val);
*fields.get("done").expect("Unknown field") != 0
};
if done == reset {
break;
}
}
}
}
}
}

View File

@@ -12,5 +12,9 @@
"n3-H100x4": 6.08,
"n3-H100x2": 3.04,
"n3-L40x1": 0.80,
"n3-H100x8-SXM5": 19.2
"n3-H100x8-SXM5": 19.2,
"hpu_x1": 1.0,
"hpu_x2": 1.4,
"hpu_x4": 2.3,
"hpu_x8": 4.0
}

View File

@@ -36,7 +36,7 @@
[pc_params]
ksk_pc= 16
ksk_bytes_w= 32
bsk_pc= 8
bsk_pc= 16
bsk_bytes_w= 32
pem_pc= 2
pem_bytes_w= 32

View File

@@ -23,7 +23,7 @@ V80_PCIE_DEV="unselected"
XILINX_VIVADO=${XILINX_VIVADO:-"/opt/amd/Vivado/2024.2"}
# V80 bitstream refresh require insmod of ami.ko module
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/1e6a8da"}
AMI_PATH=${AMI_PATH:-"/opt/v80/ami/ef9249f"}
# Parse user CLI ##############################################################
opt_short="hc:l:p:"
@@ -63,6 +63,9 @@ do
if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
V80_PCIE_DEV="${2}"
((i++))
shift 1
elif [[ ${#DEVICE[@]} -eq 1 ]]; then
V80_PCIE_DEV=${DEVICE[0]%%:*}
else
echo "Please select a device in following list (1st two digits):"
for item in "${DEVICE[@]}"; do
@@ -70,7 +73,7 @@ do
done
return 1
fi
shift 2
shift 1
;;
"") # End of input reading
break ;;

View File

@@ -29,9 +29,15 @@ fn bench_fhe_type<FheType>(
+ RotateRight<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>,
for<'a> FheType: FheMin<&'a FheType, Output = FheType> + FheMax<&'a FheType, Output = FheType>,
{
let mut bench_group = c.benchmark_group(type_name);
let bench_prefix = "hlapi::ops";
let mut bench_prefix = "hlapi::ops".to_string();
if cfg!(feature = "gpu") {
bench_prefix = format!("{}::cuda", bench_prefix);
} else if cfg!(feature = "hpu") {
bench_prefix = format!("{}::hpu", bench_prefix);
}
let mut rng = thread_rng();
@@ -66,31 +72,25 @@ fn bench_fhe_type<FheType>(
});
write_record(bench_id, "add");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_add");
}
bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_add");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_sub");
}
bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_sub");
bench_id = format!("{bench_prefix}::sub::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
@@ -142,48 +142,65 @@ fn bench_fhe_type<FheType>(
});
write_record(bench_id, "bitxor");
#[cfg(not(feature = "hpu"))]
{
bench_id = format!("{bench_prefix}::left_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_shift");
bench_id = format!("{bench_prefix}::left_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_shift");
bench_id = format!("{bench_prefix}::right_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_shift");
bench_id = format!("{bench_prefix}::right_shift::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_shift");
bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_rotate");
bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_rotate");
bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_rotate");
}
bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_rotate");
bench_id = format!("{bench_prefix}::min::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.min(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "min");
bench_id = format!("{bench_prefix}::max::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.max(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "max");
}
macro_rules! bench_type {

View File

@@ -1,9 +1,16 @@
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::params_aliases::{
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
#[cfg(not(feature = "gpu"))]
use benchmark::params_aliases::BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use benchmark::params_aliases::{
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
#[cfg(feature = "gpu")]
use benchmark::utilities::configure_gpu;
use benchmark::utilities::{
@@ -21,11 +28,13 @@ use tfhe::core_crypto::gpu::get_number_of_gpus;
#[cfg(feature = "gpu")]
use tfhe::{set_server_key, GpuIndex};
use tfhe::{
ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16,
FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
ClientKey, CompressedCiphertextListBuilder, CompressedServerKey,
CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14,
FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, HlCompressible,
HlExpandable, HlSquashedNoiseCompressible,
};
fn bench_fhe_type<FheType>(
fn bench_sns_only_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
@@ -139,27 +148,184 @@ fn bench_fhe_type<FheType>(
);
}
macro_rules! bench_type {
fn bench_decomp_sns_comp_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
num_bits: usize,
) where
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
FheType: SquashNoise + Tagged + HlExpandable + HlCompressible,
<FheType as SquashNoise>::Output: HlSquashedNoiseCompressible,
{
let mut bench_group = c.benchmark_group(type_name);
let bench_id_prefix = if cfg!(feature = "gpu") {
"hlapi::cuda"
} else {
"hlapi"
};
let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}");
let mut rng = thread_rng();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
let compressed = builder.build().unwrap();
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let decompressed = compressed.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder = CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_id_prefix}::throughput::{bench_id_suffix}");
let params = client_key.computation_parameters();
let num_blocks = num_bits
.div_ceil((params.message_modulus().0 * params.carry_modulus().0).ilog2() as usize);
#[cfg(feature = "gpu")]
{
let elements = throughput_num_threads(num_blocks, 4);
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
let compressed_server_key = CompressedServerKey::new(client_key);
let sks_vec = (0..gpu_count)
.map(|i| {
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
})
.collect::<Vec<_>>();
bench_group.bench_function(&bench_id, |b| {
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
})
.collect::<Vec<_>>()
};
b.iter_batched(
compressed_values,
|compressed_inputs| {
compressed_inputs
.par_iter()
.enumerate()
.for_each(|(i, input)| {
set_server_key(sks_vec[i % gpu_count].clone());
let decompressed = input.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder =
CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
},
criterion::BatchSize::SmallInput,
)
});
}
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let elements = throughput_num_threads(num_blocks, 1);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
})
.collect::<Vec<_>>()
};
b.iter_batched(
compressed_values,
|compressed_inputs| {
compressed_inputs.par_iter().for_each(|input| {
let decompressed = input.get::<FheType>(0).unwrap().unwrap();
let squashed = decompressed.squash_noise().unwrap();
let mut builder =
CompressedSquashedNoiseCiphertextListBuilder::new();
builder.push(squashed);
let _ = builder.build();
})
},
criterion::BatchSize::SmallInput,
)
});
}
}
}
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"decomp_noise_squash_comp",
&OperatorType::Atomic,
64,
vec![],
);
}
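Stripped of the Criterion harness and throughput plumbing, the pipeline this benchmark measures is decompress, squash noise, then compress the squashed result; a minimal sketch reusing only the HLAPI calls visible above (imports as already present in this file, key setup elided, a server key assumed to be set):

// Minimal sketch of the measured pipeline, not part of the benchmark itself.
fn decomp_squash_comp(client_key: &ClientKey) {
    let input = FheUint64::encrypt(42u128, client_key);
    let mut builder = CompressedCiphertextListBuilder::new();
    builder.push(input);
    let compressed = builder.build().unwrap();
    // The portion that the benchmark times:
    let decompressed = compressed.get::<FheUint64>(0).unwrap().unwrap();
    let squashed = decompressed.squash_noise().unwrap();
    let mut sns_builder = CompressedSquashedNoiseCiphertextListBuilder::new();
    sns_builder.push(squashed);
    let _ = sns_builder.build();
}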
macro_rules! bench_sns_only_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
}
}
};
}
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
macro_rules! bench_decomp_sns_comp_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
}
}
};
}
bench_sns_only_type!(FheUint2);
bench_sns_only_type!(FheUint4);
bench_sns_only_type!(FheUint6);
bench_sns_only_type!(FheUint8);
bench_sns_only_type!(FheUint10);
bench_sns_only_type!(FheUint12);
bench_sns_only_type!(FheUint14);
bench_sns_only_type!(FheUint16);
bench_sns_only_type!(FheUint32);
bench_sns_only_type!(FheUint64);
bench_sns_only_type!(FheUint128);
bench_decomp_sns_comp_type!(FheUint64);
fn main() {
#[cfg(feature = "hpu")]
@@ -172,6 +338,10 @@ fn main() {
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
@@ -190,6 +360,12 @@ fn main() {
.enable_noise_squashing(
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
@@ -200,17 +376,19 @@ fn main() {
let mut c = Criterion::default().configure_from_args();
bench_fhe_uint2(&mut c, &cks);
bench_fhe_uint4(&mut c, &cks);
bench_fhe_uint6(&mut c, &cks);
bench_fhe_uint8(&mut c, &cks);
bench_fhe_uint10(&mut c, &cks);
bench_fhe_uint12(&mut c, &cks);
bench_fhe_uint14(&mut c, &cks);
bench_fhe_uint16(&mut c, &cks);
bench_fhe_uint32(&mut c, &cks);
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
bench_sns_only_fhe_uint2(&mut c, &cks);
bench_sns_only_fhe_uint4(&mut c, &cks);
bench_sns_only_fhe_uint6(&mut c, &cks);
bench_sns_only_fhe_uint8(&mut c, &cks);
bench_sns_only_fhe_uint10(&mut c, &cks);
bench_sns_only_fhe_uint12(&mut c, &cks);
bench_sns_only_fhe_uint14(&mut c, &cks);
bench_sns_only_fhe_uint16(&mut c, &cks);
bench_sns_only_fhe_uint32(&mut c, &cks);
bench_sns_only_fhe_uint64(&mut c, &cks);
bench_sns_only_fhe_uint128(&mut c, &cks);
bench_decomp_sns_comp_fhe_uint64(&mut c, &cks);
c.final_summary();
}

View File

@@ -677,6 +677,114 @@ fn if_then_else_parallelized(c: &mut Criterion) {
bench_group.finish()
}
fn flip_parallelized(c: &mut Criterion) {
let bench_name = "integer::flip_parallelized";
let display_name = "flip";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(60));
let mut rng = rand::thread_rng();
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id;
match get_bench_type() {
BenchmarkType::Latency => {
let bench_data = LazyCell::new(|| {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let clear_0 = gen_random_u256(&mut rng);
let clear_1 = gen_random_u256(&mut rng);
let clear_cond = rng.gen_bool(0.5);
let true_ct = cks.encrypt_radix(clear_0, num_block);
let false_ct = cks.encrypt_radix(clear_1, num_block);
let condition = cks.encrypt_bool(clear_cond);
(sks, condition, true_ct, false_ct)
});
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (sks, condition, true_ct, false_ct) =
(&bench_data.0, &bench_data.1, &bench_data.2, &bench_data.3);
b.iter(|| sks.flip_parallelized(condition, true_ct, false_ct))
});
}
BenchmarkType::Throughput => {
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
// Execute the operation once to know its cost.
let clear_0 = gen_random_u256(&mut rng);
let true_ct = cks.encrypt_radix(clear_0, num_block);
let clear_1 = gen_random_u256(&mut rng);
let false_ct = cks.encrypt_radix(clear_1, num_block);
let condition = sks.create_trivial_boolean_block(rng.gen_bool(0.5));
reset_pbs_count();
sks.flip_parallelized(&condition, &true_ct, &false_ct);
let pbs_count = max(get_pbs_count(), 1); // Operation might not perform any PBS, so we take 1 as default
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
let elements = throughput_num_threads(num_block, pbs_count);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let setup_encrypted_values = || {
let cts_cond = (0..elements)
.map(|_| sks.create_trivial_boolean_block(rng.gen_bool(0.5)))
.collect::<Vec<_>>();
let cts_then = (0..elements)
.map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block))
.collect::<Vec<_>>();
let cts_else = (0..elements)
.map(|_| cks.encrypt_radix(gen_random_u256(&mut rng), num_block))
.collect::<Vec<_>>();
(cts_cond, cts_then, cts_else)
};
b.iter_batched(
setup_encrypted_values,
|(cts_cond, cts_then, cts_else)| {
cts_cond
.par_iter()
.zip(cts_then.par_iter())
.zip(cts_else.par_iter())
.for_each(|((condition, true_ct), false_ct)| {
sks.flip_parallelized(condition, true_ct, false_ct);
})
},
criterion::BatchSize::SmallInput,
);
});
}
}
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
display_name,
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
fn ciphertexts_sum_parallelized(c: &mut Criterion) {
let bench_name = "integer::sum_ciphertexts_parallelized";
let display_name = "sum_ctxts";
@@ -3032,7 +3140,7 @@ mod hpu {
fn [< default_hpu_ $iop:lower >](c: &mut Criterion) {
bench_hpu_iop_clean_inputs(
c,
concat!("integer::hpu::scalar::", stringify!($iop)),
concat!("integer::hpu::scalar_", stringify!($iop)),
stringify!($name),
&hpu_asm::iop::[< IOP_ $iop:upper >],
)
@@ -3350,6 +3458,7 @@ criterion_group!(
gt_parallelized,
ge_parallelized,
if_then_else_parallelized,
flip_parallelized,
);
criterion_group!(
@@ -3365,6 +3474,7 @@ criterion_group!(
eq_parallelized,
gt_parallelized,
if_then_else_parallelized,
flip_parallelized
);
criterion_group!(
@@ -3629,6 +3739,9 @@ fn go_through_hpu_bench_groups(val: &str) {
hpu::default_hpu_bitwise();
hpu::default_hpu_cmp();
hpu::default_hpu_select();
hpu::default_hpu_shiftrot();
hpu::default_hpu_shiftrot_scalar();
hpu::default_hpu_bitcnt();
}
"fast_default" => {
hpu::default_hpu_ops();

File diff suppressed because it is too large

View File

@@ -145,7 +145,7 @@ pub mod shortint_params_aliases {
pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingCompressionParameters =
V1_3_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "hpu")]
// KS PBS Gaussian for Hpu

View File

@@ -3,7 +3,7 @@ use std::path::PathBuf;
use std::sync::OnceLock;
use std::{env, fs};
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::core_crypto::gpu::{get_number_of_gpus, get_number_of_sms};
use tfhe::core_crypto::prelude::*;
#[cfg(feature = "boolean")]
@@ -417,10 +417,6 @@ pub fn get_bench_type() -> &'static BenchmarkType {
BENCH_TYPE.get_or_init(|| BenchmarkType::from_env().unwrap())
}
/// Number of streaming multiprocessors (SM) available on Nvidia H100 GPU
#[cfg(feature = "gpu")]
const H100_PCIE_SM_COUNT: u32 = 114;
/// Generate a number of threads to use to saturate current machine for throughput measurements.
pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
let ref_block_count = 32; // Represent a ciphertext of 64 bits for 2_2 parameters set
@@ -431,11 +427,19 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
#[cfg(feature = "gpu")]
{
let total_num_sm = H100_PCIE_SM_COUNT * get_number_of_gpus();
let num_sms_per_gpu = get_number_of_sms();
let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
let total_blocks_per_sm = 4u32; // Assume each SM can handle 4 blocks concurrently
let total_num_sm = total_blocks_per_sm * total_num_sm;
let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
let elements_per_wave = total_num_sm as u64 / (num_block as u64);
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
elements.min(200) // This threshold is useful for operation with both a small number of
// block and low PBs count.
elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operations
// with both a small number of
// blocks and a low PBS count.
}
#[cfg(feature = "hpu")]
{

View File

@@ -14,6 +14,7 @@ rust-version = "1.72"
[dependencies]
aes = "0.8.2"
rayon = { workspace = true, optional = true }
getrandom = { workspace = true }
[target.'cfg(target_os = "macos")'.dependencies]
libc = "0.2.133"

View File

@@ -1,12 +1,11 @@
use crate::seeders::{Seed, Seeder};
use std::fs::File;
use std::io::Read;
/// A seeder which uses the `/dev/random` source on unix-like systems.
/// A seeder which uses the system entropy source on unix-like systems.
///
/// If available, this will use the `getrandom` or `getentropy` system call. Otherwise it will draw from
/// `/dev/urandom` after successfully polling `/dev/random`.
pub struct UnixSeeder {
counter: u128,
secret: u128,
file: File,
}
impl UnixSeeder {
@@ -15,35 +14,34 @@ impl UnixSeeder {
/// Important:
/// ----------
///
/// This secret is used to ensure the quality of the seed in scenarios where `/dev/random` may
/// be compromised.
/// This secret is used to ensure the quality of the seed in scenarios where the system random
/// source may be compromised.
///
/// The attack hypotheses are as follows:
/// - `/dev/random` output can be predicted by a process running on the machine by just
/// - The kernel random output can be predicted by a process running on the machine by just
/// observing various states of the machine
/// - The attacker cannot read data from the process where `tfhe-csprng` is running
///
/// Using a secret in `tfhe-csprng` allows to generate values that the attacker cannot
/// predict, making this seeder secure on systems were `/dev/random` outputs can be
/// predict, making this seeder secure on systems where the kernel random outputs can be
/// predicted.
pub fn new(secret: u128) -> UnixSeeder {
let file = std::fs::File::open("/dev/random").expect("Failed to open /dev/random .");
let counter = std::time::UNIX_EPOCH
.elapsed()
.expect("Failed to initialize unix seeder.")
.as_nanos();
UnixSeeder {
secret,
counter,
file,
}
UnixSeeder { secret }
}
}
impl Seeder for UnixSeeder {
/// Draws entropy from a system source to seed a CSPRNG.
///
/// It may block at system startup if the kernel entropy pool has not been initialized yet,
/// but should not block afterwards.
///
/// # Panics
/// This may panic if the `getrandom` system call is not available and no file descriptor is
/// available on the system.
fn seed(&mut self) -> Seed {
let output = self.secret ^ self.counter ^ dev_random(&mut self.file);
self.counter = self.counter.wrapping_add(1);
let output = self.secret ^ get_system_entropy();
Seed(output)
}
@@ -52,11 +50,14 @@ impl Seeder for UnixSeeder {
}
}
fn dev_random(random: &mut File) -> u128 {
fn get_system_entropy() -> u128 {
let mut buf = [0u8; 16];
random
.read_exact(&mut buf[..])
.expect("Failed to read from /dev/random .");
// This will use the getrandom syscall if possible (available since Linux 3.17). This syscall is
// not vulnerable to fd exhaustion since it directly pulls from kernel entropy sources.
//
// This syscall draws from the urandom entropy source but blocks at startup until it is correctly
// seeded. See <https://www.2uo.de/myths-about-urandom/> for a rationale about random vs urandom.
getrandom::getrandom(&mut buf).expect("Failed to read entropy from system");
u128::from_ne_bytes(buf)
}
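A minimal usage sketch of the reworked seeder (hedged: it assumes `UnixSeeder` and the `Seeder` trait are re-exported from `tfhe_csprng::seeders`; Unix-only, as before):

```rust
use tfhe_csprng::seeders::{Seeder, UnixSeeder};

fn main() {
    // Application-provided secret; it must stay unobservable to a local
    // attacker, since it is XORed into the kernel-provided entropy.
    let secret: u128 = 0x0123_4567_89ab_cdef_0123_4567_89ab_cdef;
    let mut seeder = UnixSeeder::new(secret);

    // Each call now fills 16 bytes through `getrandom` instead of reading
    // from an open `/dev/random` file descriptor.
    let _seed = seeder.seed();
}
```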

View File

@@ -37,7 +37,7 @@ serde_json = "1.0.96"
[target.'cfg(target_arch = "wasm32")'.dev-dependencies]
wasm-bindgen-test = "0.3"
wasm-bindgen = { workspace = true }
getrandom = { version = "0.2", features = ["js"] }
getrandom = { workspace = true, features = ["js"] }
[target.'cfg(all(not(target_os = "windows"), not(target_arch = "wasm32")))'.dev-dependencies]
rug = "1.19.1"

View File

@@ -216,24 +216,6 @@ fn stockham_core_1x2<c64xN: Pod>(
let oo = simd.mul(we, simd.add(a08m1a4c_pj_a2am1a6e, v8_a19m1a5d_pj_a3bm1a7f));
let pp = simd.mul(wf, simd.add(s08pjs4c_pv_s2apjs6e, hf_s19pjs5d_pv_s3bpjs7f));
let ab = simd.catlo(aa, bb);
y[0] = ab;
let cd = simd.catlo(cc, dd);
y[1] = cd;
let ef = simd.catlo(ee, ff);
y[2] = ef;
let gh = simd.catlo(gg, hh);
y[3] = gh;
let ab = simd.cathi(aa, bb);
y[4] = ab;
let cd = simd.cathi(cc, dd);
y[5] = cd;
let ef = simd.cathi(ee, ff);
y[6] = ef;
let gh = simd.cathi(gg, hh);
y[7] = gh;
let ab = simd.catlo(aa, bb);
y[0x0] = ab;
let cd = simd.catlo(cc, dd);

View File

@@ -1,6 +1,8 @@
use ark_ff::biginteger::arithmetic::widening_mul;
use rand::prelude::*;
use crate::proofs::ProofSanityCheckMode;
/// Avoid overflows for squares of u64
pub fn sqr(x: u64) -> u128 {
let x = x as u128;
@@ -188,21 +190,24 @@ impl Montgomery {
}
}
pub fn four_squares(v: u128) -> [u64; 4] {
pub fn four_squares(v: u128, sanity_check_mode: ProofSanityCheckMode) -> [u64; 4] {
let rng = &mut StdRng::seed_from_u64(0);
// In the extreme case where the noise is exactly at the bound, v is 0
if v == 0 {
return [0; 4];
}
// Handle limit cases that would trigger an infinite loop
match v {
0 => return [0; 4],
2 => return [1, 1, 0, 0],
6 => return [2, 1, 1, 0],
_ => {}
};
let f = v % 4;
if f == 2 {
let b = v.isqrt() as u64;
'main_loop: loop {
let x = 2 + rng.gen::<u64>() % (b - 2);
let y = 2 + rng.gen::<u64>() % (b - 2);
let x: u64 = rng.gen_range(0..=b);
let y: u64 = rng.gen_range(0..=b);
let (sum, o) = u128::overflowing_add(sqr(x), sqr(y));
if o || sum > v {
@@ -270,21 +275,80 @@ pub fn four_squares(v: u128) -> [u64; 4] {
return [x, y, z, w];
}
} else if f == 0 {
four_squares(v / 4).map(|x| x + x)
four_squares(v / 4, sanity_check_mode).map(|x| x + x)
} else {
let mut r = four_squares(2 * v);
r.sort_by_key(|&x| {
if x % 2 == 0 {
-1 - ((x / 2) as i64)
} else {
(x / 2) as i64
}
});
// v is odd, compute the four squares for 2*v and deduce the result for v
let double = match sanity_check_mode {
ProofSanityCheckMode::Panic => v.checked_mul(2).unwrap(),
#[cfg(test)]
ProofSanityCheckMode::Ignore => v.wrapping_mul(2),
};
let mut r = four_squares(double, sanity_check_mode);
// At this point we know that exactly 2 values of r are even and 2 are odd:
// r = [w, x, y, z]
// 2v = w² + x² + y² + z²
// We cannot have 4 even numbers because
// 2v = (2w')²+(2x')²+(2y')²+(2z')² = 4v', but v is odd
// We cannot have 4 odd numbers because
// 2v = (2w'+1)²+(2x'+1)²+(2y'+1)²+(2z'+1)²
// = (4w'²+4w'+1)+(4x'²+4x'+1)+(4y'²+4y'+1)+(4z'²+4z'+1) = 4v', same issue
//
// Since w² + x² + y² + z² is even we must have 2 of them odd and 2 of them even
// Sort so that r[0], r[1] are even and r[2], r[3] are odd,
// with r[1] > r[0] and r[3] > r[2]
r.sort_by_key(|&x| (x % 2 != 0, x));
[
(r[0] + r[1]) / 2,
(r[0] - r[1]) / 2,
(r[3] + r[2]) / 2,
// divide by 2 before addition to avoid overflows
(r[1] / 2 + r[0] / 2),
(r[1] - r[0]) / 2,
(r[3] / 2 + r[2] / 2) + 1,
(r[3] - r[2]) / 2,
]
}
}
#[cfg(test)]
mod test {
use rand::rngs::StdRng;
use rand::{thread_rng, Rng, SeedableRng};
use super::*;
fn assert_four_squares(value: u128) {
let squares = four_squares(value, ProofSanityCheckMode::Panic);
let res = squares.iter().map(|x| sqr(*x)).sum();
assert_eq!(value, res);
}
#[test]
fn test_four_squares() {
const RAND_TESTS_COUNT: usize = 1000;
let seed = thread_rng().gen();
println!("four_squares seed: {seed:x}");
let rng = &mut StdRng::seed_from_u64(seed);
for val in 0..256 {
assert_four_squares(val);
}
// If v % 4 = 1 or 3, v will be multiplied by 2
assert_four_squares(u128::MAX / 2);
assert_four_squares(u128::MAX / 2 - 1);
assert_four_squares(u128::MAX / 2 - 2);
assert_four_squares(u128::MAX / 2 - 3);
for i in 8..127 {
assert_four_squares((1u128 << i) + 1);
}
for _ in 0..RAND_TESTS_COUNT {
let v: u128 = rng.gen_range(0..(u128::MAX / 2));
assert_four_squares(v);
}
}
}
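For reference, the recombination in the odd-`v` branch follows a classical identity (a quick sanity check, using the same naming as the comment above): if `2v = w² + x² + y² + z²` with `w ≥ x` the two even entries of `r` and `y ≥ z` the two odd ones, then

$$
v = \left(\frac{w+x}{2}\right)^2 + \left(\frac{w-x}{2}\right)^2 + \left(\frac{y+z}{2}\right)^2 + \left(\frac{y-z}{2}\right)^2,
$$

since the right-hand side expands to $$\frac{w^2+x^2}{2} + \frac{y^2+z^2}{2} = \frac{2v}{2} = v$$, and every term is an integer because `w, x` share the same parity and so do `y, z`.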

View File

@@ -144,7 +144,7 @@ impl<G: Curve> GroupElements<G> {
/// Allows computing proofs with bad inputs for tests
#[derive(Copy, Clone, PartialEq, Eq)]
enum ProofSanityCheckMode {
pub(crate) enum ProofSanityCheckMode {
Panic,
#[cfg(test)]
Ignore,

View File

@@ -928,7 +928,7 @@ fn prove_impl<G: Curve>(
)
.collect::<Box<[_]>>();
let v = four_squares(B_squared - e_sqr_norm).map(|v| v as i64);
let v = four_squares(B_squared - e_sqr_norm, sanity_check_mode).map(|v| v as i64);
let e1_zp = &*e1
.iter()

View File

@@ -86,7 +86,7 @@ wasm-bindgen-rayon = { version = "1.3.0", optional = true }
js-sys = { version = "0.3", optional = true }
console_error_panic_hook = { version = "0.1.7", optional = true }
serde-wasm-bindgen = { version = "0.6.0", optional = true }
getrandom = { version = "0.2.8", optional = true }
getrandom = { workspace = true, optional = true }
bytemuck = { workspace = true }
tfhe-hpu-backend = { version = "0.2", path = "../backends/tfhe-hpu-backend", optional = true }

View File

@@ -2,7 +2,7 @@
This document provides basic instructions to configure the Rust toolchain and features for **TFHE-rs.**
**TFHE-rs** requires a nightly Rust toolchain to build the C API and utilize advanced SIMD instructions. However, for other uses, a stable toolchain (version 1.81 or later) is sufficient.
**TFHE-rs** requires a nightly Rust toolchain to build the C API and utilize advanced SIMD instructions. However, for other uses, a stable toolchain (version 1.84 or later) is sufficient.
Follow these instructions to install the necessary Rust toolchain:

View File

@@ -73,15 +73,6 @@ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
}
```
Performance can be improved by setting `lto="fat"` in `Cargo.toml`
```toml
[profile.release]
lto = "fat"
```
and by building the code for the native CPU architecture and in release mode, e.g. by calling `RUSTFLAGS="-C target-cpu=native" cargo run --release`.
{% hint style="info" %}
You can choose a more costly proof with `ZkComputeLoad::Proof`, which has a faster verification time. Alternatively, you can select `ZkComputeLoad::Verify` for a faster proof and slower verification.
{% endhint %}

View File

@@ -7,7 +7,6 @@ All parameter sets provide at least 128-bits of security according to the [Latti
## Default parameters
Currently, the default parameters use blocks that contain 2 bits of message and 2 bits of carry - a tweaked uniform (TUniform, defined [here](../../getting-started/security-and-cryptography.md#noise)) noise distribution, and have a bootstrapping failure probability $$p_{error} \le 2^{-128}$$.
These are particularly suitable for applications that need to be secure in the IND-CPA^D model (see [here](../../getting-started/security-and-cryptography.md#security) for more details).
The GPU backend still uses an error probability smaller than $$2^{-64}$$ by default. Those will be updated soon.
When using the high-level API of **TFHE-rs**, you can create a key pair using the default recommended set of parameters. For example:
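A minimal sketch of such a key-pair generation with the default configuration, using the `ConfigBuilder` and `generate_keys` entry points of the high-level API:

```rust
use tfhe::{generate_keys, ConfigBuilder};

fn main() {
    // Default configuration: the recommended parameter set described above.
    let config = ConfigBuilder::default().build();

    // The client key encrypts/decrypts; the server key runs homomorphic operations.
    let (_client_key, _server_key) = generate_keys(config);
}
```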

View File

@@ -11,7 +11,7 @@ tfhe = { version = "~1.3.0", features = ["boolean", "shortint", "integer"] }
```
{% hint style="info" %}
**Rust version**: a minimum Rust version of 1.81 is required to compile **TFHE-rs**.
**Rust version**: a minimum Rust version of 1.84 is required to compile **TFHE-rs**.
{% endhint %}
{% hint style="success" %}

View File

@@ -85,7 +85,8 @@ Now that the project has **TFHE-rs** as a dependency here are the detailed steps
5. Server-side: [compute over encrypted data](../fhe-computation/compute/)
6. Client-side: [decrypt data](../fhe-computation/compute/decrypt-data.md)
This example demonstrates the basic workflow combining the client and server parts:
This example demonstrates the basic workflow combining the client and server parts.
Edit the `main.rs` file that was generated in the previous step and add this content:
```rust
use tfhe::{ConfigBuilder, generate_keys, set_server_key, FheUint8};
@@ -116,4 +117,21 @@ fn main() {
}
```
Now, you are ready to compute your first homomorphic operations! To run the project, simply invoke the following command:
```console
$ cargo run --release
```
{% hint style="success" %}
**Performance**: for optimal performance, it is highly recommended to run code that uses **`TFHE-rs`** in release mode with cargo's `--release` flag.
{% endhint %}
You can learn more about homomorphic types and associated compilation features in the [configuration documentation.](../configuration/rust-configuration.md)
## Performance tips
Performance can be further improved by setting `lto="fat"` in `Cargo.toml`:
```toml
[profile.release]
lto = "fat"
```
If your application does not need to be portable, you can also build the code for the native CPU architecture, e.g. by calling `RUSTFLAGS="-C target-cpu=native" cargo run --release`.
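For instance, the full command line then looks like this:
```console
$ RUSTFLAGS="-C target-cpu=native" cargo run --release
```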

View File

@@ -97,7 +97,7 @@ For example, when adding two ciphertexts, the sum could exceed the range of eith
By default, the cryptographic parameters provided by **TFHE-rs** ensure at least 128 bits of security. The security has been evaluated using the latest versions of the Lattice Estimator ([repository](https://github.com/malb/lattice-estimator)) with `red_cost_model = reduction.RC.BDGL16`.
For the High-Level API the default parameters are selected with a bootstrapping failure probability (or error probability) fixed at $$p_{error} \le 2^{-128}$$ for the x86 CPU backend, and $$p_{error} \le 2^{-64}$$ for the GPU backend.
For the High-Level API the default parameters are selected with a bootstrapping failure probability (or error probability) fixed at $$p_{error} \le 2^{-128}$$ for all backends (x86 CPU, GPU and HPU).
A failure probability below $$2^{-128}$$ ensures that our implementation is resilient against attacks in the IND-CPA-D model [1]. In the case where only the IND-CPA model is considered, it is possible to choose parameters with $$p_{error} \le 2^{-64}$$; see the dedicated [Parameters section](../fhe-computation/compute/parameters.md).
\[1][ Li, Baiyu, et al. "Securing approximate homomorphic encryption using differential privacy." Annual International Cryptology Conference. Cham: Springer Nature Switzerland, 2022.](https://eprint.iacr.org/2022/816.pdf)

View File

@@ -1,6 +1,6 @@
# Cryptographic Parameters
All parameter sets provide at least 128-bits of security according to the [Lattice-Estimator](https://github.com/malb/lattice-estimator), with an error probability equal to $$2^{-64}$$ when using programmable bootstrapping. This error probability is due to the randomness added at each encryption (see [here](../../../getting-started/security-and-cryptography.md) for more details about the encryption process).
All parameter sets provide at least 128-bits of security according to the [Lattice-Estimator](https://github.com/malb/lattice-estimator). Default parameters have an error probability equal to $$2^{-128}$$ when using programmable bootstrapping. This error probability is due to the randomness added at each encryption (see [here](../../../getting-started/security-and-cryptography.md) for more details about the encryption process).
## Parameters and message precision

View File

@@ -728,8 +728,11 @@ pub fn encrypt_glwe_ciphertext_list<
Gen: ByteRandomGenerator,
{
assert!(
output_glwe_ciphertext_list.polynomial_size().0
* output_glwe_ciphertext_list.glwe_ciphertext_count().0
output_glwe_ciphertext_list
.polynomial_size()
.0
.checked_mul(output_glwe_ciphertext_list.glwe_ciphertext_count().0)
.unwrap()
== input_plaintext_list.plaintext_count().0,
"Mismatch between required number of plaintexts: {} ({:?} * {:?}) and input \
PlaintextCount: {:?}",
@@ -1349,7 +1352,11 @@ pub fn encrypt_seeded_glwe_ciphertext_list_with_pre_seeded_generator<
output.glwe_size().to_glwe_dimension(),
);
assert!(
output.glwe_ciphertext_count().0 * output.polynomial_size().0
output
.glwe_ciphertext_count()
.0
.checked_mul(output.polynomial_size().0)
.unwrap()
== encoded.plaintext_count().0,
"Mismatch between number of output ciphertexts and input plaintexts. \
Got {:?} plaintexts while {:?} plaintexts are required to encrypt {:?} ciphertexts.",

View File

@@ -7,7 +7,7 @@ use crate::core_crypto::commons::computation_buffers::ComputationBuffers;
use crate::core_crypto::commons::math::ntt::ntt64::{Ntt64, Ntt64View};
use crate::core_crypto::commons::parameters::{GlweSize, MonomialDegree, PolynomialSize};
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{lwe_ciphertext_modulus_switch, ModulusSwitchedLweCiphertext};
use aligned_vec::CACHELINE_ALIGN;
@@ -203,6 +203,9 @@ pub fn blind_rotate_ntt64_bnf_assign<OutputCont, KeyCont>(
/// a properly configured [`Ntt64View`] object and a `PodStack` used as a memory buffer having a
/// capacity at least as large as the result of
/// [`blind_rotate_ntt64_bnf_assign_mem_optimized_requirement`].
///
/// # Panics
/// This will panic if the input mask length does not match the size of the bsk
pub fn blind_rotate_ntt64_bnf_assign_mem_optimized<OutputCont, KeyCont>(
msed_input: &impl ModulusSwitchedLweCiphertext<usize>,
lut: &mut GlweCiphertext<OutputCont>,
@@ -234,7 +237,8 @@ pub fn blind_rotate_ntt64_bnf_assign_mem_optimized<OutputCont, KeyCont>(
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(msed_lwe_mask, bsk.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in izip_eq!(msed_lwe_mask, bsk.into_ggsw_iter())
{
if lwe_mask_element != 0 {
// We copy ct_0 to ct_1
let (ct1, stack) =
@@ -623,7 +627,7 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -657,7 +661,7 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(poly_size)
@@ -675,6 +679,9 @@ pub(crate) fn add_external_product_ntt64_bnf_assign<InputGlweCont>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub(crate) fn cmux_ntt64_bnf_assign(
ct0: GlweCiphertextMutView<'_, u64>,
ct1: GlweCiphertextMutView<'_, u64>,
@@ -689,7 +696,7 @@ pub(crate) fn cmux_ntt64_bnf_assign(
ntt: Ntt64View<'_>,
stack: &mut PodStack,
) {
izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
izip_eq!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub(*c0);
});
add_external_product_ntt64_bnf_assign(ct0, ggsw, &ct1, ntt, stack);
@@ -711,7 +718,7 @@ pub(crate) fn update_with_fmadd_ntt64_bnf(
output_fft_buffer.fill(0);
}
izip!(
izip_eq!(
output_fft_buffer.into_chunks(poly_size),
lhs_polynomial_list.into_chunks(poly_size)
)

View File

@@ -15,7 +15,7 @@ use crate::core_crypto::commons::math::decomposition::{
use crate::core_crypto::commons::math::ntt::ntt64::{Ntt64, Ntt64View};
use crate::core_crypto::commons::parameters::{GlweSize, MonomialDegree, PolynomialSize};
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::CACHELINE_ALIGN;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -208,6 +208,9 @@ pub fn blind_rotate_ntt64_assign<InputCont, OutputCont, KeyCont>(
/// a properly configured [`Ntt64View`] object and a `PodStack` used as a memory buffer having a
/// capacity at least as large as the result of
/// [`blind_rotate_ntt64_assign_mem_optimized_requirement`].
///
/// # Panics
/// This will panic if the input mask length does not match the size of the bsk
pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
input: &LweCiphertext<InputCont>,
lut: &mut GlweCiphertext<OutputCont>,
@@ -250,7 +253,9 @@ pub fn blind_rotate_ntt64_assign_mem_optimized<InputCont, OutputCont, KeyCont>(
// We initialize the ct_0 used for the successive cmuxes
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(lwe_mask.iter(), bsk.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in
izip_eq!(lwe_mask.iter(), bsk.into_ggsw_iter())
{
if *lwe_mask_element != 0u64 {
let stack = &mut *stack;
// We copy ct_0 to ct_1
@@ -615,7 +620,7 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -647,7 +652,7 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(poly_size)
@@ -660,6 +665,9 @@ pub(crate) fn add_external_product_ntt64_assign<InputGlweCont>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub(crate) fn cmux_ntt64_assign(
ct0: GlweCiphertextMutView<'_, u64>,
mut ct1: GlweCiphertextMutView<'_, u64>,
@@ -667,7 +675,7 @@ pub(crate) fn cmux_ntt64_assign(
ntt: Ntt64View<'_>,
stack: &mut PodStack,
) {
izip!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
izip_eq!(ct1.as_mut(), ct0.as_ref(),).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub_custom_mod(*c0, ntt.custom_modulus());
});
add_external_product_ntt64_assign(ct0, ggsw, &ct1, ntt, stack);
@@ -686,7 +694,7 @@ pub(crate) fn update_with_fmadd_ntt64(
output_fft_buffer.fill(0);
}
izip!(
izip_eq!(
output_fft_buffer.into_chunks(poly_size),
lhs_polynomial_list.into_chunks(poly_size)
)

View File

@@ -5,7 +5,7 @@ pub trait ModulusSwitchedLweCiphertext<Scalar> {
fn log_modulus(&self) -> CiphertextModulusLog;
fn lwe_dimension(&self) -> LweDimension;
fn body(&self) -> Scalar;
fn mask(&self) -> impl Iterator<Item = Scalar> + '_;
fn mask(&self) -> impl ExactSizeIterator<Item = Scalar> + '_;
}
pub fn lwe_ciphertext_modulus_switch<Scalar, SwitchedScalar, Cont>(

View File

@@ -1,5 +1,5 @@
use crate::core_crypto::commons::ciphertext_modulus::CiphertextModulusKind;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::prelude::*;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
@@ -123,7 +123,7 @@ impl Ntt64View<'_> {
pulp::Arch::new().dispatch(
#[inline(always)]
|| {
for (out, inp) in izip!(standard, &*ntt) {
for (out, inp) in izip_eq!(standard, &*ntt) {
*out = u64::wrapping_add_custom_mod(*out, *inp, self.custom_modulus());
}
},
@@ -258,7 +258,7 @@ impl Ntt64View<'_> {
pulp::Arch::new().dispatch(
#[inline(always)]
|| {
for (out, inp) in izip!(standard, &*ntt) {
for (out, inp) in izip_eq!(standard, &*ntt) {
*out = u64::wrapping_add(*out, *inp);
}
},

View File

@@ -141,7 +141,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
// mid here is the number of ref_elements, we need to multiply by the size of a single
// element to know where to split the underlying container
let mid = mid * self.get_entity_view_pod_size();
let mid = mid.checked_mul(self.get_entity_view_pod_size()).unwrap();
let self_meta = self.get_self_view_creation_metadata();
let (container_left, container_right) = self.as_ref().split_at(mid);
@@ -156,7 +156,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
// index here is the number of ref_elements, we need to multiply by the size of a single
// element to know where to reference the underlying container
let start = index * self.get_entity_view_pod_size();
let start = index.checked_mul(self.get_entity_view_pod_size()).unwrap();
let stop = start + self.get_entity_view_pod_size();
let meta = self.get_entity_view_creation_metadata();
@@ -178,8 +178,12 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
Bound::Unbounded => self.entity_count(),
};
let start_index = entity_start_index * self.get_entity_view_pod_size();
let stop_index = entity_stop_index * self.get_entity_view_pod_size();
let start_index = entity_start_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let stop_index = entity_stop_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let self_meta = self.get_self_view_creation_metadata();
@@ -214,7 +218,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -235,7 +239,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -273,7 +277,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -299,7 +303,7 @@ pub trait ContiguousEntityContainer: AsRef<[Self::Element]> {
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_ref()
@@ -386,8 +390,12 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
Bound::Unbounded => self.entity_count(),
};
let start_index = entity_start_index * self.get_entity_view_pod_size();
let stop_index = entity_stop_index * self.get_entity_view_pod_size();
let start_index = entity_start_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let stop_index = entity_stop_index
.checked_mul(self.get_entity_view_pod_size())
.unwrap();
let self_meta = self.get_self_view_creation_metadata();
@@ -412,7 +420,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -434,7 +442,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -487,7 +495,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
let entity_count = self.entity_count();
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()
@@ -513,7 +521,7 @@ pub trait ContiguousEntityContainerMut: ContiguousEntityContainer + AsMut<[Self:
);
let entity_view_pod_size = self.get_entity_view_pod_size();
let pod_chunk_size = entity_view_pod_size * chunk_size;
let pod_chunk_size = entity_view_pod_size.checked_mul(chunk_size).unwrap();
let meta = self.get_self_view_creation_metadata();
self.as_mut()

View File

@@ -1,32 +1,30 @@
//! Utilities for the library.
#[track_caller]
#[inline]
fn assert_same_len(a: (usize, Option<usize>), b: (usize, Option<usize>)) {
debug_assert_eq!(a.1, Some(a.0));
debug_assert_eq!(b.1, Some(b.0));
debug_assert_eq!(a.0, b.0);
}
/// Return a Zip iterator, but checks that the two components have the same length.
pub trait ZipChecked: IntoIterator + Sized {
pub trait ZipChecked: IntoIterator + Sized
where
<Self as IntoIterator>::IntoIter: ExactSizeIterator,
{
#[track_caller]
#[inline]
fn zip_checked<B: IntoIterator>(
self,
b: B,
) -> core::iter::Zip<<Self as IntoIterator>::IntoIter, <B as IntoIterator>::IntoIter> {
) -> core::iter::Zip<<Self as IntoIterator>::IntoIter, <B as IntoIterator>::IntoIter>
where
<B as IntoIterator>::IntoIter: ExactSizeIterator,
{
let a = self.into_iter();
let b = b.into_iter();
assert_same_len(a.size_hint(), b.size_hint());
assert_eq!(a.len(), b.len());
core::iter::zip(a, b)
}
}
impl<A: IntoIterator> ZipChecked for A {}
impl<A: IntoIterator> ZipChecked for A where <A as IntoIterator>::IntoIter: ExactSizeIterator {}
// https://docs.rs/itertools/0.7.8/src/itertools/lib.rs.html#247-269
macro_rules! izip {
macro_rules! izip_eq {
(@ __closure @ ($a:expr)) => { |a| (a,) };
(@ __closure @ ($a:expr, $b:expr)) => { |(a, b)| (a, b) };
(@ __closure @ ($a:expr, $b:expr, $c:expr)) => { |((a, b), c)| (a, b, c) };
@@ -54,11 +52,11 @@ macro_rules! izip {
{
#[allow(unused_imports)]
use $crate::core_crypto::commons::utils::ZipChecked;
::core::iter::IntoIterator::into_iter($first)
$first
$(.zip_checked($rest))*
.map($crate::core_crypto::commons::utils::izip!(@ __closure @ ($first, $($rest),*)))
.map($crate::core_crypto::commons::utils::izip_eq!(@ __closure @ ($first, $($rest),*)))
}
};
}
pub(crate) use izip;
pub(crate) use izip_eq;
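To illustrate the behaviour this rework enforces (a standalone sketch, not the library macro itself): zipping now requires `ExactSizeIterator` on both sides and asserts that the lengths are equal in all build profiles, instead of relying on `size_hint` checks that only ran in debug builds.

```rust
// Hypothetical free-function analogue of the length-checked zip that
// `izip_eq!` builds on (the real code is a macro over `zip_checked`).
fn zip_eq<A, B>(a: A, b: B) -> impl Iterator<Item = (A::Item, B::Item)>
where
    A: IntoIterator,
    B: IntoIterator,
    A::IntoIter: ExactSizeIterator,
    B::IntoIter: ExactSizeIterator,
{
    let a = a.into_iter();
    let b = b.into_iter();
    // Always checked, even in release builds.
    assert_eq!(a.len(), b.len(), "zipped iterators have different lengths");
    a.zip(b)
}

fn main() {
    let sum: u64 = zip_eq([1u64, 2, 3], [10u64, 20, 30])
        .map(|(x, y)| x + y)
        .sum();
    assert_eq!(sum, 66);
    // `zip_eq([1u64, 2], [1u64, 2, 3])` would panic on the length assertion.
}
```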

View File

@@ -255,9 +255,6 @@ pub fn glwe_ciphertext_encryption_noise_sample_count(
/// A [`GLWE ciphertext`](`GlweCiphertext`).
///
/// **Remark:** GLWE ciphertexts generalize LWE ciphertexts by definition, however in this library,
/// GLWE ciphertext entities do not generalize LWE ciphertexts, i.e., polynomial size cannot be 1.
///
/// # Formal Definition
///
/// ## GLWE Ciphertext

View File

@@ -6,7 +6,7 @@ use crate::core_crypto::commons::parameters::{
PolynomialSize,
};
use crate::core_crypto::commons::traits::{Container, Split};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::fft_impl::fft128::crypto::ggsw::Fourier128GgswCiphertext;
use crate::core_crypto::prelude::MultiBitBootstrapKeyConformanceParams;
@@ -77,7 +77,7 @@ impl<C: Container<Element = f64>> Fourier128LweMultiBitBootstrapKey<C> {
let ggsw_count =
multi_bit_lwe_dim.0 * self.grouping_factor().ggsw_per_multi_bit_element().0;
izip!(
izip_eq!(
self.data_re0.split_into(ggsw_count),
self.data_re1.split_into(ggsw_count),
self.data_im0.split_into(ggsw_count),

View File

@@ -52,7 +52,7 @@ impl<Scalar: Copy> ModulusSwitchedLweCiphertext<Scalar>
*self.container.last().unwrap()
}
fn mask(&self) -> impl Iterator<Item = Scalar> + '_ {
fn mask(&self) -> impl ExactSizeIterator<Item = Scalar> + '_ {
let (_body, mask) = self.container.split_last().unwrap();
mask.iter().copied()
@@ -138,7 +138,7 @@ where
.cast_into()
}
fn mask(&self) -> impl Iterator<Item = SwitchedScalar> {
fn mask(&self) -> impl ExactSizeIterator<Item = SwitchedScalar> {
self.lwe_in
.as_ref()
.split_last()

View File

@@ -288,7 +288,9 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttGgswLevelMatrix
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = NttGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = NttGgswLevelRow<C>> + ExactSizeIterator<Item = NttGgswLevelRow<C>>
where
C: Split,
{

View File

@@ -207,7 +207,10 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttGgswCiphertextL
/// consider calling [`NttGgswCiphertextList::as_view`] or
/// [`NttGgswCiphertextList::as_mut_view`] first to have an iterator over borrowed contents
/// instead of consuming the original entity.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
+ ExactSizeIterator<Item = NttGgswCiphertext<C>>
where
C: Split,
{

View File

@@ -130,7 +130,10 @@ impl<Scalar: UnsignedInteger, C: Container<Element = Scalar>> NttLweBootstrapKey
/// consider calling [`NttLweBootstrapKey::as_view`] or
/// [`NttLweBootstrapKey::as_mut_view`] first to have an iterator over borrowed contents
/// instead of consuming the original entity.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = NttGgswCiphertext<C>>
+ ExactSizeIterator<Item = NttGgswCiphertext<C>>
where
C: Split,
{

View File

@@ -3,7 +3,7 @@
use crate::core_crypto::commons::math::decomposition::SignedDecomposer;
use crate::core_crypto::commons::parameters::*;
use crate::core_crypto::commons::traits::*;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::experimental::entities::fourier_pseudo_ggsw_ciphertext::{
PseudoFourierGgswCiphertext, PseudoFourierGgswCiphertextView,
@@ -237,7 +237,7 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.get_mask().as_polynomial_list().iter()
)
@@ -276,7 +276,7 @@ pub fn glwe_fast_keyswitch<Scalar, OutputGlweCont, InputGlweCont, GgswCont>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
izip!(
izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(fourier_poly_size)

View File

@@ -5,7 +5,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::experimental::entities::PseudoGgswCiphertext;
use crate::core_crypto::fft_impl::fft64::math::decomposition::DecompositionLevel;
use crate::core_crypto::fft_impl::fft64::math::fft::{FftView, FourierPolynomialList};
@@ -159,7 +159,10 @@ impl<C: Container<Element = c64>> PseudoFourierGgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = PseudoFourierGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = PseudoFourierGgswLevelRow<C>>
+ ExactSizeIterator<Item = PseudoFourierGgswLevelRow<C>>
where
C: Split,
{
@@ -278,7 +281,7 @@ impl PseudoFourierGgswCiphertextMutView<'_> {
debug_assert_eq!(coef_ggsw.polynomial_size(), self.polynomial_size());
let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
for (fourier_poly, coef_poly) in izip!(
for (fourier_poly, coef_poly) in izip_eq!(
self.data().into_chunks(fourier_poly_size),
coef_ggsw.as_polynomial_list().iter()
) {

View File

@@ -14,7 +14,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::fourier_ggsw_ciphertext_size;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
@@ -78,11 +78,14 @@ impl<C: Container<Element = f64>> Fourier128LweBootstrapKey<C> {
}
/// Return an iterator over the GGSW ciphertexts composing the key.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = Fourier128GgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = Fourier128GgswCiphertext<C>>
+ ExactSizeIterator<Item = Fourier128GgswCiphertext<C>>
where
C: Split,
{
izip!(
izip_eq!(
self.data_re0.split_into(self.input_lwe_dimension.0),
self.data_re1.split_into(self.input_lwe_dimension.0),
self.data_im0.split_into(self.input_lwe_dimension.0),
@@ -216,7 +219,9 @@ where
coef_bsk: LweBootstrapKey<&[Scalar]>,
fft: Fft128View<'_>,
) {
for (mut fourier_ggsw, standard_ggsw) in izip!(this.into_ggsw_iter(), coef_bsk.iter()) {
for (mut fourier_ggsw, standard_ggsw) in
izip_eq!(this.into_ggsw_iter(), coef_bsk.iter())
{
fourier_ggsw.fill_with_forward_fourier(&standard_ggsw, fft);
}
}
@@ -287,7 +292,7 @@ where
let mut ct0 = lut;
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(msed_lwe_mask, this.into_ggsw_iter())
izip_eq!(msed_lwe_mask, this.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let stack = &mut *stack;

View File

@@ -8,7 +8,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::{
fourier_ggsw_ciphertext_size, fourier_ggsw_level_matrix_size, GgswCiphertext,
};
@@ -113,6 +113,9 @@ impl<C: Container<Element = f64>> Fourier128GgswCiphertext<C> {
decomposition_level_count,
);
assert_eq!(data_re0.container_len(), container_len);
assert_eq!(data_re1.container_len(), container_len);
assert_eq!(data_im0.container_len(), container_len);
assert_eq!(data_im1.container_len(), container_len);
Self {
data_re0,
@@ -184,7 +187,7 @@ impl<C: Container<Element = f64>> Fourier128GgswCiphertext<C> {
C: Split,
{
let decomposition_level_count = self.decomposition_level_count.0;
izip!(
izip_eq!(
self.data_re0.split_into(decomposition_level_count),
self.data_re1.split_into(decomposition_level_count),
self.data_im0.split_into(decomposition_level_count),
@@ -235,12 +238,15 @@ impl<C: Container<Element = f64>> Fourier128GgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = Fourier128GgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = Fourier128GgswLevelRow<C>>
+ ExactSizeIterator<Item = Fourier128GgswLevelRow<C>>
where
C: Split,
{
let row_count = self.row_count();
izip!(
izip_eq!(
self.data_re0.split_into(row_count),
self.data_re1.split_into(row_count),
self.data_im0.split_into(row_count),
@@ -349,7 +355,7 @@ where
let (data_re0, data_re1, data_im0, data_im1) = this.data();
for (fourier_re0, fourier_re1, fourier_im0, fourier_im1, coef_poly) in izip!(
for (fourier_re0, fourier_re1, fourier_im0, fourier_im1, coef_poly) in izip_eq!(
data_re0.into_chunks(poly_size),
data_re1.into_chunks(poly_size),
data_im0.into_chunks(poly_size),
@@ -485,7 +491,7 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
//
// t = 1 t = 2 ...
for (ggsw_row, glwe_poly) in izip!(
for (ggsw_row, glwe_poly) in izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
) {
@@ -531,7 +537,7 @@ pub fn add_external_product_assign<Scalar, ContOut, ContGgsw, ContGlwe>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
for (mut out, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer_re0.into_chunks(fourier_poly_size),
output_fft_buffer_re1.into_chunks(fourier_poly_size),
@@ -604,7 +610,7 @@ fn update_with_fmadd_scalar(
rhs_re1,
rhs_im0,
rhs_im1,
) in izip!(
) in izip_eq!(
output_fourier_re0,
output_fourier_re1,
output_fourier_im0,
@@ -647,7 +653,7 @@ fn update_with_fmadd_scalar(
rhs_re1,
rhs_im0,
rhs_im1,
) in izip!(
) in izip_eq!(
output_fourier_re0,
output_fourier_re1,
output_fourier_im0,
@@ -703,7 +709,7 @@ pub fn update_with_fmadd(
ggsw_poly_re1,
ggsw_poly_im0,
ggsw_poly_im1,
) in izip!(
) in izip_eq!(
output_fft_buffer_re0.into_chunks(fourier_poly_size),
output_fft_buffer_re1.into_chunks(fourier_poly_size),
output_fft_buffer_im0.into_chunks(fourier_poly_size),
@@ -797,6 +803,9 @@ pub fn cmux_scratch<Scalar>(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0 and ct1 are not of the same size
pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
ct0: &mut GlweCiphertext<ContCt0>,
ct1: &mut GlweCiphertext<ContCt1>,
@@ -816,7 +825,7 @@ pub fn cmux<Scalar, ContCt0, ContCt1, ContGgsw>(
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
for (c1, c0) in izip!(ct1.as_mut(), ct0.as_ref()) {
for (c1, c0) in izip_eq!(ct1.as_mut(), ct0.as_ref()) {
*c1 = c1.wrapping_sub(*c0);
}
add_external_product_assign(&mut ct0, &ggsw, &ct1, fft, stack);

View File

@@ -1,7 +1,7 @@
use crate::core_crypto::commons::math::torus::UnsignedTorus;
use crate::core_crypto::commons::numeric::{CastFrom, CastInto, UnsignedInteger};
use crate::core_crypto::commons::parameters::PolynomialSize;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use core::any::TypeId;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
use std::collections::hash_map::Entry;
@@ -277,7 +277,7 @@ pub fn convert_forward_torus<Scalar: UnsignedTorus>(
let normalization = 2.0_f64.powi(-(Scalar::BITS as i32));
for (out_re0, out_re1, out_im0, out_im1, &in_re, &in_im) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
{
let out_re = to_signed_to_f128(in_re);
let out_im = to_signed_to_f128(in_im);
@@ -301,7 +301,7 @@ pub fn convert_forward_integer<Scalar: UnsignedTorus>(
in_im: &[Scalar],
) {
for (out_re0, out_re1, out_im0, out_im1, &in_re, &in_im) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re, in_im)
{
let out_re = to_signed_to_f128(in_re);
let out_im = to_signed_to_f128(in_im);
@@ -323,7 +323,7 @@ fn convert_add_backward_torus<Scalar: UnsignedTorus>(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re, out_im, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);
@@ -343,7 +343,7 @@ fn convert_backward_torus<Scalar: UnsignedTorus>(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re, out_im, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re, out_im, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);

View File

@@ -42,7 +42,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
stack,
);
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);
} else {
@@ -103,7 +103,7 @@ fn test_product<Scalar: UnsignedTorus>() {
let mut fourier1_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
let integer_magnitude = 16;
for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
for (x, y) in izip_eq!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
*x = generator.random_uniform();
*y = generator.random_uniform();
@@ -128,7 +128,7 @@ fn test_product<Scalar: UnsignedTorus>() {
&poly1,
);
for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip!(
for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip_eq!(
&mut *fourier0_re0,
&mut *fourier0_re1,
&mut *fourier0_im0,
@@ -161,7 +161,7 @@ fn test_product<Scalar: UnsignedTorus>() {
poly1.as_ref(),
);
for (expected, actual) in izip!(
for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {

View File

@@ -6,7 +6,7 @@ use crate::core_crypto::commons::parameters::{
CiphertextModulus, DecompositionBaseLog, DecompositionLevelCount, MonomialDegree,
};
use crate::core_crypto::commons::traits::ContiguousEntityContainerMut;
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::prelude::{Container, ContainerMut, ModulusSwitchedLweCiphertext};
use aligned_vec::CACHELINE_ALIGN;
@@ -21,13 +21,13 @@ pub fn polynomial_wrapping_monic_monomial_mul_assign_split(
let output_hi = output_hi.into_container();
let full_cycles_count = monomial_degree.0 / output_lo.container_len();
if full_cycles_count % 2 != 0 {
izip!(&mut *output_lo, &mut *output_hi)
izip_eq!(&mut *output_lo, &mut *output_hi)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
let remaining_degree = monomial_degree.0 % output_lo.container_len();
output_lo.rotate_right(remaining_degree);
output_hi.rotate_right(remaining_degree);
izip!(output_lo, output_hi)
izip_eq!(output_lo, output_hi)
.take(remaining_degree)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
@@ -41,13 +41,13 @@ pub fn polynomial_wrapping_monic_monomial_div_assign_split(
let output_hi = output_hi.into_container();
let full_cycles_count = monomial_degree.0 / output_lo.container_len();
if full_cycles_count % 2 != 0 {
izip!(&mut *output_lo, &mut *output_hi)
izip_eq!(&mut *output_lo, &mut *output_hi)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
}
let remaining_degree = monomial_degree.0 % output_lo.container_len();
output_lo.rotate_left(remaining_degree);
output_hi.rotate_left(remaining_degree);
izip!(output_lo, output_hi)
izip_eq!(output_lo, output_hi)
.rev()
.take(remaining_degree)
.for_each(|(lo, hi)| (*lo, *hi) = wrapping_neg((*lo, *hi)));
@@ -79,7 +79,7 @@ where
let msed_lwe_mask = msed_lwe.mask();
let msed_lwe_body = msed_lwe.body();
for (poly_lo, poly_hi) in izip!(
for (poly_lo, poly_hi) in izip_eq!(
lut_lo.as_mut_polynomial_list().iter_mut(),
lut_hi.as_mut_polynomial_list().iter_mut(),
) {
@@ -95,7 +95,7 @@ where
let mut ct0_hi = lut_hi;
for (lwe_mask_element, bootstrap_key_ggsw) in
izip!(msed_lwe_mask, this.into_ggsw_iter())
izip_eq!(msed_lwe_mask, this.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let stack = &mut *stack;
@@ -116,7 +116,7 @@ where
);
// We rotate ct_1 by performing ct_1 <- ct_1 * X^{a_hat}
for (poly_lo, poly_hi) in izip!(
for (poly_lo, poly_hi) in izip_eq!(
ct1_lo.as_mut_polynomial_list().iter_mut(),
ct1_hi.as_mut_polynomial_list().iter_mut(),
) {
@@ -204,7 +204,7 @@ where
);
let (local_accumulator, _) = stack.collect_aligned(
align,
izip!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
izip_eq!(local_accumulator_lo.as_ref(), local_accumulator_hi.as_ref())
.map(|(&lo, &hi)| lo as u128 | ((hi as u128) << 64)),
);
let mut local_accumulator = GlweCiphertextMutView::from_container(

View File

@@ -4,7 +4,7 @@ use crate::core_crypto::commons::traits::container::Split;
use crate::core_crypto::commons::traits::contiguous_entity_container::{
ContiguousEntityContainer, ContiguousEntityContainerMut,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::fft128::crypto::ggsw::update_with_fmadd;
use crate::core_crypto::prelude::{Container, ContainerMut, SignedDecomposer};
@@ -86,7 +86,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
let (decomposition_states_hi, substack1) =
stack.make_aligned_raw::<u64>(poly_size * glwe_size, align);
for (out_lo, out_hi, in_lo, in_hi) in izip!(
for (out_lo, out_hi, in_lo, in_hi) in izip_eq!(
&mut *decomposition_states_lo,
&mut *decomposition_states_hi,
glwe_lo.as_ref(),
@@ -153,7 +153,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
//
// t = 1 t = 2 ...
for (ggsw_row, glwe_poly_lo, glwe_poly_hi) in izip!(
for (ggsw_row, glwe_poly_lo, glwe_poly_hi) in izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term_lo.as_polynomial_list().iter(),
glwe_decomp_term_hi.as_polynomial_list().iter(),
@@ -202,7 +202,7 @@ pub fn add_external_product_assign_split<ContOutLo, ContOutHi, ContGgsw, ContGlw
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
for (mut out_lo, mut out_hi, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip!(
for (mut out_lo, mut out_hi, fourier_re0, fourier_re1, fourier_im0, fourier_im1) in izip_eq!(
out_lo.as_mut_polynomial_list().iter_mut(),
out_hi.as_mut_polynomial_list().iter_mut(),
output_fft_buffer_re0.into_chunks(fourier_poly_size),
@@ -293,7 +293,7 @@ fn collect_next_term_split_avx512(
let base_log_complement = simd.splat_u64x8(64u64.wrapping_sub(base_log));
let base_log = simd.splat_u64x8(base_log);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -425,7 +425,7 @@ fn collect_next_term_split_avx2(
let base_log_complement = simd.splat_u64x4(64u64.wrapping_sub(base_log));
let base_log = simd.splat_u64x4(base_log);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -507,7 +507,7 @@ fn collect_next_term_split_scalar(
base_log: usize,
) {
assert!(base_log < 128);
for (out_lo, out_hi, state_lo, state_hi) in izip!(
for (out_lo, out_hi, state_lo, state_hi) in izip_eq!(
glwe_decomp_term_lo,
glwe_decomp_term_hi,
decomposition_states_lo,
@@ -603,6 +603,9 @@ fn collect_next_term_split(
}
/// This cmux mutates both ct1 and ct0. The result is in ct0 after the method was called.
///
/// # Panics
/// This will panic if ct0_lo, ct0_hi, ct1_lo and ct1_hi are not of the same size
pub fn cmux_split<ContCt0Lo, ContCt0Hi, ContCt1Lo, ContCt1Hi, ContGgsw>(
ct0_lo: &mut GlweCiphertext<ContCt0Lo>,
ct0_hi: &mut GlweCiphertext<ContCt0Hi>,
@@ -627,7 +630,7 @@ pub fn cmux_split<ContCt0Lo, ContCt0Hi, ContCt1Lo, ContCt1Hi, ContGgsw>(
fft: Fft128View<'_>,
stack: &mut PodStack,
) {
for (c1_lo, c1_hi, c0_lo, c0_hi) in izip!(
for (c1_lo, c1_hi, c0_lo, c0_hi) in izip_eq!(
ct1_lo.as_mut(),
ct1_hi.as_mut(),
ct0_lo.as_ref(),

View File

@@ -1,4 +1,4 @@
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
pub use crate::core_crypto::fft_impl::fft128::math::fft::Fft128View;
use dyn_stack::PodStack;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -819,7 +819,7 @@ pub fn convert_forward_integer_avx2(
let in_im_hi = pulp::as_arrays::<4, _>(in_im_hi).0;
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re =
to_signed_to_f128_avx2(simd, (pulp::cast(*in_re_lo), pulp::cast(*in_re_hi)));
@@ -899,7 +899,7 @@ pub fn convert_forward_integer_avx512(
let in_im_hi = pulp::as_arrays::<8, _>(in_im_hi).0;
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re =
to_signed_to_f128_avx512(simd, (pulp::cast(*in_re_lo), pulp::cast(*in_re_hi)));
@@ -938,7 +938,7 @@ pub fn convert_forward_integer_scalar(
in_im_hi: &[u64],
) {
for (out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi) in
izip!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
izip_eq!(out_re0, out_re1, out_im0, out_im1, in_re_lo, in_re_hi, in_im_lo, in_im_hi)
{
let out_re = to_signed_to_f128((*in_re_lo, *in_re_hi));
let out_im = to_signed_to_f128((*in_im_lo, *in_im_hi));
@@ -990,7 +990,7 @@ pub fn convert_add_backward_torus_scalar(
) {
let norm = 1.0 / in_re0.len() as f64;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = f128(*in_re0 * norm, *in_re1 * norm);
let in_im = f128(*in_im0 * norm, *in_im1 * norm);
@@ -1056,7 +1056,7 @@ pub fn convert_add_backward_torus_avx2(
let in_im1 = pulp::as_arrays::<4, _>(in_im1).0;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = (
simd.mul_f64x4(pulp::cast(*in_re0), norm),
@@ -1149,7 +1149,7 @@ pub fn convert_add_backward_torus_avx512(
let in_im1 = pulp::as_arrays::<8, _>(in_im1).0;
for (out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1) in
izip!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
izip_eq!(out_re_lo, out_re_hi, out_im_lo, out_im_hi, in_re0, in_re1, in_im0, in_im1)
{
let in_re = (
simd.mul_f64x8(pulp::cast(*in_re0), norm),

View File

@@ -14,7 +14,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use crate::core_crypto::fft_impl::common::FourierBootstrapKey;
use crate::core_crypto::fft_impl::fft64::math::fft::par_convert_polynomials_list_to_fourier;
@@ -72,7 +72,10 @@ impl<C: Container<Element = c64>> FourierLweBootstrapKey<C> {
}
/// Return an iterator over the GGSW ciphertexts composing the key.
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
+ ExactSizeIterator<Item = FourierGgswCiphertext<C>>
where
C: Split,
{
@@ -190,6 +193,9 @@ pub fn fill_with_forward_fourier_scratch(fft: FftView<'_>) -> Result<StackReq, S
impl FourierLweBootstrapKeyMutView<'_> {
/// Fill a bootstrapping key with the Fourier transform of a bootstrapping key in the standard
/// domain.
///
/// # Panics
/// This will panic if self and coeff_bsk are not of the same size
pub fn fill_with_forward_fourier<Scalar: UnsignedTorus>(
mut self,
coef_bsk: LweBootstrapKey<&'_ [Scalar]>,
@@ -197,7 +203,7 @@ impl FourierLweBootstrapKeyMutView<'_> {
stack: &mut PodStack,
) {
for (fourier_ggsw, standard_ggsw) in
izip!(self.as_mut_view().into_ggsw_iter(), coef_bsk.iter())
izip_eq!(self.as_mut_view().into_ggsw_iter(), coef_bsk.iter())
{
fourier_ggsw.fill_with_forward_fourier(standard_ggsw, fft, stack);
}
@@ -325,7 +331,8 @@ impl FourierLweBootstrapKeyView<'_> {
let mut ct1 =
GlweCiphertextMutView::from_container(&mut *ct1, lut_poly_size, ciphertext_modulus);
for (lwe_mask_element, bootstrap_key_ggsw) in izip!(msed_lwe_mask, self.into_ggsw_iter()) {
for (lwe_mask_element, bootstrap_key_ggsw) in izip_eq!(msed_lwe_mask, self.into_ggsw_iter())
{
if lwe_mask_element != 0 {
let monomial_degree = MonomialDegree(lwe_mask_element);
@@ -335,7 +342,7 @@ impl FourierLweBootstrapKeyView<'_> {
// We rotate ct_1 and subtract ct_0 (first step of cmux) by performing
// ct_1 <- (ct_0 * X^{a_hat}) - ct_0
for (mut ct1_poly, ct0_poly) in izip!(
for (mut ct1_poly, ct0_poly) in izip_eq!(
ct1.as_mut_polynomial_list().iter_mut(),
ct0.as_polynomial_list().iter(),
) {
@@ -386,7 +393,7 @@ impl FourierLweBootstrapKeyView<'_> {
let ciphertext_modulus = lut_list.ciphertext_modulus();
assert!(ciphertext_modulus.is_compatible_with_native_modulus());
for (mut lut, lwe) in izip!(lut_list.as_mut_view().iter_mut(), msed_lwe_list.iter()) {
for (mut lut, lwe) in izip_eq!(lut_list.as_mut_view().iter_mut(), msed_lwe_list.iter()) {
let msed_lwe_body = lwe.body();
let monomial_degree = MonomialDegree(msed_lwe_body.cast_into());
@@ -414,7 +421,7 @@ impl FourierLweBootstrapKeyView<'_> {
);
for (idx, bootstrap_key_ggsw) in self.into_ggsw_iter().enumerate() {
for (mut ct0, mut ct1, msed_lwe) in izip!(
for (mut ct0, mut ct1, msed_lwe) in izip_eq!(
ct0_list.as_mut_view().iter_mut(),
ct1_list.as_mut_view().iter_mut(),
msed_lwe_list.iter()
@@ -431,7 +438,7 @@ impl FourierLweBootstrapKeyView<'_> {
// We rotate ct_1 and subtract ct_0 (first step of cmux) by performing
// ct_1 <- (ct_0 * X^{a_hat}) - ct_0
for (mut ct1_poly, ct0_poly) in izip!(
for (mut ct1_poly, ct0_poly) in izip_eq!(
ct1.as_mut_polynomial_list().iter_mut(),
ct0.as_polynomial_list().iter(),
) {
@@ -551,7 +558,8 @@ impl FourierLweBootstrapKeyView<'_> {
self.batch_blind_rotate_assign(local_accumulator.as_mut_view(), &lwe_in_msed, fft, stack);
for (mut lwe_out, local_accumulator) in izip!(lwe_out.iter_mut(), local_accumulator.iter())
for (mut lwe_out, local_accumulator) in
izip_eq!(lwe_out.iter_mut(), local_accumulator.iter())
{
extract_lwe_sample_from_glwe_ciphertext(
&local_accumulator,

View File

@@ -10,7 +10,7 @@ use crate::core_crypto::commons::parameters::{
use crate::core_crypto::commons::traits::{
Container, ContiguousEntityContainer, ContiguousEntityContainerMut, IntoContainerOwned, Split,
};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::ggsw_ciphertext::{
fourier_ggsw_level_matrix_size, GgswCiphertextView,
};
@@ -154,7 +154,10 @@ impl<C: Container<Element = c64>> FourierGgswLevelMatrix<C> {
}
/// Return an iterator over the rows of the level matrices.
pub fn into_rows(self) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
pub fn into_rows(
self,
) -> impl DoubleEndedIterator<Item = FourierGgswLevelRow<C>>
+ ExactSizeIterator<Item = FourierGgswLevelRow<C>>
where
C: Split,
{
@@ -262,7 +265,7 @@ impl FourierGgswCiphertextMutView<'_> {
debug_assert_eq!(coef_ggsw.polynomial_size(), self.polynomial_size());
let fourier_poly_size = coef_ggsw.polynomial_size().to_fourier_polynomial_size().0;
for (fourier_poly, coef_poly) in izip!(
for (fourier_poly, coef_poly) in izip_eq!(
self.data().into_chunks(fourier_poly_size),
coef_ggsw.as_polynomial_list().iter()
) {
@@ -401,7 +404,10 @@ impl<C: Container<Element = c64>> FourierGgswCiphertextList<C> {
}
}
pub fn into_ggsw_iter(self) -> impl DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
pub fn into_ggsw_iter(
self,
) -> impl ExactSizeIterator<Item = FourierGgswCiphertext<C>>
+ DoubleEndedIterator<Item = FourierGgswCiphertext<C>>
where
C: Split,
{
@@ -548,7 +554,7 @@ pub fn add_external_product_assign<Scalar>(
//
// t = 1 t = 2 ...
izip!(
izip_eq!(
ggsw_decomp_matrix.into_rows(),
glwe_decomp_term.as_polynomial_list().iter()
)
@@ -586,7 +592,7 @@ pub fn add_external_product_assign<Scalar>(
//
// We iterate over the polynomials in the output.
if !is_output_uninit {
- izip!(
+ izip_eq!(
out.as_mut_polynomial_list().iter_mut(),
output_fft_buffer
.into_chunks(fourier_poly_size)
@@ -649,26 +655,26 @@ pub(crate) fn update_with_fmadd(
let rhs = S::as_simd_c64s(fourier).0;
if is_output_uninit {
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
output_fft_buffer.into_chunks(fourier_poly_size),
lhs_polynomial_list.into_chunks(fourier_poly_size)
) {
let out = S::as_mut_simd_c64s(output_fourier).0;
let lhs = S::as_simd_c64s(ggsw_poly).0;
- for (out, lhs, rhs) in izip!(out, lhs, rhs) {
+ for (out, lhs, rhs) in izip_eq!(out, lhs, rhs) {
*out = simd.mul_c64s(*lhs, *rhs);
}
}
} else {
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
output_fft_buffer.into_chunks(fourier_poly_size),
lhs_polynomial_list.into_chunks(fourier_poly_size)
) {
let out = S::as_mut_simd_c64s(output_fourier).0;
let lhs = S::as_simd_c64s(ggsw_poly).0;
- for (out, lhs, rhs) in izip!(out, lhs, rhs) {
+ for (out, lhs, rhs) in izip_eq!(out, lhs, rhs) {
*out = simd.mul_add_c64s(*lhs, *rhs, *out);
}
}
@@ -719,7 +725,7 @@ pub(crate) fn update_with_fmadd_factor(
fn with_simd<S: pulp::Simd>(self, simd: S) -> Self::Output {
let factor = simd.splat_c64s(self.factor);
- for (output_fourier, ggsw_poly) in izip!(
+ for (output_fourier, ggsw_poly) in izip_eq!(
self.output_fft_buffer.into_chunks(self.fourier_poly_size),
self.lhs_polynomial_list.into_chunks(self.fourier_poly_size)
) {
@@ -728,12 +734,12 @@ pub(crate) fn update_with_fmadd_factor(
let rhs = S::as_simd_c64s(self.fourier).0;
if self.is_output_uninit {
- for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+ for (out, &lhs, &rhs) in izip_eq!(out, lhs, rhs) {
// NOTE: factor * (lhs * rhs) is more efficient than (lhs * rhs) * factor
*out = simd.mul_c64s(factor, simd.mul_c64s(lhs, rhs));
}
} else {
- for (out, &lhs, &rhs) in izip!(out, lhs, rhs) {
+ for (out, &lhs, &rhs) in izip_eq!(out, lhs, rhs) {
// NOTE: see above
*out = simd.mul_add_c64s(factor, simd.mul_c64s(lhs, rhs), *out);
}
@@ -769,7 +775,7 @@ pub fn cmux<Scalar: UnsignedTorus>(
fft: FftView<'_>,
stack: &mut PodStack,
) {
- izip!(ct1.as_mut(), ct0.as_ref()).for_each(|(c1, c0)| {
+ izip_eq!(ct1.as_mut(), ct0.as_ref()).for_each(|(c1, c0)| {
*c1 = c1.wrapping_sub(*c0);
});
add_external_product_assign(ct0, ggsw, ct1.as_view(), fft, stack);


@@ -12,7 +12,7 @@ use crate::core_crypto::algorithms::*;
use crate::core_crypto::commons::math::decomposition::DecompositionLevel;
use crate::core_crypto::commons::parameters::*;
use crate::core_crypto::commons::traits::*;
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::CACHELINE_ALIGN;
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -216,7 +216,7 @@ pub fn extract_bits<Scalar: UnsignedTorus + CastInto<usize>>(
*out_pbs_body = (*out_pbs_body).wrapping_add(Scalar::ONE << (delta_log.0 + bit_idx - 1));
// Remove the extracted bit from the initial LWE to get a 0 at the extracted bit location.
- izip!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
+ izip_eq!(lwe_in_buffer.as_mut(), lwe_out_pbs_buffer.as_ref())
.for_each(|(out, inp)| *out = (*out).wrapping_sub(*inp));
}
}
@@ -521,7 +521,7 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
break;
};
- let mut t_iter = izip!(t_0.iter_mut(), t_1.iter_mut()).enumerate();
+ let mut t_iter = izip_eq!(t_0.iter_mut(), t_1.iter_mut()).enumerate();
let (mut j_counter, (mut t0_j, mut t1_j)) = t_iter.next().unwrap();
@@ -539,7 +539,7 @@ pub fn cmux_tree_memory_optimized<Scalar: UnsignedTorus + CastInto<usize>>(
if t_fill[j] == 2 {
let (diff_data, stack) = stack.collect_aligned(
CACHELINE_ALIGN,
- izip!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
+ izip_eq!(t1_j.as_ref(), t0_j.as_ref()).map(|(&a, &b)| a.wrapping_sub(b)),
);
let diff = GlweCiphertext::from_container(
&*diff_data,
@@ -709,7 +709,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
pfpksk_list.ciphertext_modulus(),
);
- for (lwe_in, ggsw) in izip!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter()) {
+ for (lwe_in, ggsw) in izip_eq!(lwe_list_in.iter(), ggsw_list.as_mut_view().into_ggsw_iter()) {
circuit_bootstrap_boolean(
fourier_bsk,
lwe_in,
@@ -728,7 +728,7 @@ pub fn circuit_bootstrap_boolean_vertical_packing<Scalar: UnsignedTorus + CastIn
let small_lut_size = big_lut_as_polynomial_list.polynomial_count().0 / number_of_luts;
- for (lut, lwe_out) in izip!(
+ for (lut, lwe_out) in izip_eq!(
big_lut_as_polynomial_list.chunks_exact(small_lut_size),
lwe_list_out.iter_mut(),
) {


@@ -521,7 +521,7 @@ pub fn test_cmux_tree() {
level,
);
for (&single_bit_msg, mut fourier_ggsw) in
- izip!(vec_message.iter(), ggsw_list.as_mut_view().into_ggsw_iter())
+ izip_eq!(vec_message.iter(), ggsw_list.as_mut_view().into_ggsw_iter())
{
let mut ggsw = GgswCiphertextOwned::new(
0_u64,


@@ -6,7 +6,7 @@ use crate::core_crypto::commons::math::torus::UnsignedTorus;
use crate::core_crypto::commons::numeric::CastInto;
use crate::core_crypto::commons::parameters::{PolynomialCount, PolynomialSize};
use crate::core_crypto::commons::traits::{Container, ContainerMut, IntoContainerOwned};
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::entities::*;
use aligned_vec::{avec, ABox};
use dyn_stack::{PodStack, SizeOverflow, StackReq};
@@ -65,7 +65,7 @@ impl Twisties {
let mut im = avec![0.0; n].into_boxed_slice();
let unit = core::f64::consts::PI / (2.0 * n as f64);
- for (i, (re, im)) in izip!(&mut *re, &mut *im).enumerate() {
+ for (i, (re, im)) in izip_eq!(&mut *re, &mut *im).enumerate() {
(*im, *re) = (i as f64 * unit).sin_cos();
}
@@ -212,7 +212,7 @@ fn convert_forward_torus<Scalar: UnsignedTorus>(
) {
let normalization = 2.0_f64.powi(-(Scalar::BITS as i32));
- izip!(out, in_re, in_im, twisties.re, twisties.im).for_each(
+ izip_eq!(out, in_re, in_im, twisties.re, twisties.im).for_each(
|(out, in_re, in_im, w_re, w_im)| {
let in_re: f64 = in_re.into_signed().cast_into() * normalization;
let in_im: f64 = in_im.into_signed().cast_into() * normalization;
@@ -233,7 +233,7 @@ fn convert_forward_integer_scalar<Scalar: UnsignedTorus>(
in_im: &[Scalar],
twisties: TwistiesView<'_>,
) {
- izip!(out, in_re, in_im, twisties.re, twisties.im).for_each(
+ izip_eq!(out, in_re, in_im, twisties.re, twisties.im).for_each(
|(out, in_re, in_im, w_re, w_im)| {
let in_re: f64 = in_re.into_signed().cast_into();
let in_im: f64 = in_im.into_signed().cast_into();
@@ -278,7 +278,7 @@ fn convert_backward_torus<Scalar: UnsignedTorus>(
twisties: TwistiesView<'_>,
) {
let normalization = 1.0 / inp.len() as f64;
- izip!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
+ izip_eq!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
|(out_re, out_im, inp, w_re, w_im)| {
let tmp = inp
* (c64 {
@@ -299,7 +299,7 @@ fn convert_add_backward_torus_scalar<Scalar: UnsignedTorus>(
twisties: TwistiesView<'_>,
) {
let normalization = 1.0 / inp.len() as f64;
- izip!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
+ izip_eq!(out_re, out_im, inp, twisties.re, twisties.im).for_each(
|(out_re, out_im, inp, w_re, w_im)| {
let tmp = inp
* (c64 {
@@ -781,7 +781,7 @@ pub fn par_convert_polynomials_list_to_fourier<Scalar: UnsignedTorus>(
let stack = PodStack::new(&mut mem);
- for (fourier_poly, standard_poly) in izip!(
+ for (fourier_poly, standard_poly) in izip_eq!(
fourier_poly_chunk.chunks_exact_mut(f_polynomial_size),
standard_poly_chunk.chunks_exact(polynomial_size.0)
) {


@@ -34,7 +34,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.backward_as_torus(roundtrip.as_mut_view(), fourier.as_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -48,7 +48,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.add_backward_as_torus(roundtrip.as_mut_view(), fourier.as_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -62,7 +62,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier.as_mut_view(), poly.as_view(), stack);
fft.add_backward_in_place_as_torus(roundtrip.as_mut_view(), fourier.as_mut_view(), stack);
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS == 32 {
assert!(modular_distance(*expected, *actual) == Scalar::ZERO);
} else {
@@ -119,7 +119,7 @@ fn test_product<Scalar: UnsignedTorus>() {
};
let integer_magnitude = 16;
- for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
+ for (x, y) in izip_eq!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
*x = generator.random_uniform();
*y = generator.random_uniform();
*y >>= Scalar::BITS - integer_magnitude;
@@ -135,7 +135,7 @@ fn test_product<Scalar: UnsignedTorus>() {
fft.forward_as_torus(fourier0.as_mut_view(), poly0.as_view(), stack);
fft.forward_as_integer(fourier1.as_mut_view(), poly1.as_view(), stack);
- for (f0, f1) in izip!(&mut *fourier0.data, &*fourier1.data) {
+ for (f0, f1) in izip_eq!(&mut *fourier0.data, &*fourier1.data) {
*f0 *= *f1;
}
@@ -152,7 +152,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {
@@ -174,7 +174,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {
@@ -198,7 +198,7 @@ fn test_product<Scalar: UnsignedTorus>() {
stack,
);
- for (expected, actual) in izip!(
+ for (expected, actual) in izip_eq!(
convolution_from_naive.as_ref().iter(),
convolution_from_fft.as_ref().iter()
) {


@@ -11,7 +11,7 @@
use super::super::super::c64;
use super::TwistiesView;
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
@@ -178,7 +178,7 @@ pub fn convert_forward_integer_u32_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -270,7 +270,7 @@ pub fn convert_forward_integer_u64_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -363,7 +363,7 @@ pub fn convert_forward_integer_u32_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -456,7 +456,7 @@ pub fn convert_forward_integer_u64_avx2_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out, &in_re, &in_im, &w_re, &w_im) in izip!(out, in_re, in_im, w_re, w_im) {
+ for (out, &in_re, &in_im, &w_re, &w_im) in izip_eq!(out, in_re, in_im, w_re, w_im) {
let in_re = pulp::cast(in_re);
let in_im = pulp::cast(in_im);
let w_re = pulp::cast(w_re);
@@ -604,7 +604,7 @@ pub fn convert_add_backward_torus_u32_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m512d; 2]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -690,7 +690,7 @@ pub fn convert_add_backward_torus_u64_v4(
let w_re = pulp::as_arrays::<8, _>(twisties.re).0;
let w_im = pulp::as_arrays::<8, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m512d; 2]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -832,7 +832,7 @@ pub fn convert_add_backward_torus_u32_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m128d; 4]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);
@@ -917,7 +917,7 @@ pub fn convert_add_backward_torus_u64_v3(
let w_re = pulp::as_arrays::<4, _>(twisties.re).0;
let w_im = pulp::as_arrays::<4, _>(twisties.im).0;
- for (out_re, out_im, &inp, &w_re, &w_im) in izip!(out_re, out_im, inp, w_re, w_im) {
+ for (out_re, out_im, &inp, &w_re, &w_im) in izip_eq!(out_re, out_im, inp, w_re, w_im) {
let inp = pulp::cast::<_, [__m128d; 4]>(inp);
let w_re = pulp::cast(w_re);
let w_im = pulp::cast(w_im);


@@ -1,5 +1,5 @@
use crate::core_crypto::commons::test_tools::{modular_distance, new_random_generator};
- use crate::core_crypto::commons::utils::izip;
+ use crate::core_crypto::commons::utils::izip_eq;
use crate::core_crypto::gpu::vec::GpuIndex;
use crate::core_crypto::gpu::{
fourier_transform_backward_as_torus_f128_async, fourier_transform_forward_as_torus_f128_async,
@@ -53,7 +53,7 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
cuda_synchronize_device(0);
}
- for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
+ for (expected, actual) in izip_eq!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);
} else {


@@ -1,5 +1,5 @@
use crate::core_crypto::gpu::vec::{range_bounds_to_start_end, CudaVec};
- use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
+ use crate::core_crypto::gpu::{CudaLweList, CudaStreams, GpuIndex};
use crate::core_crypto::prelude::{
CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList,
LweDimension, LweSize, UnsignedInteger,
@@ -213,6 +213,10 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
Self(self.0.duplicate(streams))
}
+ pub(crate) fn gpu_indexes(&self) -> &[GpuIndex] {
+ self.0.d_vec.gpu_indexes.as_slice()
+ }
pub(crate) fn lwe_dimension(&self) -> LweDimension {
self.0.lwe_dimension
}


@@ -1212,6 +1212,11 @@ pub fn get_number_of_gpus() -> u32 {
unsafe { cuda_get_number_of_gpus() as u32 }
}
+ /// Get the number of sms on the GPU
+ pub fn get_number_of_sms() -> u32 {
+ unsafe { cuda_get_number_of_sms() as u32 }
+ }
/// Setup multi-GPU and return the number of GPUs used
pub fn setup_multi_gpu(device_0_id: GpuIndex) -> u32 {
unsafe { cuda_setup_multi_gpu(device_0_id.get()) as u32 }

Some files were not shown because too many files have changed in this diff.