Compare commits

...

16 Commits

Author SHA1 Message Date
Baptiste Roux
52b8e81ccb fix(hpu): Correctly select adder configuration in ERC_20/ERC_20_SIMD
Add knobs to select a ripple-carry or Kogge-Stone adder in ERC_20/ERC_20_SIMD.
Previously, the adder was hardcoded to ripple-carry, which degraded the
latency of ERC_20.
2025-12-24 10:38:38 +01:00
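Context for the commit above: the sketch below contrasts the two adder topologies the new knob selects between, on plain machine integers. It is illustrative only — the function names are invented here and this is not the HPU/FHE implementation — but it shows why the choice matters: a ripple-carry adder resolves carries serially (O(n) depth), while a Kogge-Stone adder resolves them with a parallel prefix (O(log n) depth), and depth translates directly into latency.

```cpp
#include <cstdint>
#include <cstdio>

// Ripple-carry: each carry depends on the previous one, so the loop
// models a chain of 32 dependent steps.
uint32_t ripple_add(uint32_t a, uint32_t b) {
  uint32_t sum = 0, carry = 0;
  for (int i = 0; i < 32; ++i) {
    uint32_t ai = (a >> i) & 1, bi = (b >> i) & 1;
    sum |= (ai ^ bi ^ carry) << i;
    carry = (ai & bi) | (carry & (ai ^ bi));
  }
  return sum;
}

// Kogge-Stone: generate/propagate signals are combined in a log-depth
// parallel prefix, so all 32 carries are known after 5 combining steps.
uint32_t kogge_stone_add(uint32_t a, uint32_t b) {
  uint32_t g = a & b; // generate
  uint32_t p = a ^ b; // propagate
  for (int d = 1; d < 32; d <<= 1) {
    g |= p & (g << d);
    p &= p << d;
  }
  return (a ^ b) ^ (g << 1); // sum = propagate XOR incoming carries
}

int main() {
  printf("%u %u\n", ripple_add(123456u, 654321u),
         kogge_stone_add(123456u, 654321u)); // both print 777777
}
```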
Baptiste Roux
b19a7773bb feat: Add IfThenZero impl for Cpu 2025-12-24 10:38:38 +01:00
pgardratzama
0342b0466d chore(hpu): fix panic msg 2025-12-24 10:38:38 +01:00
pgardratzama
edc9ef0026 fix(hpu): fix whitepaper erc20 for HPU using if_then_zero 2025-12-24 10:38:38 +01:00
Guillermo Oyarzun
92df46f8f2 fix(gpu): return to 64 regs in multi-bit pbs 2025-12-23 11:51:00 +01:00
David Testé
effb7ada6d chore(ci): fix argument name passed to data_extractor 2025-12-18 18:09:34 +01:00
Agnes Leroy
49be544297 fix(gpu): fix cpu memory leak in expand and rerand 2025-12-18 16:33:23 +01:00
David Testé
23600eb8e1 chore(ci): split gpu documentation benchmarks execution
This is done to mitigate H100x8-SXM5 server scarcity.
2025-12-18 14:56:15 +01:00
Agnes Leroy
9708cc7fe9 chore(gpu): remove core crypto from valgrind run 2025-12-18 13:01:12 +01:00
dependabot[bot]
4cdfccb659 chore(deps): bump actions/setup-node from 6.0.0 to 6.1.0
Bumps [actions/setup-node](https://github.com/actions/setup-node) from 6.0.0 to 6.1.0.
- [Release notes](https://github.com/actions/setup-node/releases)
- [Commits](2028fbc5c2...395ad32622)

---
updated-dependencies:
- dependency-name: actions/setup-node
  dependency-version: 6.1.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:56 +01:00
dependabot[bot]
031c3fe34f chore(deps): bump actions/checkout from 6.0.0 to 6.0.1
Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.0 to 6.0.1.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](1af3b93b68...8e8c483db8)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 6.0.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:47 +01:00
dependabot[bot]
ea99307cf5 chore(deps): bump actions/stale from 10.1.0 to 10.1.1
Bumps [actions/stale](https://github.com/actions/stale) from 10.1.0 to 10.1.1.
- [Release notes](https://github.com/actions/stale/releases)
- [Changelog](https://github.com/actions/stale/blob/main/CHANGELOG.md)
- [Commits](5f858e3efb...997185467f)

---
updated-dependencies:
- dependency-name: actions/stale
  dependency-version: 10.1.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
2025-12-18 11:05:36 +01:00
Enzo Di Maria
ca2a79f1fb refactor(gpu): Threshold for multi-GPU with Classical PBS 2025-12-18 09:27:09 +01:00
Enzo Di Maria
0a59e86675 fix(gpu): Using tbc for classical 64 bits pbs on H100 2025-12-17 19:18:01 +01:00
Nicolas Sarlin
312ce494bf chore(zk): add 1 * 64 benches with production CRS 2025-12-17 15:06:37 +01:00
Nicolas Sarlin
5f2e7e31f1 chore(zk): align wasm bench and integer bench 2025-12-17 15:06:37 +01:00
95 changed files with 474 additions and 314 deletions

View File

@@ -66,7 +66,7 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -171,7 +171,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -60,7 +60,7 @@ jobs:
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -51,7 +51,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -112,7 +112,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -72,7 +72,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -182,7 +182,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -64,7 +64,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -149,7 +149,7 @@ jobs:
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -8,8 +8,13 @@ on:
description: "Run CPU benchmarks"
type: boolean
default: true
- run-gpu-benchmarks:
- description: "Run GPU benchmarks"
+ # GPU benchmarks are split because of resource scarcity.
+ run-gpu-integer-benchmarks:
+ description: "Run GPU integer benchmarks"
type: boolean
default: true
+ run-gpu-core-crypto-benchmarks:
+ description: "Run GPU core-crypto benchmarks"
+ type: boolean
+ default: true
run-hpu-benchmarks:
@@ -52,7 +57,7 @@ jobs:
run-benchmarks-gpu-integer:
name: benchmark_documentation/run-benchmarks-gpu-integer
uses: ./.github/workflows/benchmark_gpu_common.yml
- if: inputs.run-gpu-benchmarks
+ if: inputs.run-gpu-integer-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -113,7 +118,7 @@ jobs:
run-benchmarks-gpu-core-crypto:
name: benchmark_documentation/run-benchmarks-gpu-core-crypto
uses: ./.github/workflows/benchmark_gpu_common.yml
- if: inputs.run-gpu-benchmarks
+ if: inputs.run-gpu-core-crypto-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -133,7 +138,7 @@ jobs:
generate-svgs-with-benchmarks-run:
name: benchmark-documentation/generate-svgs-with-benchmarks-run
if: ${{ always() &&
- (inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks ||inputs.run-hpu-benchmarks) &&
+ (inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
needs: [
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
@@ -143,7 +148,7 @@ jobs:
with:
time_span_days: 5
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
- generate-gpu-svgs: ${{ inputs.run-gpu-benchmarks }}
+ generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -152,7 +157,7 @@ jobs:
generate-svgs-without-benchmarks-run:
name: benchmark-documentation/generate-svgs-without-benchmarks-run
- if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks || inputs.run-hpu-benchmarks) &&
+ if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
uses: ./.github/workflows/generate_svgs.yml
with:
@@ -175,7 +180,7 @@ jobs:
PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'

View File

@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -175,7 +175,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -130,7 +130,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
path: tfhe-rs
persist-credentials: false
@@ -141,7 +141,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -223,7 +223,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
with:
node-version: 20.x
@@ -299,7 +299,7 @@ jobs:
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -126,7 +126,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -50,7 +50,7 @@ jobs:
pull-requests: write # Needed to write a comment in a pull-request
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0 # Needed to get commit hash
persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -305,7 +305,7 @@ jobs:
REF_NAME: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -102,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -39,7 +39,7 @@ jobs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -91,7 +91,7 @@ jobs:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -175,7 +175,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -26,7 +26,7 @@ jobs:
name: cargo_audit/audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
outputs:
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -140,7 +140,7 @@ jobs:
result: ${{ steps.set_builds_result.outputs.result }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -56,7 +56,7 @@ jobs:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +92,7 @@ jobs:
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -31,7 +31,7 @@ jobs:
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: "false"
@@ -87,7 +87,7 @@ jobs:
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
python3 -m pip install -r ci/data_extractor/requirements.txt
python3 ci/data_extractor/src/data_extractor.py "${OUTPUT_FILENAME}" \
--generate-svg \
- --backend-comparison\
+ --backends-comparison \
--time-span-days "${TIME_SPAN}"
env:
OUTPUT_FILENAME: ${{ inputs.output_filename }}

View File

@@ -41,7 +41,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -23,8 +23,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
- # every 3 months
- - cron: "0 0 1 */3 *"
+ # every month
+ - cron: "0 0 1 * *"
permissions:
contents: read
@@ -50,7 +50,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
- profile: gpu-test
+ profile: single-h100
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
@@ -79,7 +79,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -114,7 +114,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -65,7 +65,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -32,7 +32,7 @@ jobs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -83,7 +83,7 @@ jobs:
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -53,7 +53,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -52,7 +52,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -93,7 +93,7 @@ jobs:
id-token: write # Needed for OIDC token exchange on crates.io
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'

View File

@@ -64,7 +64,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: "false"

View File

@@ -68,7 +68,7 @@ jobs:
id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
steps:
- name: Checkout
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -85,7 +85,7 @@ jobs:
make build_web_js_api_parallel
- name: Authenticate on NPM
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
with:
node-version: '24'
registry-url: 'https://registry.npmjs.org'

View File

@@ -60,7 +60,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
toolchain: stable
- name: Checkout lattice-estimator
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -17,7 +17,7 @@ jobs:
issues: read # Needed to fetch all issues
pull-requests: write # Needed to write message and close the PR
steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2

View File

@@ -35,7 +35,8 @@ template <typename Torus> struct int_aes_lut_buffers {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
- SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
+ SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
+ params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
@@ -50,8 +51,8 @@ template <typename Torus> struct int_aes_lut_buffers {
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
- auto active_streams_flush_lut =
- streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
+ auto active_streams_flush_lut = streams.active_gpu_subset(
+ AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
@@ -65,7 +66,8 @@ template <typename Torus> struct int_aes_lut_buffers {
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
- auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
+ auto active_streams_carry_lut =
+ streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -8,7 +8,8 @@
extern std::mutex m;
extern bool p2p_enabled;
- extern const int THRESHOLD_MULTI_GPU;
+ extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
+ extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -39,7 +40,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}
- uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
+ uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+ PBS_TYPE pbs_type);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -73,9 +75,10 @@ public:
// Returns a subset of this set as an active subset. An active subset is one
// that is temporarily used to perform some computation
- CudaStreams active_gpu_subset(int num_radix_blocks) {
- return CudaStreams(_streams, _gpu_indexes,
- get_active_gpu_count(num_radix_blocks, _gpu_count));
+ CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
+ return CudaStreams(
+ _streams, _gpu_indexes,
+ get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a CudaStreams struct containing only the ith stream
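The definitions behind this header change live in the .cu sources, which this diff does not show. As a rough sketch of what the new parameter enables — the fan-out policy and the threshold values below are assumptions, and the enum/constants are stand-ins for the repo's declarations:

```cpp
#include <algorithm>
#include <cstdint>

// Stand-ins for the repo's declarations; the numeric values are invented.
enum class PBS_TYPE { CLASSICAL, MULTI_BIT };
constexpr uint32_t THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 1024;
constexpr uint32_t THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 2048;

// Sketch of the extended signature: the multi-GPU threshold now depends
// on the PBS flavor instead of a single THRESHOLD_MULTI_GPU constant.
// The ceil-divide fan-out policy below is a guess, not the repo's code.
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
                              PBS_TYPE pbs_type) {
  uint32_t threshold = (pbs_type == PBS_TYPE::MULTI_BIT)
                           ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                           : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  uint32_t wanted = (num_inputs + threshold - 1) / threshold;
  return std::max(1u, std::min(wanted, gpu_count));
}

int main() {
  // 4096 inputs with the (assumed) classical threshold of 2048: 2 GPUs.
  return get_active_gpu_count(4096, 8, PBS_TYPE::CLASSICAL) == 2 ? 0 : 1;
}
```

Presumably the two PBS flavors saturate a GPU at different batch sizes, which would explain why the classical and multi-bit paths get separate thresholds.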

View File

@@ -20,7 +20,8 @@ template <typename Torus> struct boolean_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+ auto active_streams =
+ streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
this->unchecked = is_unchecked;
switch (op) {
case BITAND:
@@ -119,7 +120,8 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
switch (op) {
case BITAND:
case BITOR:
@@ -216,7 +218,8 @@ template <typename Torus> struct boolean_bitnot_buffer {
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+ auto active_streams =
+ streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams);
}
}

View File

@@ -39,7 +39,8 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
},
allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
this->last_block = new CudaRadixCiphertextFFI;

View File

@@ -14,7 +14,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -114,9 +115,11 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
+ auto active_streams_pred =
+ streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
predicate_lut->broadcast_lut(active_streams_pred);
- auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams_msg =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->broadcast_lut(active_streams_msg);
}

View File

@@ -52,7 +52,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(max_chunks);
+ auto active_streams =
+ streams.active_gpu_subset(max_chunks, params.pbs_type);
is_max_value->broadcast_lut(active_streams);
}
@@ -108,7 +109,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->broadcast_lut(active_streams);
// Scalar may have up to num_radix_blocks blocks
@@ -238,7 +240,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->broadcast_lut(active_streams);
}
@@ -390,7 +393,8 @@ template <typename Torus> struct int_comparison_buffer {
this->op = op;
this->is_signed = is_signed;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
identity_lut_f = [](Torus x) -> Torus { return x; };
@@ -523,7 +527,7 @@ template <typename Torus> struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(1);
+ auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->broadcast_lut(active_streams);
}
preallocated_h_lut = (Torus *)malloc(

View File

@@ -116,7 +116,8 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+ auto active_streams = streams.active_gpu_subset(
+ num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}

View File

@@ -356,7 +356,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
luts[j]->broadcast_lut(active_streams);
}
}
@@ -1012,7 +1013,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
- auto active_streams_1 = streams.active_gpu_subset(1);
+ auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->broadcast_lut(active_streams_1);
generate_device_accumulator<Torus>(
@@ -1021,7 +1022,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
- auto active_streams_2 = streams.active_gpu_subset(num_blocks);
+ auto active_streams_2 =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->broadcast_lut(active_streams_2);
}
@@ -1040,7 +1042,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
- auto active_streams = streams.active_gpu_subset(num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
@@ -1128,7 +1131,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
- auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
+ auto active_gpu_count_for_bits =
+ streams.active_gpu_subset(1, params.pbs_type);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -1152,7 +1156,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
- auto active_streams = streams.active_gpu_subset(2 * num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
this->params = params;
if (params.message_modulus == 4 && params.carry_modulus == 4 &&
@@ -1473,7 +1478,8 @@ template <typename Torus> struct int_div_rem_memory {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
- this->active_streams = streams.active_gpu_subset(num_blocks);
+ this->active_streams =
+ streams.active_gpu_subset(num_blocks, params.pbs_type);
this->params = params;
this->is_signed = is_signed;
@@ -1559,7 +1565,7 @@ template <typename Torus> struct int_div_rem_memory {
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
- streams.active_gpu_subset(1); // only 1 block needed
+ streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
}
}

View File

@@ -20,7 +20,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
this->univ_lut_mem =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -246,7 +247,8 @@ template <typename Torus> struct int_ilog2_buffer {
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(counter_num_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->broadcast_lut(active_streams);
this->lut_carry_not =

View File

@@ -371,7 +371,8 @@ struct int_radix_lut_custom_input_output {
this->num_input_blocks = num_input_blocks;
this->gpu_memory_allocated = allocate_gpu_memory;
- this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+ this->active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
}
void setup_degrees() {
@@ -382,14 +383,18 @@ struct int_radix_lut_custom_input_output {
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
for (uint i = 0; i < active_streams.count(); i++) {
cuda_set_device(active_streams.gpu_index(i));
int8_t *gpu_pbs_buffer;
- auto num_blocks_on_gpu =
- std::min((int)num_radix_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(num_radix_blocks, i,
- active_streams.count())));
+ auto num_blocks_on_gpu = std::min(
+ (int)num_radix_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
+ active_streams.count())));
uint64_t size = 0;
execute_scratch_pbs<OutputTorus>(
@@ -424,18 +429,22 @@ struct int_radix_lut_custom_input_output {
/// back to the original indexing
multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
if (num_many_lut > 1) {
multi_gpu_alloc_lwe_many_lut_output_async(
active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
- params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
+ params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
+ allocate_gpu_memory);
} else {
multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
- size_tracker, allocate_gpu_memory);
+ size_tracker, params.pbs_type,
+ allocate_gpu_memory);
}
multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker,
@@ -451,12 +460,14 @@ struct int_radix_lut_custom_input_output {
}
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
- auto inputs_on_gpu =
- std::min((int)num_input_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(num_input_blocks, 0,
- active_streams.count())));
+ auto inputs_on_gpu = std::min(
+ (int)num_input_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
+ active_streams.count())));
if (inputs_on_gpu >= get_threshold_ks_gemm()) {
for (auto i = 0; i < active_streams.count(); ++i) {
@@ -798,16 +809,20 @@ struct int_radix_lut_custom_input_output {
void allocate_lwe_vector_for_non_trivial_indexes(
CudaStreams streams, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
+ int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+ ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+ : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
// We need to create the auxiliary array only in GPU 0
if (active_streams.count() > 1) {
lwe_aligned_vec.resize(active_streams.count());
for (uint i = 0; i < active_streams.count(); i++) {
uint64_t size_tracker_on_array_i = 0;
- auto inputs_on_gpu =
- std::min((int)max_num_radix_blocks,
- std::max(THRESHOLD_MULTI_GPU,
- get_num_inputs_on_gpu(max_num_radix_blocks, i,
- active_streams.count())));
+ auto inputs_on_gpu = std::min(
+ (int)max_num_radix_blocks,
+ std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+ active_streams.count())));
InputTorus *d_array =
(InputTorus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) *
@@ -998,8 +1013,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams =
- streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+ auto active_streams = streams.active_gpu_subset(
+ bits_per_block * num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
/**
@@ -1266,7 +1281,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
- auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
+ auto active_gpu_count_mc =
+ streams.active_gpu_subset(pbs_count, params.pbs_type);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
}
@@ -1436,7 +1452,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_seq_luts);
+ auto active_streams =
+ streams.active_gpu_subset(num_seq_luts, params.pbs_type);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
@@ -1490,7 +1507,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_groups);
+ auto active_streams =
+ streams.active_gpu_subset(num_groups, params.pbs_type);
lut_hillis_steele->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -1667,7 +1685,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -1932,7 +1951,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_second_step->broadcast_lut(active_streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -1957,7 +1977,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
- auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+ auto new_active_streams = streams.active_gpu_subset(
+ new_num_blocks, luts_array_second_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(new_active_streams, false);
@@ -2124,7 +2145,7 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(1);
+ auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut_overflow_flag_prep->broadcast_lut(active_streams);
}
@@ -2196,7 +2217,8 @@ template <typename Torus> struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
- auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
};
@@ -2393,7 +2415,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
@@ -2404,7 +2427,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
- auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+ auto new_active_streams = streams.active_gpu_subset(
+ new_num_blocks, luts_array_first_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(new_active_streams, false);
}
@@ -2499,7 +2523,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
- active_streams = streams.active_gpu_subset(num_radix_blocks);
+ active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
@@ -2520,7 +2545,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_borrow_flag->broadcast_lut(active_streams);
}
- active_streams = streams.active_gpu_subset(num_radix_blocks);
+ active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
2);
};
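The min/max clamp this file's hunks parameterize appears several times above and is easy to misread, so here is a standalone restatement under the same assumed semantics (blocks_on_gpu is an invented name; the result of the repo's get_num_inputs_on_gpu is passed in directly as `slice`):

```cpp
#include <algorithm>
#include <cassert>

// Each GPU sizes its scratch buffers for its own slice of the radix
// blocks, padded up to the pbs_type-dependent threshold, but never
// beyond the total number of blocks.
int blocks_on_gpu(int num_radix_blocks, int threshold, int slice) {
  return std::min(num_radix_blocks, std::max(threshold, slice));
}

int main() {
  assert(blocks_on_gpu(64, 128, 32) == 64);     // small input: padding caps at the total
  assert(blocks_on_gpu(4096, 128, 512) == 512); // large input: the GPU's own slice wins
}
```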

View File

@@ -45,7 +45,8 @@ template <typename Torus> struct int_mul_memory {
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+ auto active_streams =
+ streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
@@ -122,7 +123,8 @@ template <typename Torus> struct int_mul_memory {
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams = streams.active_gpu_subset(total_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
luts_array->broadcast_lut(active_streams);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(

View File

@@ -126,7 +126,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks_to_process * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto active_streams =
streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
luts->broadcast_lut(active_streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

View File

@@ -91,7 +91,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
@@ -177,7 +178,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
@@ -220,7 +222,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
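The three comment lines above are the whole algorithm; a plaintext analogue (illustrative only, blocks stored least-significant first as in radix ciphertexts, `padding_block` standing in for the block the second PBS computes):

// Plaintext sketch of the arithmetic-shift padding trick.
fn arithmetic_shift_right_blocks(blocks: &mut Vec<u8>, n: usize, padding_block: u8) {
    blocks.drain(..n); // drop the n least-significant blocks
    // replicate the padding block at the most-significant end
    blocks.extend(std::iter::repeat(padding_block).take(n));
}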
@@ -276,7 +278,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last = streams.active_gpu_subset(1);
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
@@ -302,7 +305,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1);
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
padding_block_lut_univariate->broadcast_lut(active_streams);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -344,7 +347,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks);
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);

View File

@@ -119,8 +119,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux =
streams.active_gpu_subset(bits_per_block * num_radix_blocks);
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->broadcast_lut(active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
@@ -132,7 +132,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks);
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
}

View File

@@ -108,7 +108,8 @@ template <typename Torus> struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);

View File

@@ -38,7 +38,8 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
uint32_t num_gpus = active_streams.count();

View File

@@ -40,7 +40,8 @@ template <typename Torus> struct int_equality_selectors_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -154,7 +155,8 @@ template <typename Torus> struct int_possible_results_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -207,7 +209,8 @@ template <typename Torus> struct int_possible_results_buffer {
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->broadcast_lut(streams.active_gpu_subset(1));
current_lut->broadcast_lut(
streams.active_gpu_subset(1, params.pbs_type));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -282,7 +285,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams);
@@ -300,7 +304,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->stream_identity_luts[i] = lut;
}
@@ -321,7 +326,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -333,7 +338,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -628,7 +633,8 @@ template <typename Torus> struct int_unchecked_contains_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -703,7 +709,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1094,7 +1101,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1184,7 +1192,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1200,7 +1209,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {
@@ -1292,7 +1302,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1372,7 +1383,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1388,7 +1400,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {
@@ -1462,7 +1475,8 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1523,7 +1537,8 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);

View File

@@ -47,7 +47,7 @@ template <typename Torus> struct compact_lwe_list {
template <typename Torus> struct flattened_compact_lwe_lists {
Torus *d_ptr;
Torus **d_ptr_to_compact_list;
Torus **ptr_array_to_d_compact_list;
const uint32_t *h_num_lwes_per_compact_list;
uint32_t num_compact_lists;
uint32_t lwe_dimension;
@@ -59,13 +59,13 @@ template <typename Torus> struct flattened_compact_lwe_lists {
uint32_t lwe_dimension)
: d_ptr(d_ptr), h_num_lwes_per_compact_list(h_num_lwes_per_compact_list),
num_compact_lists(num_compact_lists), lwe_dimension(lwe_dimension) {
d_ptr_to_compact_list =
static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus **)));
ptr_array_to_d_compact_list =
static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus *)));
total_num_lwes = 0;
auto curr_list = d_ptr;
for (auto i = 0; i < num_compact_lists; ++i) {
total_num_lwes += h_num_lwes_per_compact_list[i];
d_ptr_to_compact_list[i] = curr_list;
ptr_array_to_d_compact_list[i] = curr_list;
curr_list += lwe_dimension + h_num_lwes_per_compact_list[i];
}
}
@@ -75,10 +75,12 @@ template <typename Torus> struct flattened_compact_lwe_lists {
PANIC("index out of range in flattened_compact_lwe_lists::get");
}
return compact_lwe_list(d_ptr_to_compact_list[compact_list_index],
return compact_lwe_list(ptr_array_to_d_compact_list[compact_list_index],
lwe_dimension,
h_num_lwes_per_compact_list[compact_list_index]);
}
void release() { free(ptr_array_to_d_compact_list); }
};
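The host-side pointer table that the new release() frees holds prefix sums over the flattened layout: each compact list occupies lwe_dimension + num_lwes elements (one shared mask plus one body per LWE), as the constructor's loop shows. A Rust sketch of the same offset computation (illustrative, not part of the backend):

// Illustrative: starting offset of each compact list in the flattened buffer,
// mirroring the constructor's pointer arithmetic.
fn compact_list_offsets(num_lwes_per_list: &[usize], lwe_dimension: usize) -> Vec<usize> {
    let mut offsets = Vec::with_capacity(num_lwes_per_list.len());
    let mut curr = 0;
    for &n in num_lwes_per_list {
        offsets.push(curr);
        curr += lwe_dimension + n; // shared mask + one body per LWE
    }
    offsets
}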
/*
@@ -121,7 +123,6 @@ template <typename Torus> struct zk_expand_mem {
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
casting_key_type(casting_key_type) {
gpu_memory_allocated = allocate_gpu_memory;
// We copy num_lwes_per_compact_list so we get protection against
@@ -289,7 +290,8 @@ template <typename Torus> struct zk_expand_mem {
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(2 * num_lwes);
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
@@ -313,7 +315,6 @@ template <typename Torus> struct zk_expand_mem {
}
void release(CudaStreams streams) {
message_and_carry_extract_luts->release(streams);
delete message_and_carry_extract_luts;

View File

@@ -153,7 +153,8 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_chunks);
auto active_streams =
streams.active_gpu_subset(num_chunks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_streams);
}
lut = is_max_value_lut;
@@ -172,8 +173,8 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_gpu_count_is_max =
streams.active_gpu_subset(is_max_value_lut->num_blocks);
auto active_gpu_count_is_max = streams.active_gpu_subset(
is_max_value_lut->num_blocks, params.pbs_type);
is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
@@ -488,7 +489,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
polynomial_size, message_modulus, carry_modulus, f, true,
tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
last_lut->broadcast_lut(active_streams);
// Last leaf

View File

@@ -339,7 +339,9 @@ host_integer_decompress(CudaStreams streams,
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress,
h_mem_ptr->decompression_rescale_lut->params.pbs_type);
if (active_streams.count() == 1) {
execute_pbs_async<Torus, Torus>(
active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,

View File

@@ -542,7 +542,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -645,7 +646,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -764,7 +766,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -1812,7 +1815,8 @@ uint64_t scratch_cuda_apply_univariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -1847,7 +1851,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -1883,7 +1888,8 @@ uint64_t scratch_cuda_apply_bivariate_lut(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
(*mem_ptr)->broadcast_lut(active_streams);
POP_RANGE()
return size_tracker;
@@ -2336,8 +2342,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_streams =
streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
auto active_streams = streams.active_gpu_subset(
lwe_array_out->num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<InputTorus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

View File

@@ -388,7 +388,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
current_columns.next_accumulation(total_ciphertexts, total_messages,
needs_processing);
auto active_streams = streams.active_gpu_subset(total_ciphertexts);
auto active_streams =
streams.active_gpu_subset(total_ciphertexts, mem_ptr->params.pbs_type);
GPU_ASSERT(total_ciphertexts <= mem_ptr->luts_message_carry->num_blocks,
"SUM CT");
@@ -442,7 +443,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
streams.stream(0), streams.gpu_index(0), current_blocks,
num_radix_blocks, num_radix_blocks + 1);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);
auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks,
mem_ptr->params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(

View File

@@ -29,7 +29,8 @@ void host_integer_grouped_oprf(CudaStreams streams,
int_grouped_oprf_memory<Torus> *mem_ptr,
void *const *bsks) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process,
mem_ptr->params.pbs_type);
auto lut = mem_ptr->luts;
if (active_streams.count() == 1) {

View File

@@ -34,49 +34,49 @@ void cuda_rerand_64(
switch (rerand_buffer->params.big_lwe_dimension) {
case 256:
rerand_inplace<uint64_t, AmortizedDegree<256>>(
host_rerand_inplace<uint64_t, AmortizedDegree<256>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 512:
rerand_inplace<uint64_t, AmortizedDegree<512>>(
host_rerand_inplace<uint64_t, AmortizedDegree<512>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 1024:
rerand_inplace<uint64_t, AmortizedDegree<1024>>(
host_rerand_inplace<uint64_t, AmortizedDegree<1024>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 2048:
rerand_inplace<uint64_t, AmortizedDegree<2048>>(
host_rerand_inplace<uint64_t, AmortizedDegree<2048>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 4096:
rerand_inplace<uint64_t, AmortizedDegree<4096>>(
host_rerand_inplace<uint64_t, AmortizedDegree<4096>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 8192:
rerand_inplace<uint64_t, AmortizedDegree<8192>>(
host_rerand_inplace<uint64_t, AmortizedDegree<8192>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
(uint64_t **)(ksk), rerand_buffer);
break;
case 16384:
rerand_inplace<uint64_t, AmortizedDegree<16384>>(
host_rerand_inplace<uint64_t, AmortizedDegree<16384>>(
streams, static_cast<uint64_t *>(lwe_array),
static_cast<const uint64_t *>(
lwe_flattened_encryptions_of_zero_compact_array_in),
@@ -88,9 +88,6 @@ void cuda_rerand_64(
" in the interval [256..16384].");
break;
}
cuda_synchronize_stream(static_cast<cudaStream_t>(streams.streams[0]),
streams.gpu_indexes[0]);
}
void cleanup_cuda_rerand(CudaStreamsFFI streams, int8_t **mem_ptr_void) {

View File

@@ -10,7 +10,7 @@
#include "zk/zk_utilities.h"
template <typename Torus, class params>
void rerand_inplace(
void host_rerand_inplace(
CudaStreams const streams, Torus *lwe_array,
const Torus *lwe_flattened_encryptions_of_zero_compact_array_in,
Torus *const *ksk, int_rerand_mem<Torus> *mem_ptr) {
@@ -73,6 +73,7 @@ void rerand_inplace(
message_modulus, carry_modulus);
release_cpu_radix_ciphertext_async(&lwes_ffi);
release_cpu_radix_ciphertext_async(&ksed_zero_lwes_ffi);
compact_lwe_lists.release();
}
template <typename Torus>

View File

@@ -45,7 +45,8 @@ host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams = streams.active_gpu_subset(num_clear_blocks);
auto active_streams = streams.active_gpu_subset(
num_clear_blocks, mem_ptr->lut->params.pbs_type);
lut->broadcast_lut(active_streams, false);
integer_radix_apply_univariate_lookup_table<Torus>(

View File

@@ -146,7 +146,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -240,7 +240,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table<Torus>(
@@ -274,7 +274,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -419,7 +419,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
polynomial_size, message_modulus, carry_modulus,
scalar_bivariate_last_leaf_lut_f, true,
mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut->broadcast_lut(active_streams);
integer_radix_apply_bivariate_lookup_table<Torus>(
@@ -521,7 +521,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
signed_msb_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, true, mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_msb_lut->broadcast_lut(active_streams);
CudaRadixCiphertextFFI sign_block;
@@ -567,7 +567,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true,
mem_ptr->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
one_block_lut->broadcast_lut(active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
@@ -785,8 +785,8 @@ __host__ void host_scalar_equality_check(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
lsb_streams.gpu_index(0));
}
auto active_streams =
lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
auto active_streams = lsb_streams.active_gpu_subset(
num_halved_scalar_blocks, params.pbs_type);
// We use false because we will only broadcast the indexes
scalar_comparison_luts->broadcast_lut(active_streams, false);

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
@@ -321,8 +321,9 @@ __host__ void execute_cg_external_product_loop(
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t chunk_size = (uint32_t)(std::min(
lwe_chunk_size,
(uint64_t)(lwe_dimension / grouping_factor) - lwe_offset));
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
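The widening/narrowing cleanup above is mechanical: the loop walks the lwe_dimension / grouping_factor keybundle groups in chunks of lwe_chunk_size, clamping the final chunk, and chunk sizes stay far below 2^32 in practice, so the cast is lossless. The same arithmetic in Rust (illustrative):

// Illustrative: size of the chunk starting lwe_offset groups in.
fn clamped_chunk(lwe_chunk_size: u64, num_groups: u64, lwe_offset: u64) -> u32 {
    lwe_chunk_size.min(num_groups - lwe_offset) as u32
}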

View File

@@ -373,7 +373,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
double2 *join_buffer, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t iteration, uint64_t lwe_chunk_size,
uint32_t level_count, uint32_t iteration, uint32_t lwe_chunk_size,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t num_many_lut, uint32_t lut_stride) {
// We use shared memory for the polynomials that are used often during the
@@ -790,7 +790,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
uint32_t lut_stride) {
cuda_set_device(gpu_index);
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint32_t lwe_chunk_size = (uint32_t)(buffer->lwe_chunk_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
polynomial_size);

View File

@@ -626,7 +626,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
if (!cuda_check_support_thread_block_clusters())
return false;
uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(

View File

@@ -30,7 +30,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus *global_accumulator, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t lwe_chunk_size, uint64_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
bool support_dsm, uint32_t num_many_lut, uint32_t lut_stride) {
@@ -205,10 +205,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
uint64_t lwe_chunk_size, uint64_t keybundle_size_per_input,
uint32_t num_many_lut, uint32_t lut_stride) {
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
uint32_t lwe_dimension, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint64_t keybundle_size_per_input, uint32_t num_many_lut,
uint32_t lut_stride) {
constexpr uint32_t level_count = 1;
constexpr uint32_t grouping_factor = 4;
@@ -548,8 +548,9 @@ __host__ void execute_tbc_external_product_loop(
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
uint64_t chunk_size = std::min(
lwe_chunk_size, (uint64_t)(lwe_dimension / grouping_factor) - lwe_offset);
uint32_t chunk_size = (uint32_t)(std::min(
lwe_chunk_size,
(uint64_t)(lwe_dimension / grouping_factor) - lwe_offset));
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;
@@ -624,9 +625,9 @@ __host__ void execute_tbc_external_product_loop(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, lwe_offset, chunk_size,
keybundle_size_per_input, num_many_lut, lut_stride));
lwe_array_in, lwe_input_indexes, keybundle_fft, global_accumulator,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input,
num_many_lut, lut_stride));
} else {
check_cuda_error(cudaLaunchKernelEx(
&config,

View File

@@ -5,7 +5,8 @@
std::mutex m;
bool p2p_enabled = false;
const int THRESHOLD_MULTI_GPU = 12;
const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
// Enable bidirectional p2p access between all available GPUs and device_0_id
int32_t cuda_setup_multi_gpu(int device_0_id) {
@@ -39,10 +40,13 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
return (int32_t)(num_used_gpus);
}
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type) {
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
uint32_t ceil_div_inputs =
std::max((uint32_t)1,
(num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
return active_gpu_count;
}
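This hunk is the heart of the threshold refactor that all the active_gpu_subset call sites above feed into: the batching threshold now depends on the PBS flavor before a ceiling division picks the GPU count. A self-contained Rust restatement of the arithmetic (names illustrative, not part of the CUDA backend):

// Illustrative restatement of get_active_gpu_count.
const THRESHOLD_MULTI_BIT: u32 = 12;
const THRESHOLD_CLASSICAL: u32 = 68;

fn active_gpu_count(num_inputs: u32, gpu_count: u32, multi_bit: bool) -> u32 {
    let threshold = if multi_bit { THRESHOLD_MULTI_BIT } else { THRESHOLD_CLASSICAL };
    // ceiling division: one GPU per `threshold` inputs, at least one GPU
    let ceil_div_inputs = ((num_inputs + threshold - 1) / threshold).max(1);
    ceil_div_inputs.min(gpu_count)
}

fn main() {
    // 64 radix blocks with classical PBS stay on one GPU (64 < 68)...
    assert_eq!(active_gpu_count(64, 8, false), 1);
    // ...while the same workload with multi-bit PBS spreads over ceil(64/12) = 6 GPUs.
    assert_eq!(active_gpu_count(64, 8, true), 6);
}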

View File

@@ -59,15 +59,20 @@ template <typename Torus>
void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
uint32_t num_inputs, uint32_t lwe_size,
uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker_on_gpu_i = 0;
auto inputs_on_gpu = std::min(
(int)num_inputs,
std::max(THRESHOLD_MULTI_GPU,
std::max((int)threshold,
get_num_inputs_on_gpu(num_inputs, i, streams.count())));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),
@@ -81,7 +86,7 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
template void multi_gpu_alloc_lwe_async<__uint128_t>(
CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
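The same PBS-dependent threshold also floors the per-GPU allocations: each GPU provisions room for at least threshold inputs even when its fair share is smaller (presumably so downstream scatters have headroom; that rationale is an assumption, not stated in the diff). The sizing rule in Rust (illustrative):

// Illustrative: inputs provisioned on one GPU given its fair share.
fn inputs_on_gpu(num_inputs: u32, share: u32, threshold: u32) -> u32 {
    num_inputs.min(share.max(threshold))
}

fn main() {
    // 64 inputs over 8 GPUs gives a fair share of 8, but with the
    // multi-bit threshold of 12 each GPU still provisions 12 slots.
    assert_eq!(inputs_on_gpu(64, 8, 12), 12);
}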
/// Allocates the input/output vector for all devices
@@ -91,16 +96,21 @@ template <typename Torus>
void multi_gpu_alloc_lwe_many_lut_output_async(
CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
bool allocate_gpu_memory) {
PBS_TYPE pbs_type, bool allocate_gpu_memory) {
PANIC_IF_FALSE(dest.empty(),
"Cuda error: Requested multi-GPU vector is already allocated");
int threshold = (pbs_type == MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
dest.resize(streams.count());
for (uint i = 0; i < streams.count(); i++) {
uint64_t size_tracker = 0;
auto inputs_on_gpu = std::min(
(int)num_inputs,
std::max(THRESHOLD_MULTI_GPU,
std::max((int)threshold,
get_num_inputs_on_gpu(num_inputs, i, streams.count())));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),

View File

@@ -100,6 +100,7 @@ __host__ void host_expand_without_verification(
2 * num_lwes);
release_cpu_radix_ciphertext_async(&input);
release_cpu_radix_ciphertext_async(&output);
compact_lwe_lists.release();
}
template <typename Torus>

View File

@@ -113,6 +113,7 @@ pub fn iop_add_simd(prog: &mut Program) {
prog,
crate::asm::iop::SIMD_N,
fw_impl::llt::iop_add_ripple_rtl,
None,
);
}
@@ -227,14 +228,18 @@ pub fn iop_muls(prog: &mut Program) {
pub fn iop_erc_20(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string());
iop_erc_20_rtl(prog, 0).add_to_prog(prog);
// TODO: Sweep kogge_blk_w
// All these little parameters would make it easy to write an
// exploration/compilation program that tries to minimize latency by
// tuning them.
iop_erc_20_rtl(prog, 0, Some(10)).add_to_prog(prog);
}
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_simd(prog: &mut Program) {
// Add Comment header
prog.push_comment("ERC_20_SIMD (new_from, new_to) <- (from, to, amount)".to_string());
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl);
simd(prog, crate::asm::iop::SIMD_N, fw_impl::llt::iop_erc_20_rtl, None);
}
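Why the block width matters: ripple carry resolves carries strictly sequentially, while a Kogge-Stone tree over blocks of width blk_w shortens the critical path to roughly blk_w plus a log-depth tree across blocks. A toy depth model (purely illustrative, not the firmware's cost function):

// Illustrative depth model for an n-digit addition.
fn ripple_depth(n: usize) -> usize {
    n
}

fn block_kogge_depth(n: usize, blk_w: usize) -> usize {
    let blocks = n.div_ceil(blk_w);
    // in-block ripple plus a log2-depth carry tree across blocks
    blk_w + blocks.next_power_of_two().trailing_zeros() as usize
}

fn main() {
    // For 64 digits, blk_w = 10 (the value ERC_20 now passes) trades
    // ~64 sequential carry steps for ~10 + 3.
    assert_eq!(ripple_depth(64), 64);
    assert_eq!(block_kogge_depth(64, 10), 13);
}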
#[instrument(level = "trace", skip(prog))]
@@ -381,7 +386,7 @@ pub fn iop_rotate_scalar_left(prog: &mut Program) {
/// (dst_from[0], dst_to[0], ..., dst_from[N-1], dst_to[N-1])
/// Where N is the batch size
#[instrument(level = "trace", skip(prog))]
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8, kogge_blk_w: Option<usize>) -> Rtl {
// Allocate metavariables:
// Dest -> Operand
let dst_from = prog.iop_template_var(OperandKind::Dst, 2 * batch_index);
@@ -392,13 +397,6 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
// Src Amount -> Operand
let src_amount = prog.iop_template_var(OperandKind::Src, 3 * batch_index + 2);
// TODO: Make this a parameter or sweep this
// All these little parameters would be very handy to write an
// exploration/compilation program which would try to minimize latency by
// playing with these.
let kogge_blk_w = 10;
let ripple = true;
{
let props = prog.params();
let tfhe_params: asm::DigitParameters = props.clone().into();
@@ -429,19 +427,20 @@ pub fn iop_erc_20_rtl(prog: &mut Program, batch_index: u8) -> Rtl {
})
.collect::<Vec<_>>();
if ripple {
if let Some(blk_w) = kogge_blk_w {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, blk_w)
} else { // Default to ripple carry
kogge::ripple_add(dst_to, src_to, src_amount.clone(), None)
+ kogge::ripple_sub(prog, dst_from, src_from, src_amount)
} else {
kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w)
+ kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w)
}
}
}
/// A SIMD implementation of add for maximum throughput
/// NB: No use of kogge_blk_w here; the impl is forced to use ripple carry
#[instrument(level = "trace", skip(prog))]
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8) -> Rtl {
pub fn iop_add_ripple_rtl(prog: &mut Program, i: u8, _kogge_blk_w: Option<usize>) -> Rtl {
// Allocate metavariables:
let dst = prog.iop_template_var(OperandKind::Dst, i);
let src_a = prog.iop_template_var(OperandKind::Src, 2 * i);
@@ -899,13 +898,13 @@ fn bw_inv(prog: &mut Program, b: Vec<VarCell>) -> Vec<VarCell> {
/// Maybe this should go into a SIMD firmware implementation... At some point we
/// would need a mechanism to choose between implementations on the fly to make
really good use of all of this.
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F)
fn simd<F>(prog: &mut Program, batch_size: usize, rtl_closure: F, kogge_blk_w: Option<usize>)
where
F: Fn(&mut Program, u8) -> Rtl,
F: Fn(&mut Program, u8, Option<usize>) -> Rtl,
{
(0..batch_size)
.map(|i| i as u8)
.map(|i| rtl_closure(prog, i))
.map(|i| rtl_closure(prog, i, kogge_blk_w))
.sum::<Rtl>()
.add_to_prog(prog);
}

View File

@@ -32,10 +32,9 @@ fi
RUSTFLAGS="$RUSTFLAGS" cargo nextest list --cargo-profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe &> /tmp/test_list.txt
# Filter the tests to get only the HL and a subset of core crypto ones
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*|core_crypto::.*gpu.*' | grep -v 'array' | grep -v 'modulus_switch' | grep -v '3_3' | grep -v 'noise_distribution' | grep -v 'flip')
if [[ "${RUN_VALGRIND}" == "1" ]]; then
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*' | grep -v 'array' | grep -v 'flip')
# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo test --no-run --profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu-debug,zk-pok -p tfhe
@@ -57,6 +56,7 @@ if [[ "${RUN_VALGRIND}" == "1" ]]; then
fi
if [[ "${RUN_COMPUTE_SANITIZER}" == "1" ]]; then
TESTS_TO_RUN=$(sed -e $'s/\x1b\[[0-9;]*m//g' < /tmp/test_list.txt | grep -E 'high_level_api::.*gpu.*|core_crypto::.*gpu.*' | grep -v 'array' | grep -v 'modulus_switch' | grep -v '3_3' | grep -v 'noise_distribution' | grep -v 'flip')
# Build the tests but don't run them
RUSTFLAGS="$RUSTFLAGS" cargo test --no-run --profile "${CARGO_PROFILE}" \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe

View File

@@ -28,13 +28,12 @@ pub fn transfer_whitepaper<FheType>(
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + FheTrivialEncrypt<u64>,
FheBool: IfThenElse<FheType>,
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenZero<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let zero_amount = FheType::encrypt_trivial(0u64);
let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
let amount_to_transfer = has_enough_funds.if_then_zero(amount);
let new_to_amount = to_amount + &amount_to_transfer;
let new_from_amount = from_amount - &amount_to_transfer;
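A plaintext analogue makes the semantics of the rewrite explicit: the trivial zero and the full select collapse into one conditional zeroing (illustrative only; the ordering of the returned pair is not meant to mirror the library's):

// Plaintext analogue of the whitepaper transfer after this change.
fn transfer(from: u64, to: u64, amount: u64) -> (u64, u64) {
    let has_enough_funds = from >= amount;
    // if_then_zero: keep `amount` when the condition holds, else zero
    let amount_to_transfer = if has_enough_funds { amount } else { 0 };
    (to + amount_to_transfer, from - amount_to_transfer)
}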
@@ -51,12 +50,13 @@ pub fn par_transfer_whitepaper<FheType>(
where
FheType:
Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Send + Sync + FheTrivialEncrypt<u64>,
FheBool: IfThenElse<FheType>,
FheBool: IfThenZero<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let zero_amount = FheType::encrypt_trivial(0u64);
let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
//let zero_amount = FheType::encrypt_trivial(0u64);
//let amount_to_transfer = has_enough_funds.select(amount, &zero_amount);
let amount_to_transfer = has_enough_funds.if_then_zero(amount);
let (new_to_amount, new_from_amount) = rayon::join(
|| to_amount + &amount_to_transfer,

View File

@@ -33,8 +33,8 @@ impl ProofConfig {
fn default_proof_config() -> Vec<ProofConfig> {
vec![
ProofConfig::new(64usize, &[64usize]),
ProofConfig::new(2048, &[4 * 64, 10 * 64, 2048]),
ProofConfig::new(64, &[64]),
ProofConfig::new(2048, &[64, 4 * 64, 2048]),
ProofConfig::new(4096, &[4096]),
]
}

View File

@@ -7,7 +7,7 @@ use crate::high_level_api::integers::{FheInt, FheIntId, FheUint, FheUintId};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::re_randomization::ReRandomizationMetadata;
use crate::high_level_api::traits::{
FheEq, Flip, IfThenElse, ReRandomize, ScalarIfThenElse, Tagged,
FheEq, Flip, IfThenElse, IfThenZero, ReRandomize, ScalarIfThenElse, Tagged,
};
use crate::high_level_api::{global_state, CompactPublicKey};
use crate::integer::block_decomposition::DecomposableInto;
@@ -552,6 +552,66 @@ where
}
}
impl<Id> IfThenZero<FheUint<Id>> for FheBool
where
Id: FheUintId,
{
/// Conditional selection.
///
/// The output value returned depends on the value of `self`.
///
/// - if `self` is true, the output will have the value of `ct_then`
/// - if `self` is false, the output will be an encryption of 0
fn if_then_zero(&self, ct_then: &FheUint<Id>) -> FheUint<Id> {
global_state::with_internal_keys(|sks| match sks {
InternalServerKey::Cpu(cpu_sks) => {
let ct_condition = self;
let mut ct_out = ct_then.ciphertext.on_cpu().clone();
cpu_sks.pbs_key().zero_out_if_condition_is_false(
&mut ct_out,
&ct_condition.ciphertext.on_cpu().0,
);
FheUint::new(
ct_out,
cpu_sks.tag.clone(),
ReRandomizationMetadata::default(),
)
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
panic!("Cuda does not support if_then_zero")
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(device) => {
let hpu_then = ct_then.ciphertext.on_hpu(device);
let hpu_cond = self.ciphertext.on_hpu(device);
let (opcode, proto) = {
let asm_iop = &hpu_asm::iop::IOP_IF_THEN_ZERO;
(
asm_iop.opcode(),
&asm_iop.format().expect("Unspecified IOP format").proto,
)
};
// These clones are cheap as they are just Arc
let hpu_result = HpuRadixCiphertext::exec(
proto,
opcode,
&[hpu_then.clone(), hpu_cond.clone()],
&[],
)
.pop()
.unwrap();
FheUint::new(
hpu_result,
device.tag.clone(),
ReRandomizationMetadata::default(),
)
}
})
}
}
impl<Id: FheIntId> IfThenElse<FheInt<Id>> for FheBool {
/// Conditional selection.
///

View File

@@ -11,6 +11,7 @@ pub use crate::high_level_api::traits::{
FheOrd, FheTrivialEncrypt, FheTryEncrypt, FheTryTrivialEncrypt, FheWait, Flip, IfThenElse,
OverflowingAdd, OverflowingMul, OverflowingNeg, OverflowingSub, ReRandomize, RotateLeft,
RotateLeftAssign, RotateRight, RotateRightAssign, ScalarIfThenElse, SquashNoise, Tagged,
IfThenZero,
};
#[cfg(feature = "hpu")]
pub use crate::high_level_api::traits::{FheHpu, HpuHandle};

View File

@@ -149,6 +149,10 @@ pub trait IfThenElse<Ciphertext> {
}
}
pub trait IfThenZero<Ciphertext> {
fn if_then_zero(&self, ct_then: &Ciphertext) -> Ciphertext;
}
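A minimal usage sketch, assuming the prelude re-export added above and an already-configured server key (per the high-level API impl earlier in this diff, only the Cpu and Hpu backends accept the call; the Cuda path panics):

use tfhe::prelude::*;
use tfhe::{FheBool, FheUint64};

// Returns `value` when `cond` is true, an encryption of zero otherwise,
// with no trivial zero ciphertext needed as with `select`.
fn gate(cond: &FheBool, value: &FheUint64) -> FheUint64 {
    cond.if_then_zero(value)
}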
pub trait ScalarIfThenElse<Lhs, Rhs> {
type Output;

View File

@@ -656,18 +656,6 @@ async function compactPublicKeyZeroKnowledgeBench() {
ShortintCompactPublicKeyEncryptionParametersName.PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
},
{
zk_scheme: "ZKV1",
name: shortint_params_name(
ShortintParametersName.PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
block_params: new ShortintParameters(
ShortintParametersName.PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
casting_params: new ShortintCompactPublicKeyEncryptionParameters(
ShortintCompactPublicKeyEncryptionParametersName.V1_5_PARAM_PKE_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128_ZKV1,
),
},
];
let bench_results = {};
@@ -695,8 +683,8 @@ async function compactPublicKeyZeroKnowledgeBench() {
// Proof configuration:
let proof_configs = [
{ crs_bit_size: 64, bits_to_encrypt: [64] },
{ crs_bit_size: 640, bits_to_encrypt: [640] },
{ crs_bit_size: 2048, bits_to_encrypt: [2048, 64 * 4] }, // 64 * 4 is a production use-case
// 64 * 4 is a production use-case
{ crs_bit_size: 2048, bits_to_encrypt: [64, 4 * 64, 2048] },
{ crs_bit_size: 4096, bits_to_encrypt: [4096] },
];