Compare commits

..

1 Commits

Author SHA1 Message Date
Guillermo Oyarzun
d4a489e9b0 fix(gpu): handling temporary events destruction 2026-01-26 16:57:49 +01:00
284 changed files with 5809 additions and 15226 deletions

15
.github/runs-on.yml vendored
View File

@@ -1,15 +0,0 @@
# RunsOn configuration: custom runner profiles and the image they boot from.
runners:
  # Large CPU runner profile.
  cpu-big:
    family: m6i.32xlarge
    image: cpu-tests-eu-west-3
    volume: 200gb
    spot: false
  # Small CPU runner profile; differs from cpu-big only by instance family.
  cpu-small:
    family: m6i.4xlarge
    image: cpu-tests-eu-west-3
    volume: 200gb
    spot: false
images:
  # Image referenced by both runner profiles above.
  cpu-tests-eu-west-3:
    ami: "ami-0a786ffdb1411fac4"  # Ubuntu 24.04

View File

@@ -34,9 +34,6 @@ permissions:
jobs:
setup-instance:
name: aws_tfhe_backward_compat_tests/setup-instance
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name != 'push'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -44,7 +41,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -69,7 +66,7 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}
@@ -144,7 +141,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -63,7 +63,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -171,7 +171,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -299,7 +299,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -86,7 +86,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -168,7 +168,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -100,7 +100,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -51,7 +51,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -172,7 +172,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -72,7 +72,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -155,7 +155,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -182,7 +182,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -279,7 +279,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -147,7 +147,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -16,12 +16,10 @@ on:
- integer_zk
- shortint
- shortint_oprf
- hlapi_unsigned
- hlapi_signed
- hlapi
- hlapi_erc20
- hlapi_dex
- hlapi_noise_squash
- hlapi_kvstore
- tfhe_zk_pok
- boolean
- pbs

View File

@@ -126,7 +126,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -149,7 +149,7 @@ jobs:
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -229,7 +229,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -261,7 +261,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -24,7 +24,6 @@ permissions: {}
jobs:
prepare-inputs:
name: benchmark_cpu_weekly/prepare-inputs
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
outputs:
is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -73,7 +72,8 @@ jobs:
run-benchmarks-integer:
name: benchmark_cpu_weekly/run-benchmarks-integer
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& (needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -92,7 +92,8 @@ jobs:
run-benchmarks-integer-zk-pke:
name: benchmark_cpu_weekly/run-benchmarks-integer-zk-pke
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -110,7 +111,8 @@ jobs:
run-benchmarks-hlapi-erc20:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-erc20
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -128,7 +130,8 @@ jobs:
run-benchmarks-hlapi-dex:
name: benchmark_cpu_weekly/run-benchmarks-hlapi-dex
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -146,7 +149,8 @@ jobs:
run-benchmarks-core-crypto:
name: benchmark_cpu_weekly/run-benchmarks-core-crypto
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -163,7 +167,8 @@ jobs:
run-benchmarks-shortint:
name: benchmark_cpu_weekly/run-benchmarks-shortint
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& (needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true' || needs.prepare-inputs.outputs.is_quarterly_bench == 'true')
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -181,7 +186,8 @@ jobs:
run-benchmarks-boolean:
name: benchmark_cpu_weekly/run-benchmarks-boolean
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:
@@ -200,7 +206,8 @@ jobs:
run-benchmarks-tfhe-zk-pok:
name: benchmark_cpu_weekly/run-benchmarks-tfhe-zk-pok
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs'
&& needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_cpu_common.yml
with:

View File

@@ -33,7 +33,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -49,7 +49,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +105,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -137,7 +137,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,6 +25,10 @@ on:
description: "Generate SVG tables"
type: boolean
default: true
open-pr:
description: "Open a PR with the benchmark results"
type: boolean
default: false
permissions: {}
@@ -162,3 +166,54 @@ jobs:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
# Opens a pull request that refreshes the documentation SVG tables.
# Runs only when the `open-pr` input is set and at least one of the two
# SVG-generation jobs finished successfully.
open-pr:
  name: benchmark-documentation/open-pr
  needs: [generate-svgs-with-benchmarks-run, generate-svgs-without-benchmarks-run]
  # always() keeps this job from being skipped automatically when one of the
  # needed jobs did not succeed; the result checks then require one success.
  if: ${{ always() && inputs.open-pr &&
    (needs.generate-svgs-with-benchmarks-run.result == 'success' || needs.generate-svgs-without-benchmarks-run.result == 'success') }}
  runs-on: ubuntu-latest
  permissions:
    contents: write  # Needed to create a commit
    pull-requests: write  # Needed to open a pull-request
  env:
    PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
  steps:
    - name: Checkout tfhe-rs
      uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
      with:
        persist-credentials: 'false'

    - name: Download SVG tables
      uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
      with:
        path: svg_tables
        merge-multiple: 'true'

    # Perform best effort to copy SVG tables. If the copy fails or files don't exist, the PR will still be created.
    # `|| true` is required for that: the default `run` shell is `bash -e`, which
    # would otherwise abort the step (and the job) on the first failing `cp`.
    - name: Copy SVG tables to documentation location
      run: |
        cp -f svg_tables/*integer-benchmark*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null || true
        cp -f svg_tables/*pbs-benchmark-tuniform*.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null || true
        cp -f svg_tables/cpu-gpu-hpu-integer-benchmark-fheuint64-tuniform-2m128-ciphertext.svg "${PATH_TO_DOC_ASSETS}" 2>/dev/null || true

    # Timestamp used to make the PR branch name unique.
    # %y is the calendar year; %g (ISO week-based year) differs around New Year.
    - name: Get current date
      id: get-date
      run: |
        echo "date=$(date '+%y_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"

    - name: Create pull-request
      uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
      with:
        sign-commits: true  # Commit will be signed by github-actions bot
        # Only SVG files in the documentation assets directory are committed.
        add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg
        branch: gh-bot/docs/update-svg-tables-${{ steps.get-date.outputs.date }}
        commit-message: |
          chore(docs): update benchmark results for all backends

          Automated documentation update from tfhe-rs CI pipeline.
        title: |
          [CI] chore(docs): update benchmark results for all backends
        body: |
          Documentation update triggered by GitHub workflow.
        labels: documentation

View File

@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -134,7 +134,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -175,7 +175,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -287,7 +287,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -324,7 +324,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -50,8 +50,6 @@ env:
jobs:
parse-inputs:
name: benchmark_gpu_coprocessor/parse-inputs
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
contents: 'read'
@@ -94,7 +92,7 @@ jobs:
steps:
- name: Start remote instance
id: start-remote-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -132,7 +130,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
path: tfhe-rs
persist-credentials: false
@@ -143,7 +141,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -194,7 +192,7 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
uses: foundry-rs/foundry-toolchain@8789b3e21e6c11b2697f5eb56eddae542f746c10
uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
- name: Cache cargo
uses: actions/cache@8b402f58fbc84540c8b491a91e594a4576fec3d7 # v5.0.2
@@ -301,7 +299,7 @@ jobs:
# Results file produced inside the fhevm checkout (no literal `$` in the path).
path: fhevm/${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -326,7 +324,7 @@ jobs:
steps:
- name: Stop remote instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -25,7 +25,6 @@ permissions: {}
jobs:
prepare-inputs:
# Job display name follows the <workflow>/<job> convention of this GPU workflow.
name: benchmark_gpu_weekly/prepare-inputs
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
outputs:
is_weekly_bench_group_1: ${{ steps.check_bench_group_1.outputs.is_weekly_bench_group_1 }}
@@ -50,7 +49,8 @@ jobs:
run-benchmarks-8-h100-sxm5-integer:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -72,7 +72,8 @@ jobs:
run-benchmarks-8-h100-sxm5-integer-compression:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-compression
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -94,7 +95,8 @@ jobs:
run-benchmarks-8-h100-sxm5-integer-zk-aes:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-integer-zk-aes
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -116,7 +118,8 @@ jobs:
run-benchmarks-8-h100-sxm5-noise-squash:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-sxm5-noise-squash
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -138,7 +141,8 @@ jobs:
run-benchmarks-1-h100-core-crypto:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-core-crypto (1xH100)
if: needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_1 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -162,7 +166,8 @@ jobs:
run-benchmarks-1-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-erc20
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -182,7 +187,8 @@ jobs:
run-benchmarks-2-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-2-h100-erc20
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -202,7 +208,8 @@ jobs:
run-benchmarks-8-h100-erc20:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-erc20
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -226,7 +233,8 @@ jobs:
run-benchmarks-1-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-1-h100-dex
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -246,7 +254,8 @@ jobs:
run-benchmarks-2-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-2-h100-dex
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:
@@ -266,7 +275,8 @@ jobs:
run-benchmarks-8-h100-dex:
name: benchmark_gpu_weekly/run-benchmarks-8-h100-dex
if: needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
if: github.repository == 'zama-ai/tfhe-rs' &&
needs.prepare-inputs.outputs.is_weekly_bench_group_2 == 'true'
needs: prepare-inputs
uses: ./.github/workflows/benchmark_gpu_common.yml
with:

View File

@@ -12,8 +12,7 @@ on:
default: integer
options:
- integer
- hlapi_unsigned
- hlapi_signed
- hlapi
- hlapi_erc20
op_flavor:
description: "Operations set to run"

View File

@@ -126,7 +126,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -191,7 +191,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab

View File

@@ -50,7 +50,7 @@ jobs:
pull-requests: write # Needed to write a comment in a pull-request
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -143,7 +143,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0 # Needed to get commit hash
persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -305,13 +305,13 @@ jobs:
REF_NAME: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Install recent Python
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
with:
python-version: '3.12'
pip-install: -r ci/data_extractor/requirements.txt -r ci/perf_regression/requirements.txt
@@ -383,7 +383,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,126 +0,0 @@
# Run all benchmarks displayed in the internal documentation.
# Manually triggered only; each backend can be toggled with a boolean input.
name: benchmark_summary
run-name: Benchmark Summary

on:
  workflow_dispatch:
    inputs:
      run-cpu-benchmarks:
        description: "Run CPU benchmarks"
        type: boolean
        default: true
      run-gpu-benchmarks:
        description: "Run GPU benchmarks"
        type: boolean
        default: true
      # Each option is "<profile> (<hardware name>)"; the parse-gpu-inputs job
      # splits it into its two parts.
      gpu-profile:
        description: "GPU Instance type"
        required: true
        default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
        type: choice
        options:
          - "l40 (n3-L40x1)"
          - "4-l40 (n3-L40x4)"
          - "multi-a100-nvlink (n3-A100x8-NVLink)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100-SXM5x8)"
      # Currently unused: the HPU job at the bottom of this file is commented out.
      run-hpu-benchmarks:
        description: "Run HPU benchmarks"
        type: boolean
        default: true

permissions: {}

# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
  # Splits the selected gpu-profile option into the profile and the hardware name.
  parse-gpu-inputs:
    name: benchmark_summary/parse-gpu-inputs
    if: inputs.run-gpu-benchmarks
    runs-on: ubuntu-latest
    outputs:
      profile: ${{ steps.parse_profile.outputs.profile }}
      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    env:
      INPUTS_PROFILE: ${{ inputs.gpu-profile }}
    steps:
      # Keeps the text before the parenthesised part, e.g. "multi-h100-sxm5".
      - name: Parse profile
        id: parse_profile
        run: |
          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
          # shellcheck disable=SC2001
          PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
          echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"

      # Keeps the text inside the parentheses, e.g. "n3-H100-SXM5x8".
      - name: Parse hardware name
        id: parse_hardware_name
        run: |
          # Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
          # shellcheck disable=SC2001
          NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
          echo "name=${NAME}" >> "${GITHUB_OUTPUT}"

  # Calls the shared CPU benchmark workflow with the `summary` command.
  run-benchmarks-cpu:
    name: benchmark_summary/run-benchmarks-cpu
    uses: ./.github/workflows/benchmark_cpu_common.yml
    if: inputs.run-cpu-benchmarks
    with:
      command: summary
      bench_type: both
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  # Calls the shared GPU benchmark workflow on the instance chosen via gpu-profile.
  run-benchmarks-gpu:
    name: benchmark_summary/run-benchmarks-gpu
    uses: ./.github/workflows/benchmark_gpu_common.yml
    if: inputs.run-gpu-benchmarks
    needs: parse-gpu-inputs
    with:
      profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
      hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
      command: summary
      bench_type: both
      params_type: classical + multi_bit
    secrets:
      BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
      SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
      REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
      JOB_SECRET: ${{ secrets.JOB_SECRET }}
      SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
      SLAB_URL: ${{ secrets.SLAB_URL }}
      SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}

  # TODO add make recipe for HPU benchmarks
  # run-benchmarks-hpu:
  #   name: benchmark_summary/run-benchmarks-hpu
  #   uses: ./.github/workflows/benchmark_hpu_common.yml
  #   if: inputs.run-hpu-benchmarks
  #   with:
  #     command: summary
  #     bench_type: both
  #     v80_pcie_dev: 24
  #     v80_serial_number: XFL12NWY3ZKG
  #   secrets:
  #     BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
  #     SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  #     SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
  #     REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
  #     JOB_SECRET: ${{ secrets.JOB_SECRET }}
  #     SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
  #     SLAB_URL: ${{ secrets.SLAB_URL }}
  #     SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
  #     SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}

View File

@@ -31,16 +31,13 @@ permissions: {}
jobs:
setup-instance:
name: benchmark_tfhe_fft/setup-instance
if:
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -137,7 +134,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -31,16 +31,13 @@ permissions: {}
jobs:
setup-instance:
name: benchmark_tfhe_ntt/setup-instance
if:
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -58,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -105,7 +102,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -137,7 +134,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -31,14 +31,15 @@ jobs:
name: benchmark_wasm_client/should-run
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name != 'workflow_dispatch' && github.repository == 'zama-ai/tfhe-rs')
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs') ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
permissions:
pull-requests: read # Needed to check for file change
outputs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -70,7 +71,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,7 +91,7 @@ jobs:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -180,7 +181,7 @@ jobs:
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: zama-ai/slab
path: slab
@@ -212,7 +213,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -24,11 +24,9 @@ permissions: {}
jobs:
audit:
name: cargo_audit/audit
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
outputs:
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -80,7 +80,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -140,7 +140,7 @@ jobs:
result: ${{ steps.set_builds_result.outputs.result }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -242,7 +242,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -2,7 +2,6 @@
name: cargo_test_fft
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -23,8 +22,6 @@ permissions:
jobs:
should-run:
name: cargo_test_fft/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -32,7 +29,7 @@ jobs:
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -59,7 +56,7 @@ jobs:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -95,7 +92,7 @@ jobs:
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -2,7 +2,6 @@
name: cargo_test_ntt
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -25,8 +24,6 @@ permissions:
jobs:
should-run:
name: cargo_test_ntt/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -34,7 +31,7 @@ jobs:
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: "false"
@@ -63,7 +60,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -90,7 +87,7 @@ jobs:
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}
@@ -146,7 +143,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -43,7 +43,7 @@ jobs:
echo "version=$(make zizmor_version)" >> "${GITHUB_OUTPUT}"
- name: Check workflows security
uses: zizmorcore/zizmor-action@135698455da5c3b3e55f73f4419e481ab68cdd95 # v0.4.1
uses: zizmorcore/zizmor-action@e639db99335bc9038abc0e066dfcd72e23d26fb4 # v0.3.0
with:
advanced-security: 'false' # Print results directly in logs
persona: pedantic

View File

@@ -31,7 +31,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -130,7 +130,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -37,7 +37,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -93,7 +93,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'

View File

@@ -19,8 +19,8 @@ on:
pull_request:
types: [ labeled ]
schedule:
# Every other day at 1AM
- cron: "0 1 */2 * *"
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read
@@ -37,11 +37,11 @@ jobs:
group: ${{ github.workflow_ref }}
cancel-in-progress: true
runs-on: ["self-hosted", "4090-desktop"]
timeout-minutes: 2880 # 48 hours
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -23,8 +23,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# every friday noon
- cron: "0 12 * * 5"
# every month
- cron: "0 0 1 * *"
permissions:
contents: read
@@ -35,15 +35,15 @@ jobs:
setup-instance:
name: gpu_code_validation_tests/setup-instance
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
if: github.event_name != 'pull_request' ||
(github.event.action == 'labeled' && github.event.label.name == 'approved')
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -79,7 +79,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,5 +1,5 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: gpu_core_h100_tests
name: gpu_fast_h100_tests
env:
CARGO_TERM_COLOR: always
@@ -32,7 +32,7 @@ permissions:
jobs:
should-run:
name: gpu_core_h100_tests/should-run
name: gpu_fast_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -61,14 +61,15 @@ jobs:
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_core_h100_tests.yml'
- '.github/workflows/gpu_fast_h100_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml
setup-instance:
name: gpu_core_h100_tests/setup-instance
name: gpu_fast_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
@@ -86,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -110,7 +111,7 @@ jobs:
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: gpu_core_h100_tests/cuda-tests-linux
name: gpu_fast_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
@@ -128,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,8 +155,20 @@ jobs:
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: gpu_core_h100_tests/slack-notify
name: gpu_fast_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
@@ -174,10 +187,10 @@ jobs:
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "Core H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
SLACK_MESSAGE: "Fast H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: gpu_core_h100_tests/teardown-instance
name: gpu_fast_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
@@ -185,7 +198,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -79,7 +79,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -114,7 +114,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -151,7 +151,7 @@ jobs:
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu_fast
make test_high_level_api_gpu
slack-notify:
name: gpu_fast_tests/slack-notify
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -35,7 +35,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -68,7 +68,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -124,7 +124,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -154,7 +154,7 @@ jobs:
- name: Run High Level API Tests
run: |
make test_high_level_api_gpu_fast
make test_high_level_api_gpu
slack-notify:
name: gpu_full_multi_gpu_tests/slack-notify
@@ -187,7 +187,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,209 +0,0 @@
# Compile and test tfhe-cuda-backend on an H100 VM on hyperstack
name: gpu_hlapi_h100_tests
env:
CARGO_TERM_COLOR: always
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUSTFLAGS: "-C target-cpu=native"
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACKIFY_MARKDOWN: true
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
PULL_REQUEST_MD_LINK: ""
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
# Secrets will be available only to zama-ai organization members
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
pull_request:
types: [ labeled ]
permissions:
contents: read
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
jobs:
should-run:
name: gpu_hlapi_h100_tests/should-run
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
outputs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
with:
files_yaml: |
gpu:
- tfhe/Cargo.toml
- tfhe/build.rs
- backends/tfhe-cuda-backend/**
- tfhe/src/core_crypto/gpu/**
- tfhe/src/integer/gpu/**
- tfhe/src/integer/server_key/radix_parallel/tests_unsigned/**
- tfhe/src/integer/server_key/radix_parallel/tests_signed/**
- tfhe/src/integer/server_key/radix_parallel/tests_cases_unsigned.rs
- tfhe/src/shortint/parameters/**
- tfhe/src/high_level_api/**
- tfhe/src/c_api/**
- 'tfhe/docs/**/**.md'
- '.github/workflows/gpu_hlapi_h100_tests.yml'
- scripts/integer-tests.sh
- ci/slab.toml
setup-instance:
name: gpu_hlapi_h100_tests/setup-instance
needs: should-run
if: github.event_name != 'pull_request' ||
(github.event.action != 'labeled' && needs.should-run.outputs.gpu_test == 'true') ||
(github.event.action == 'labeled' && github.event.label.name == 'approved' && needs.should-run.outputs.gpu_test == 'true')
runs-on: ubuntu-latest
outputs:
# Use permanent remote instance label first as on-demand remote instance label output is set before the end of start-remote-instance step.
# If the latter fails due to a failed GitHub action runner set up, we have to fallback on the permanent instance.
# Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
# otherwise we'll try to run the next job on a non-existing on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
# This will allow to fallback on permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' && steps.start-remote-instance.outcome == 'failure'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
id: start-github-instance
if: env.SECRETS_AVAILABLE == 'false'
run: |
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
cuda-tests-linux:
name: gpu_hlapi_h100_tests/cuda-tests-linux
needs: [ should-run, setup-instance ]
if: github.event_name != 'pull_request' ||
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# explicit include-based build matrix, of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.8"
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
- name: Install latest stable
uses: dtolnay/rust-toolchain@e97e2d8cc328f1b50210efc529dca0028893a2d9 # zizmor: ignore[stale-action-refs] this action doesn't create releases
with:
toolchain: stable
- name: Enable nvidia multi-process service
run: |
nvidia-cuda-mps-control -d
- name: Run user docs tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_user_doc_gpu
- name: Test C API
run: |
BIG_TESTS_INSTANCE=TRUE make test_c_api_gpu
- name: Run High Level API Tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_high_level_api_gpu
slack-notify:
name: gpu_hlapi_h100_tests/slack-notify
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Set pull-request URL
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
run: |
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
env:
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Send message
if: env.SECRETS_AVAILABLE == 'true'
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
SLACK_MESSAGE: "HL API H100 tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
teardown-instance:
name: gpu_hlapi_h100_tests/teardown-instance
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-tests-linux ]
runs-on: ubuntu-latest
steps:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-h100-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

View File

@@ -38,7 +38,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -65,7 +65,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -112,7 +112,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -42,7 +42,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -134,7 +134,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -74,7 +74,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -159,7 +159,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -81,7 +81,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -170,7 +170,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -87,7 +87,7 @@ jobs:
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
continue-on-error: true
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -184,7 +184,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -82,7 +82,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -179,7 +179,7 @@ jobs:
- name: Stop instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -2,7 +2,6 @@
name: hpu_hlapi_tests
on:
workflow_dispatch:
pull_request:
push:
branches:
@@ -26,8 +25,6 @@ permissions: {}
jobs:
should-run:
name: hpu_hlapi_tests/should-run
if: github.event_name != 'push' ||
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs')
runs-on: ubuntu-latest
permissions:
pull-requests: read # Needed to check for file change
@@ -35,7 +32,7 @@ jobs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -65,7 +62,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -86,7 +83,7 @@ jobs:
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -117,7 +114,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -34,7 +34,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -53,7 +53,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -83,7 +83,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -52,7 +52,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -75,7 +75,6 @@ jobs:
name: make_release_common/provenance
if: ${{ !inputs.dry-run }}
needs: package
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
actions: read # Needed to detect the GitHub Actions environment
@@ -94,7 +93,7 @@ jobs:
id-token: write # Needed for OIDC token exchange on crates.io
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'

View File

@@ -37,7 +37,7 @@ jobs:
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -64,7 +64,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: "false"
@@ -117,7 +117,6 @@ jobs:
name: make_release_cuda/provenance
if: ${{ !inputs.dry_run }}
needs: [package]
# This action cannot be pinned to a specific commit (see https://github.com/slsa-framework/slsa-github-generator/blob/main/README.md#referencing-slsa-builders-and-generators)
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
actions: read # Needed to detect the GitHub Actions environment
@@ -222,7 +221,7 @@ jobs:
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -92,7 +92,7 @@ jobs:
- name: Publish web package
if: ${{ inputs.push_web_package }}
uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
with:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}
@@ -109,7 +109,7 @@ jobs:
- name: Publish Node package
if: ${{ inputs.push_node_package }}
uses: JS-DevTools/npm-publish@4ce4bd0f334d5316473155078da1955d42148494
uses: JS-DevTools/npm-publish@d2fef917d9aa6e1f0ee5eac28ed023eb4921ce51
with:
package: tfhe/pkg/package.json
dry-run: ${{ inputs.dry_run }}

View File

@@ -30,7 +30,7 @@ jobs:
name: parameters_check/setup-instance
if:
(github.event_name == 'push' && github.repository == 'zama-ai/tfhe-rs') ||
github.event_name != 'push'
github.event_name == 'workflow_dispatch'
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
@@ -38,7 +38,7 @@ jobs:
- name: Start remote instance
id: start-remote-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
@@ -60,7 +60,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
toolchain: stable
- name: Checkout lattice-estimator
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
with:
repository: malb/lattice-estimator
path: lattice_estimator
@@ -137,7 +137,7 @@ jobs:
- name: Stop remote instance
id: stop-instance
if: env.SECRETS_AVAILABLE == 'true'
uses: zama-ai/slab-github-runner@d4580322fc216877c48ac2987df9573ffd03476c # v1.5.0
uses: zama-ai/slab-github-runner@973c1d22702de8d0acd2b34e83404c96ed92c264 # v1.4.2
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}

View File

@@ -1,127 +1,18 @@
# Placeholder workflow file allowing running it without having to merge to main first
name: placeholder_workflow
run-name: Summary benchs tests
on:
workflow_dispatch:
inputs:
run-cpu-benchmarks:
description: "Run CPU benchmarks"
type: boolean
default: true
run-gpu-benchmarks:
description: "Run GPU benchmarks"
type: boolean
default: true
gpu-profile:
description: "GPU Instance type"
required: true
default: "multi-h100-sxm5 (n3-H100-SXM5x8)"
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "8-l40 (n3-L40x8)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100-SXM5x8)"
run-hpu-benchmarks:
description: "Run HPU benchmarks"
type: boolean
default: true
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
jobs:
parse-gpu-inputs:
name: benchmark_summary/parse-gpu-inputs
if: inputs.run-gpu-benchmarks
placeholder:
name: placeholder_workflow/placeholder
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
env:
INPUTS_PROFILE: ${{ inputs.gpu-profile }}
steps:
- name: Parse profile
id: parse_profile
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
PROFILE=$(echo "${INPUTS_PROFILE}" | sed 's|\(.*\)[[:space:]](.*)|\1|')
echo "profile=${PROFILE}" >> "${GITHUB_OUTPUT}"
- name: Parse hardware name
id: parse_hardware_name
run: |
# Use Sed to extract a value from a string, this cannot be done with the ${variable//search/replace} pattern.
# shellcheck disable=SC2001
NAME=$(echo "${INPUTS_PROFILE}" | sed 's|.*[[:space:]](\(.*\))|\1|')
echo "name=${NAME}" >> "${GITHUB_OUTPUT}"
run-benchmarks-cpu:
name: benchmark_documentation/run-benchmarks-cpu-integer
uses: ./.github/workflows/benchmark_cpu_common.yml
if: inputs.run-cpu-benchmarks
with:
command: summary
bench_type: both
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
run-benchmarks-gpu:
name: benchmark_documentation/run-benchmarks-gpu
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-benchmarks
needs: parse-gpu-inputs
with:
profile: ${{ needs.parse-gpu-inputs.outputs.profile }}
hardware_name: ${{ needs.parse-gpu-inputs.outputs.hardware_name }}
command: summary
bench_type: both
params_type: classical + multi_bit
secrets:
BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
JOB_SECRET: ${{ secrets.JOB_SECRET }}
SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
SLAB_URL: ${{ secrets.SLAB_URL }}
SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
# TODO add make recipe for HPU benchmarks
# run-benchmarks-hpu:
# name: benchmark_documentation/run-benchmarks-hpu
# uses: ./.github/workflows/benchmark_hpu_common.yml
# if: inputs.run-hpu-benchmarks
# with:
# command: summary
# bench_type: both
# v80_pcie_dev: 24
# v80_serial_number: XFL12NWY3ZKG
# secrets:
# BOT_USERNAME: ${{ secrets.BOT_USERNAME }}
# SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
# SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
# REPO_CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN }}
# JOB_SECRET: ${{ secrets.JOB_SECRET }}
# SLAB_ACTION_TOKEN: ${{ secrets.SLAB_ACTION_TOKEN }}
# SLAB_URL: ${{ secrets.SLAB_URL }}
# SLAB_BASE_URL: ${{ secrets.SLAB_BASE_URL }}
# SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
- run: |
echo "Hello this is a Placeholder Workflow"

View File

@@ -0,0 +1,67 @@
name: pr_milestone_check
on:
pull_request:
types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
# external contributors workflows are manually approved
jobs:
check-empty-milestone:
name: pr_milestone_check/check-empty-milestone
runs-on: ubuntu-latest
if: github.event.pull_request.milestone == null
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone Missing
Please assign a milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is missing. This check is failing."
exit 1
check-milestone-open:
name: pr_milestone_check/check-milestone-open
runs-on: ubuntu-latest
if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone is closed
Please assign an open milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is closed. This check is failing."
exit 1

View File

@@ -30,7 +30,7 @@ jobs:
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs --origin source && cd ./tfhe-rs
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -47,8 +47,6 @@ jobs:
echo ">>> Pushing all LFS items..."
git lfs push --all destination "${DESTINATION_BRANCH}"
shred --remove .git/config
- name: git-sync-tags
env:
@@ -61,7 +59,7 @@ jobs:
run: |
echo ">>> Cloning source repo..."
git lfs install
git clone --quiet "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
git clone "https://${USERNAME}:${TOKEN}@github.com/${SOURCE_REPO}.git" ./tfhe-rs-tag --origin source && cd ./tfhe-rs-tag
git remote add destination "https://${USERNAME}:${TOKEN}@github.com/${DEST_REPO}.git"
echo ">>> Fetching all branches references down locally so subsequent commands can see them..."
@@ -72,5 +70,3 @@ jobs:
echo ">>> Pushing git changes..."
git push destination "${SOURCE_BRANCH}:${DESTINATION_BRANCH}" -f
shred --remove .git/config

View File

@@ -12,7 +12,6 @@ permissions: {}
jobs:
stale:
name: unverified_prs/stale
if: github.repository == 'zama-ai/tfhe-rs'
runs-on: ubuntu-latest
permissions:
issues: read # Needed to fetch all issues

161
Makefile
View File

@@ -733,12 +733,11 @@ test_core_crypto_gpu:
--features=gpu -p tfhe -- core_crypto::gpu::
.PHONY: test_integer_gpu # Run the tests of the integer module including experimental on the gpu backend
test_integer_gpu: install_cargo_nextest
TEST_THREADS=2 \
DOCTEST_THREADS=4 \
./scripts/integer-tests.sh \
--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
--tfhe-package "tfhe" --all-but-noise
test_integer_gpu:
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=2
RUSTFLAGS="$(RUSTFLAGS)" cargo test --doc --profile $(CARGO_PROFILE) \
--features=integer,gpu -p tfhe -- integer::gpu::server_key:: --test-threads=4
.PHONY: test_integer_gpu_debug # Run the tests of the integer module with Debug flags for CUDA
test_integer_gpu_debug:
@@ -1050,16 +1049,10 @@ test_high_level_api:
--features=boolean,shortint,integer,internal-keycache,zk-pok,strings -p tfhe \
-- high_level_api::
test_high_level_api_gpu_fast: install_cargo_nextest # Run all the GPU tests for high_level_api except test_uniformity for oprf which is too long
test_high_level_api_gpu: install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/) and not test(/uniformity/)"
test_high_level_api_gpu: install_cargo_nextest # Run all the GPU tests for high_level_api
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest run --cargo-profile $(CARGO_PROFILE) \
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"
-E "test(/high_level_api::.*gpu.*/)"
test_list_gpu: install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo nextest list --cargo-profile $(CARGO_PROFILE) \
@@ -1378,9 +1371,6 @@ clippy_bench: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=shortint,internal-keycache \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: clippy_bench_gpu # Run clippy lints on tfhe-benchmark
clippy_bench_gpu: install_rs_check_toolchain
@@ -1415,14 +1405,14 @@ bench_signed_integer: install_rs_check_toolchain
.PHONY: bench_integer_gpu # Run benchmarks for integer on GPU backend
bench_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_signed_integer_gpu # Run benchmarks for signed integer on GPU backend
bench_signed_integer_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-signed \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1438,14 +1428,14 @@ bench_integer_hpu: install_rs_check_toolchain
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_compression_gpu
bench_integer_compression_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
@@ -1459,8 +1449,7 @@ bench_integer_compression_128b_gpu: install_rs_check_toolchain
.PHONY: bench_integer_zk_gpu
bench_integer_zk_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
@@ -1486,13 +1475,6 @@ bench_integer_trivium_gpu: install_rs_check_toolchain
--bench integer-trivium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_kreyvium_gpu # Run benchmarks for kreyvium on GPU backend
bench_integer_kreyvium_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-kreyvium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1527,7 +1509,7 @@ bench_signed_integer_multi_bit_gpu: install_rs_check_toolchain
.PHONY: bench_integer_zk # Run benchmarks for integer encryption with ZK proofs
bench_integer_zk: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,zk-pok,pbs-stats \
@@ -1673,18 +1655,11 @@ bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_unsafe_coop_firefox
.PHONY: bench_hlapi_unsigned # Run benchmarks for integer operations
bench_hlapi_unsigned: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
.PHONY: bench_hlapi # Run benchmarks for integer operations
bench_hlapi: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi_unsigned \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_signed # Run benchmarks for signed integer operations
bench_hlapi_signed: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi_signed \
--bench hlapi \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_hlapi_gpu # Run benchmarks for integer operations on GPU
@@ -1774,108 +1749,6 @@ bench_hlapi_noise_squash_gpu: install_rs_check_toolchain
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_hlapi_kvstore # Run benchmarks for Key-Value Store operations
bench_hlapi_kvstore: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-kvstore \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --
.PHONY: bench_summary # Run summary benchmarks
bench_summary: install_rs_check_toolchain
# Arithmetic operations: addition, multiplication, division, comparison
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi_unsigned \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::add|::mul|::gt|::div_rem'
# Noise squash
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::noise_squash::'
# ERC20
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache -p tfhe-benchmark -- '::transfer::overflow'
# DEX
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-dex \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark -- '::no_cmux::'
# ZK
# Proof is done on CPU node of the instance
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,zk-pok,pbs-stats \
-p tfhe-benchmark -- '::pke_zk_proof'
# Verify is done on GPUs
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,pbs-stats,zk-pok -p tfhe-benchmark --profile release_lto_off --
# Compression
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_summary_gpu # Run summary benchmarks on GPU
bench_summary_gpu: install_rs_check_toolchain
# Arithmetic operations: addition, multiplication, division, comparison
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=FAST_DEFAULT __TFHE_RS_BENCH_BIT_SIZES_SET=FAST __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::add|::mul|::gt|::div_rem'
# Noise squash
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::noise_squash::'
# Noise squash and compression
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-noise-squash \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::decomp_noise_squash_comp::'
# ERC20
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,gpu,internal-keycache -p tfhe-benchmark --profile release_lto_off -- '::transfer::overflow'
# DEX
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-dex \
--features=integer,gpu,internal-keycache,pbs-stats -p tfhe-benchmark --profile release_lto_off -- '::no_cmux::'
# ZK
# Proof is done on CPU node of the instance
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,zk-pok,pbs-stats \
-p tfhe-benchmark -- '::pke_zk_proof'
# Verify is done on GPUs
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_OP_FLAVOR=fast_default __TFHE_RS_BENCH_BIT_SIZES_SET=fast \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-zk-pke \
--features=integer,internal-keycache,gpu,pbs-stats,zk-pok -p tfhe-benchmark --
# Compression
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=$(BENCH_PARAM_TYPE) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) __TFHE_RS_BENCH_BIT_SIZES_SET=FAST \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-glwe_packing_compression \
--features=integer,internal-keycache,gpu,pbs-stats -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_custom # Run benchmarks with a user-defined command
bench_custom: install_rs_check_toolchain

View File

@@ -87,7 +87,6 @@ fn main() {
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/trivium/trivium.h",
"cuda/include/kreyvium/kreyvium.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -29,13 +29,15 @@ template <typename Torus> struct int_aes_lut_buffers {
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return a & b; };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
params.pbs_type);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and_lut, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
@@ -44,11 +46,14 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush_lut = streams.active_gpu_subset(
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush_lut, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->carry_lut = new int_radix_lut<Torus>(
@@ -56,11 +61,14 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
auto active_streams_carry_lut =
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->generate_and_broadcast_lut(
active_streams_carry_lut, {0}, {carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -10,7 +10,11 @@ extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS_U128;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
}
// Define a variant type that can be either a vector or a single pointer
template <typename Torus>
using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
@@ -38,8 +42,6 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
uint32_t get_active_gpu_count_u128(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -78,15 +80,7 @@ public:
_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a subset of this set as an active subset for pbs128. An active
// subset is one that is temporarily used to perform some computation. For
// pbs128, the threshold is different, because the original threshold was
// designed for 2_2 params.
CudaStreams active_gpu_subset_u128(int num_radix_blocks, PBS_TYPE pbs_type) {
return CudaStreams(
_streams, _gpu_indexes,
get_active_gpu_count_u128(num_radix_blocks, _gpu_count, pbs_type));
}
// Returns a CudaStreams struct containing only the ith stream
CudaStreams get_ith(int i) const {
return CudaStreams(&_streams[i], &_gpu_indexes[i], 1);
@@ -150,9 +144,9 @@ public:
_gpu_count(src._gpu_count), _owns_streams(false) {}
CudaStreams &operator=(CudaStreams const &other) {
/* PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");*/
PANIC_IF_FALSE(this->_streams == nullptr ||
this->_streams == other._streams,
"Assigning an already initialized CudaStreams");
this->_streams = other._streams;
this->_gpu_indexes = other._gpu_indexes;
this->_gpu_count = other._gpu_count;

View File

@@ -45,9 +45,12 @@ template <typename Torus> struct boolean_bitop_buffer {
// BooleanBlock can have degree 0 or 1. when ct is 0 path is hardcoded,
// only lut for degree = 1 is generated
lut->generate_and_broadcast_bivariate_lut(active_streams, {0},
{lut_bivariate_f},
LUT_0_FOR_ALL_BLOCKS, {}, 2);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, 2, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
}
break;
default:
@@ -62,8 +65,14 @@ template <typename Torus> struct boolean_bitop_buffer {
return x % params.message_modulus;
};
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
message_extract_lut->broadcast_lut(active_streams);
}
tmp_lwe_left = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -133,8 +142,12 @@ template <typename Torus> struct int_bitop_buffer {
}
};
lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {lut_bivariate_f}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
}
break;
default:
@@ -143,8 +156,6 @@ template <typename Torus> struct int_bitop_buffer {
num_radix_blocks, allocate_gpu_memory,
size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
@@ -160,13 +171,14 @@ template <typename Torus> struct int_bitop_buffer {
return x ^ rhs;
}
};
lut_funcs.push_back(lut_univariate_scalar_f);
lut_indices.push_back(i);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(active_streams);
}
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
LUT_0_FOR_ALL_BLOCKS);
}
}
@@ -199,11 +211,16 @@ template <typename Torus> struct boolean_bitnot_buffer {
return x % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
message_extract_lut->broadcast_lut(active_streams);
}
}

View File

@@ -28,16 +28,20 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->generate_and_broadcast_lut(
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
}},
LUT_0_FOR_ALL_BLOCKS);
},
allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
this->last_block = new CudaRadixCiphertextFFI;

View File

@@ -85,28 +85,42 @@ template <typename Torus> struct int_cmux_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams_pred =
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
auto lut_index_generator = [num_radix_blocks](Torus *h_lut_indexes,
uint32_t num_indexes) {
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
};
predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
lut_index_generator);
predicate_lut->broadcast_lut(active_streams_pred);
auto active_streams_msg =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->generate_and_broadcast_lut(
active_streams_msg, {0}, {message_extract_lut_f}, LUT_0_FOR_ALL_BLOCKS);
message_extract_lut->broadcast_lut(active_streams_msg);
}
void release(CudaStreams streams) {

View File

@@ -28,7 +28,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
Torus total_modulus = params.message_modulus * params.carry_modulus;
uint32_t max_value = (total_modulus - 1) / (params.message_modulus - 1);
int max_chunks = CEIL_DIV(num_radix_blocks, max_value);
int max_chunks = (num_radix_blocks + max_value - 1) / max_value;
tmp_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_out, num_radix_blocks,
@@ -39,21 +39,22 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(max_chunks, params.pbs_type);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value->generate_and_broadcast_lut(
active_streams, {0}, {is_max_value_f}, LUT_0_FOR_ALL_BLOCKS);
auto active_streams =
streams.active_gpu_subset(max_chunks, params.pbs_type);
is_max_value->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {
@@ -102,10 +103,15 @@ template <typename Torus> struct int_comparison_eq_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->generate_and_broadcast_lut(
active_streams, {0}, {is_non_zero_lut_f}, LUT_0_FOR_ALL_BLOCKS);
is_non_zero_lut->broadcast_lut(active_streams);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -123,27 +129,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
return (lhs == rhs);
}
};
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < total_modulus; i++) {
auto lut_f = [i, operator_f](Torus x) -> Torus {
return operator_f(i, x);
};
lut_funcs.push_back(lut_f);
lut_indices.push_back(i);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
scalar_comparison_luts->get_lut(0, i),
scalar_comparison_luts->get_degree(i),
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
scalar_comparison_luts->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, LUT_0_FOR_ALL_BLOCKS);
scalar_comparison_luts->broadcast_lut(active_streams);
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
operator_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
operator_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {operator_f}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(active_streams);
} else {
operator_lut = nullptr;
}
@@ -210,6 +221,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tree_last_leaf_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -220,14 +234,15 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {block_selector_f}, LUT_0_FOR_ALL_BLOCKS);
tree_inner_leaf_lut->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {
@@ -411,8 +426,12 @@ template <typename Torus> struct int_comparison_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
identity_lut->generate_and_broadcast_lut(
active_streams, {0}, {identity_lut_f}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(active_streams);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -422,8 +441,13 @@ template <typename Torus> struct int_comparison_buffer {
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(active_streams);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -498,9 +522,13 @@ template <typename Torus> struct int_comparison_buffer {
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {signed_lut_f}, LUT_0_FOR_ALL_BLOCKS);
signed_lut->broadcast_lut(active_streams);
}
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

View File

@@ -11,26 +11,16 @@ template <typename Torus> struct int_compression {
Torus *tmp_glwe_array_out;
bool gpu_memory_allocated;
uint32_t lwe_per_glwe;
uint32_t max_num_glwes;
// num_radix_blocks: total number of LWE ciphertexts (radix blocks) to
// compress lwe_per_glwe: max LWEs packed per GLWE (= polynomial_size),
// defined by the chosen parameter set
int_compression(CudaStreams streams, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
// Calculate the actual number of GLWEs needed based on total radix blocks.
// This ensures we allocate enough memory when num_radix_blocks >
// lwe_per_glwe.
max_num_glwes = CEIL_DIV(num_radix_blocks, lwe_per_glwe);
tmp_lwe = static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
num_radix_blocks * (compression_params.small_lwe_dimension + 1) *
sizeof(Torus),
@@ -38,7 +28,7 @@ template <typename Torus> struct int_compression {
allocate_gpu_memory));
tmp_glwe_array_out =
static_cast<Torus *>(cuda_malloc_with_size_tracking_async(
max_num_glwes * glwe_accumulator_size * sizeof(Torus),
lwe_per_glwe * glwe_accumulator_size * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), size_tracker,
allocate_gpu_memory));
@@ -116,13 +106,19 @@ template <typename Torus> struct int_decompression {
encryption_params.carry_modulus;
auto effective_compression_carry_modulus = 1;
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
decompression_rescale_lut->generate_and_broadcast_lut_with_encoding(
active_streams, {0}, {decompression_rescale_f},
generate_device_accumulator_with_encoding<Torus>(
streams.stream(0), streams.gpu_index(0),
decompression_rescale_lut->get_lut(0, 0),
decompression_rescale_lut->get_degree(0),
decompression_rescale_lut->get_max_degree(0),
encryption_params.glwe_dimension, encryption_params.polynomial_size,
effective_compression_message_modulus,
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus);
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}
void release(CudaStreams streams) {

View File

@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
zero_out_if_not_1_lut_2};
size_t lut_gpu_indexes[2] = {0, 3};
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_1_lut_f},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
}
luts[0] = zero_out_if_not_2_lut_1;
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
lut_gpu_indexes[0] = 1;
lut_gpu_indexes[1] = 2;
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_2_lut_f},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
}
quotient_lut_1 =
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
quotient_lut_1->generate_and_broadcast_lut(
streams.get_ith(2), {0}, {quotient_lut_1_f}, LUT_0_FOR_ALL_BLOCKS);
quotient_lut_2->generate_and_broadcast_lut(
streams.get_ith(1), {0}, {quotient_lut_2_f}, LUT_0_FOR_ALL_BLOCKS);
quotient_lut_3->generate_and_broadcast_lut(
streams.get_ith(0), {0}, {quotient_lut_3_f}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
message_extract_lut_1 = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -335,12 +350,15 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[0] = message_extract_lut_1;
luts[1] = message_extract_lut_2;
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
luts[j]->broadcast_lut(active_streams);
}
}
@@ -485,35 +503,29 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
};
generate_lut_indexes<Torus>(streams, index_generator,
first_indexes_for_overflow_sub_gpu_0[nb - 1],
nb, 2 * group_size + 1, h_lut_indexes,
allocate_gpu_memory);
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
}
// Extra indexes for the luts in second step
uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub_gpu_0[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
@@ -524,37 +536,24 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
};
generate_lut_indexes<Torus>(streams, index_generator,
second_indexes_for_overflow_sub_gpu_0[nb - 1],
nb, num_luts_second_step, h_lut_indexes,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
@@ -568,6 +567,10 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub_gpu_0[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub_gpu_0[nb - 1], h_scalar, nb * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1004,14 +1007,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i] = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->generate_and_broadcast_lut(
active_streams_1, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
masking_luts_1[i]->broadcast_lut(active_streams_1);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_2 =
streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->generate_and_broadcast_lut(
active_streams_2, {0}, {lut_f_masking}, LUT_0_FOR_ALL_BLOCKS);
masking_luts_2[i]->broadcast_lut(active_streams_2);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1029,12 +1042,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(active_streams);
}
// Give name to closures to improve readability
@@ -1060,14 +1076,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
zero_out_if_overflow_did_not_happen[0]
->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 2);
zero_out_if_overflow_did_not_happen[1]
->generate_and_broadcast_bivariate_lut(active_streams, {0}, {cur_lut_f},
LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 1);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0),
zero_out_if_overflow_did_not_happen[0]->get_degree(0),
zero_out_if_overflow_did_not_happen[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(active_streams);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),
zero_out_if_overflow_did_not_happen[1]->get_degree(0),
zero_out_if_overflow_did_not_happen[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
@@ -1084,12 +1110,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
zero_out_if_overflow_happened[0]->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 2);
zero_out_if_overflow_happened[1]->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {overflow_happened_f}, LUT_0_FOR_ALL_BLOCKS, {},
params.message_modulus - 1);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_happened[0]->get_lut(0, 0),
zero_out_if_overflow_happened[0]->get_degree(0),
zero_out_if_overflow_happened[0]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(active_streams);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_if_overflow_happened[1]->get_lut(0, 0),
zero_out_if_overflow_happened[1]->get_degree(0),
zero_out_if_overflow_happened[1]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
@@ -1103,8 +1141,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
active_gpu_count_for_bits, {0}, {lut_f_bit}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
merge_overflow_flags_luts[i]->get_lut(0, 0),
merge_overflow_flags_luts[i]->get_degree(0),
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
}
}
@@ -1176,34 +1220,29 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
auto index_generator = [nb, group_size](Torus *h_lut_indexes, uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
};
generate_lut_indexes<Torus>(
streams, index_generator, first_indexes_for_overflow_sub[nb - 1], nb,
2 * group_size + 1, h_lut_indexes, allocate_gpu_memory);
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
}
// Extra indexes for the luts in second step
uint32_t num_extra_luts = use_seq ? (group_size - 1) : 1;
uint32_t num_luts_second_step = 2 * group_size + num_extra_luts;
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
@@ -1214,36 +1253,24 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
auto index_generator = [nb, group_size, use_seq](Torus *h_lut_indexes,
uint32_t) {
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
};
generate_lut_indexes<Torus>(
streams, index_generator, second_indexes_for_overflow_sub[nb - 1], nb,
num_luts_second_step, h_lut_indexes, allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
@@ -1257,6 +1284,10 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
@@ -1526,12 +1557,16 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
compare_signed_bits_lut->get_lut(0, 0),
compare_signed_bits_lut->get_degree(0),
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
LUT_0_FOR_ALL_BLOCKS);
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
}
}

View File

@@ -53,8 +53,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return count;
};
univ_lut_mem->generate_and_broadcast_lut(
active_streams, {0}, {generate_uni_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
univ_lut_mem->broadcast_lut(active_streams);
auto generate_bi_lut_lambda =
[num_bits](Torus block_num_bit_count,
@@ -65,8 +70,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return 0;
};
biv_lut_mem->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {generate_bi_lut_lambda}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
biv_lut_mem->broadcast_lut(active_streams);
this->tmp_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -222,7 +232,7 @@ template <typename Torus> struct int_ilog2_buffer {
this->sum_output_not_propagated, counter_num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
lut_message_not =
this->lut_message_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> lut_message_lambda =
@@ -230,11 +240,16 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t message = x % this->params.message_modulus;
return (~message) % this->params.message_modulus;
};
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
this->lut_message_not->get_lut(0, 0),
this->lut_message_not->get_degree(0),
this->lut_message_not->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_message_lambda}, LUT_0_FOR_ALL_BLOCKS);
lut_message_not->broadcast_lut(active_streams);
this->lut_carry_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -244,8 +259,13 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t carry = x / this->params.message_modulus;
return (~carry) % this->params.message_modulus;
};
lut_carry_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_carry_lambda}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0),
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_carry_lambda, allocate_gpu_memory);
lut_carry_not->broadcast_lut(active_streams);
this->message_blocks_not = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -37,12 +37,17 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_predicate_lut->get_lut(0, 0),
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {zero_out_predicate_lut_f},
LUT_0_FOR_ALL_BLOCKS);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -50,7 +55,10 @@ template <typename Torus> struct int_mul_memory {
return;
}
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// 'vector_result_lsb' contains blocks from all possible shifts of
// radix_lwe_left excluding zero ciphertext blocks
@@ -62,10 +70,6 @@ template <typename Torus> struct int_mul_memory {
int total_block_count = num_radix_blocks * num_radix_blocks;
GPU_ASSERT(lsb_vector_block_count + msb_vector_block_count ==
total_block_count,
"MSB and LSB vector block counts don't match");
// allocate memory for intermediate buffers
vector_result_sb = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -87,6 +91,8 @@ template <typename Torus> struct int_mul_memory {
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -96,21 +102,30 @@ template <typename Torus> struct int_mul_memory {
return (x * y) / message_modulus;
};
// generate accumulators
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lsb_acc,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_lsb, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), msb_acc,
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_msb, gpu_memory_allocated);
// lut_indexes_vec for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
// for message and carry default lut_indexes_vec is fine
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
auto lut_index_generator = [lsb_vector_block_count](Torus *h_lut_indexes,
uint32_t num_indexes) {
for (uint32_t i = 0; i < num_indexes; i++) {
h_lut_indexes[i] = (i < lsb_vector_block_count) ? 0 : 1;
}
};
luts_array->generate_and_broadcast_bivariate_lut(
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, lut_index_generator);
luts_array->broadcast_lut(active_streams);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, params, num_radix_blocks, 2 * num_radix_blocks,

View File

@@ -22,7 +22,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
uint32_t calculated_active_blocks =
total_random_bits == 0
? 0
: CEIL_DIV(total_random_bits, message_bits_per_block);
: (total_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
if (num_blocks_to_process != calculated_active_blocks) {
PANIC(
"num_blocks_to_process should be equal to calculated_active_blocks");
@@ -52,10 +53,6 @@ template <typename Torus> struct int_grouped_oprf_memory {
// Pre-generate all possible LUTs.
//
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
std::vector<uint64_t> lut_degrees;
for (uint32_t random_bit = 1; random_bit <= message_bits_per_block;
++random_bit) {
uint64_t p = 1ULL << random_bit;
@@ -73,13 +70,14 @@ template <typename Torus> struct int_grouped_oprf_memory {
uint64_t degree = 0;
uint32_t lut_index = random_bit - 1;
lut_funcs.push_back(lut_f);
lut_indices.push_back(lut_index);
generate_device_accumulator_no_encoding<Torus>(
streams.stream(0), streams.gpu_index(0), luts->get_lut(0, lut_index),
degree, params.message_modulus, params.carry_modulus,
params.glwe_dimension, params.polynomial_size, lut_f,
allocate_gpu_memory);
// In OPRF the degree is hard set to p - 1 instead of the LUT degree
degree = p - 1;
lut_degrees.push_back(degree);
*luts->get_degree(lut_index) = degree;
}
// For each block, this loop determines the exact number of bits to generate
@@ -104,6 +102,10 @@ template <typename Torus> struct int_grouped_oprf_memory {
Torus plaintext_to_add = (p - 1) * delta / 2;
h_corrections[i * lwe_size + params.big_lwe_dimension] = plaintext_to_add;
if (bits_for_this_block < 1) {
PANIC("bits_for_this_block should be greater than 1");
}
this->h_lut_indexes[i] = bits_for_this_block - 1;
bits_processed += bits_for_this_block;
}
@@ -120,35 +122,13 @@ template <typename Torus> struct int_grouped_oprf_memory {
// Copy the prepared LUT indexes to the GPU 0, before broadcast to all other
// GPUs.
cuda_memcpy_with_size_tracking_async_to_gpu(
luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks_to_process * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
// No encoding for these LUTS. Generate LUT also sets LUT degrees to default
// values
auto luts_index_generator = [total_random_bits, message_bits_per_block](
Torus *h_lut_indexes, uint32_t num_blocks) {
uint64_t bits_processed = 0;
for (uint32_t i = 0; i < num_blocks; ++i) {
if (total_random_bits <= bits_processed) {
PANIC("total_random_bits should be greater than bits_processed");
}
uint64_t bits_remaining = total_random_bits - bits_processed;
uint32_t bits_for_this_block =
std::min((uint64_t)message_bits_per_block, bits_remaining);
if (bits_for_this_block < 1) {
PANIC("bits_for_this_block should be greater than 1");
}
h_lut_indexes[i] = bits_for_this_block - 1;
bits_processed += bits_for_this_block;
}
};
luts->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
luts_index_generator, false, {},
this->h_lut_indexes);
// OPRF requires custom LUT degrees
for (uint32_t i = 0; i < lut_degrees.size(); ++i) {
*luts->get_degree(i) = lut_degrees[i];
}
luts->broadcast_lut(active_streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_corrections);
@@ -190,7 +170,8 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_random_input_blocks =
CEIL_DIV(num_input_random_bits, message_bits_per_block);
(num_input_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
streams, params, this->num_random_input_blocks, message_bits_per_block,

View File

@@ -85,11 +85,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -168,10 +172,16 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, LUT_0_FOR_ALL_BLOCKS);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
@@ -261,11 +271,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return shifted | padding;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_last_block_lut_univariate->get_lut(0, 0),
shift_last_block_lut_univariate->get_degree(0),
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->generate_and_broadcast_lut(
active_streams_shift_last, {0}, {last_block_lut_f},
LUT_0_FOR_ALL_BLOCKS);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -283,8 +298,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return (params.message_modulus - 1) * x_sign_bit;
};
padding_block_lut_univariate->generate_and_broadcast_lut(
active_streams, {0}, {padding_block_lut_f}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
padding_block_lut_univariate->get_lut(0, 0),
padding_block_lut_univariate->get_degree(0),
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
padding_block_lut_univariate->broadcast_lut(active_streams);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -317,11 +339,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return message_of_current_block + carry_of_previous_block;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_blocks_lut_bivariate->get_lut(0, 0),
shift_blocks_lut_bivariate->get_degree(0),
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams_shift_blocks, {0}, {blocks_lut_f},
LUT_0_FOR_ALL_BLOCKS);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}

View File

@@ -113,20 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
else
return current_bit;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
LUT_0_FOR_ALL_BLOCKS);
mux_lut->broadcast_lut(active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->generate_and_broadcast_lut(
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, LUT_0_FOR_ALL_BLOCKS);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
}
void release(CudaStreams streams) {

View File

@@ -74,27 +74,45 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_carry, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
gpu_memory_allocated);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_borrow_propagation_sum->get_lut(0, 0),
luts_borrow_propagation_sum->get_degree(0),
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_borrow_propagation_sum, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
message_acc->get_degree(0), message_acc->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_luts_borrow_propagation_sum},
LUT_0_FOR_ALL_BLOCKS);
auto luts_array_index_generator = [](Torus *h_lut_indexes,
uint32_t num_indexes) {
for (uint32_t i = 0; i < num_indexes; i++) {
h_lut_indexes[i] = (i == 0) ? 0 : 1;
}
};
luts_array->generate_and_broadcast_lut(
active_streams, {0, 1},
{f_lut_does_block_generate_carry,
f_lut_does_block_generate_or_propagate},
luts_array_index_generator);
// generate luts (aka accumulators)
message_acc->generate_and_broadcast_lut(
active_streams, {0}, {f_message_acc}, LUT_0_FOR_ALL_BLOCKS);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {

View File

@@ -7,8 +7,7 @@
#include <functional>
#include <vector>
// If we use more than 5 streams the result is incorrect
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 5;
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
template <typename Torus> struct int_equality_selectors_buffer {
int_radix_params params;
@@ -61,10 +60,18 @@ template <typename Torus> struct int_equality_selectors_buffer {
fns.push_back([i](Torus x) -> Torus { return (x == i); });
}
this->comparison_luts->generate_and_broadcast_many_lut(
active_streams, {0}, {fns}, LUT_0_FOR_ALL_BLOCKS);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->comparison_luts->get_lut(0, 0),
this->comparison_luts->get_degree(0),
this->comparison_luts->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
fns, allocate_gpu_memory);
fns.clear();
this->comparison_luts->broadcast_lut(active_streams);
this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
@@ -168,7 +175,8 @@ template <typename Torus> struct int_possible_results_buffer {
this->lut_stride =
(ciphertext_modulus / this->max_luts_per_call) * box_size;
this->num_lut_accumulators = CEIL_DIV(total_luts_needed, max_luts_per_call);
this->num_lut_accumulators =
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
stream_luts =
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
@@ -194,10 +202,15 @@ template <typename Torus> struct int_possible_results_buffer {
fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
}
current_lut->generate_and_broadcast_many_lut(
streams.active_gpu_subset(1, params.pbs_type), {0}, {fns},
LUT_0_FOR_ALL_BLOCKS);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
current_lut->get_degree(0), current_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->broadcast_lut(
streams.active_gpu_subset(1, params.pbs_type));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -285,10 +298,14 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->stream_identity_luts[i] = lut;
}
@@ -301,17 +318,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->message_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
this->message_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->message_extract_lut->get_lut(0, 0),
this->message_extract_lut->get_degree(0),
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
this->carry_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->carry_extract_lut->get_lut(0, 0),
this->carry_extract_lut->get_degree(0),
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type));
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -1158,9 +1185,15 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1170,9 +1203,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {
@@ -1338,9 +1376,15 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1350,9 +1394,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, LUT_0_FOR_ALL_BLOCKS);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type));
}
void release(CudaStreams streams) {

View File

@@ -73,10 +73,9 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
int8_t **fp_ks_buffer,
bool gpu_memory_allocated);
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count);
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count);
}
#endif // CNCRT_KS_H_

View File

@@ -1,24 +0,0 @@
#ifndef KREYVIUM_H
#define KREYVIUM_H
#include "../integer/integer.h"
// C-linkage entry points for the Kreyvium keystream generator.
// The three functions share an opaque `mem_ptr` buffer handle.
extern "C" {
// Setup entry point: receives the cryptographic parameters (GLWE/LWE
// dimensions, keyswitch and PBS decomposition parameters, moduli, PBS type,
// noise reduction type) and the number of inputs, plus `mem_ptr`, the address
// of the opaque buffer handle.
// NOTE(review): by analogy with the other scratch_* functions of this backend
// this presumably allocates the working buffer behind `*mem_ptr` and returns
// the tracked allocation size -- confirm against the implementation.
uint64_t scratch_cuda_kreyvium_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
// Keystream generation entry point: `key` and `iv` are read-only inputs,
// `keystream_output` is the only non-const ciphertext argument. `bsks` and
// `ksks` are arrays of key pointers.
// NOTE(review): `num_steps` is presumably the number of keystream bits
// produced per input, and `mem_ptr` the handle filled by
// scratch_cuda_kreyvium_64 -- confirm against the implementation.
void cuda_kreyvium_generate_keystream_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
// Teardown entry point taking the address of the opaque buffer handle.
// NOTE(review): presumably releases the buffer created by
// scratch_cuda_kreyvium_64 -- confirm against the implementation.
void cleanup_cuda_kreyvium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}
#endif

View File

@@ -1,320 +0,0 @@
#ifndef KREYVIUM_UTILITIES_H
#define KREYVIUM_UTILITIES_H
#include "../integer/integer_utilities.h"
// Kreyvium specific constants
// The batch size is set to 64 to allow efficient parallel processing of 64
// steps at once.
constexpr uint32_t KREYVIUM_BATCH_SIZE = 64;
// In each Kreyvium step, there are exactly 3 non-linear AND operations:
// 1. (c109 & c108)
// 2. (a91 & a90)
// 3. (b82 & b81)
constexpr uint32_t KREYVIUM_NUM_AND_GATES = 3;
// In each Kreyvium step, there are 4 paths that require a "flush"
// to noise-cancel and extract the bit:
// 1. New bit for Register A
// 2. New bit for Register B
// 3. New bit for Register C
// 4. The Output Keystream bit
constexpr uint32_t KREYVIUM_NUM_FLUSH_PATHS = 4;
/// Struct to hold the LUTs.
template <typename Torus> struct int_kreyvium_lut_buffers {
// Bivariate AND Gate LUT:
// AND operation: f(a, b) = (a & 1) & (b & 1).
// This is a Bivariate PBS used for the non-linear parts of Kreyvium.
int_radix_lut<Torus> *and_lut;
// Univariate Flush/Identity LUT:
// MESSAGE EXTRACTION operation: f(x) = x & 1.
// This is a Univariate PBS used to "flush" the state (reset noise/carries).
int_radix_lut<Torus> *flush_lut;
int_kreyvium_lut_buffers(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_inputs,
uint64_t &size_tracker) {
uint32_t and_ops =
num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_AND_GATES;
uint32_t flush_ops =
num_inputs * KREYVIUM_BATCH_SIZE * KREYVIUM_NUM_FLUSH_PATHS;
this->and_lut = new int_radix_lut<Torus>(streams, params, 1, and_ops,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus lhs, Torus rhs) -> Torus { return (lhs & 1) & (rhs & 1); };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and =
streams.active_gpu_subset(and_ops, params.pbs_type);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
streams, params, 1, flush_ops, allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush =
streams.active_gpu_subset(flush_ops, params.pbs_type);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}
void release(CudaStreams streams) {
this->and_lut->release(streams);
delete this->and_lut;
this->and_lut = nullptr;
this->flush_lut->release(streams);
delete this->flush_lut;
this->flush_lut = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
/// Struct to hold the Kreyvium internal state and temporary workspaces.
template <typename Torus> struct int_kreyvium_state_workspaces {
CudaRadixCiphertextFFI *a_reg;
CudaRadixCiphertextFFI *b_reg;
CudaRadixCiphertextFFI *c_reg;
CudaRadixCiphertextFFI *k_reg;
CudaRadixCiphertextFFI *iv_reg;
// Shift Workspace
CudaRadixCiphertextFFI *shift_workspace;
// Temporary Update Buffers
CudaRadixCiphertextFFI *temp_a;
CudaRadixCiphertextFFI *temp_b;
CudaRadixCiphertextFFI *temp_c;
CudaRadixCiphertextFFI *packed_and_lhs;
CudaRadixCiphertextFFI *packed_and_rhs;
CudaRadixCiphertextFFI *packed_and_out;
// Flush/Cleanup Packing Buffers
CudaRadixCiphertextFFI *packed_flush_in;
CudaRadixCiphertextFFI *packed_flush_out;
uint32_t max_batch_blocks;
uint32_t k_offset;
uint32_t iv_offset;
int_kreyvium_state_workspaces(CudaStreams streams,
const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_inputs,
uint64_t &size_tracker) {
uint32_t batch_blocks = KREYVIUM_BATCH_SIZE * num_inputs;
this->max_batch_blocks = batch_blocks;
this->k_offset = 0;
this->iv_offset = 0;
this->a_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->a_reg, 93 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->b_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->b_reg, 84 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->c_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->c_reg, 111 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->k_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->k_reg, 128 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->iv_reg = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->iv_reg, 128 * num_inputs,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->shift_workspace = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->shift_workspace,
128 * num_inputs, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->temp_a = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_a, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->temp_b = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_b, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->temp_c = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->temp_c, batch_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->packed_and_lhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_lhs,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_and_rhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_rhs,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_and_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_and_out,
KREYVIUM_NUM_AND_GATES * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_flush_in = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_in,
KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->packed_flush_out = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_flush_out,
KREYVIUM_NUM_FLUSH_PATHS * batch_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
void release(CudaStreams streams, bool allocate_gpu_memory) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->a_reg, allocate_gpu_memory);
delete this->a_reg;
this->a_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->b_reg, allocate_gpu_memory);
delete this->b_reg;
this->b_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->c_reg, allocate_gpu_memory);
delete this->c_reg;
this->c_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->k_reg, allocate_gpu_memory);
delete this->k_reg;
this->k_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->iv_reg, allocate_gpu_memory);
delete this->iv_reg;
this->iv_reg = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->shift_workspace, allocate_gpu_memory);
delete this->shift_workspace;
this->shift_workspace = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_a, allocate_gpu_memory);
delete this->temp_a;
this->temp_a = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_b, allocate_gpu_memory);
delete this->temp_b;
this->temp_b = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->temp_c, allocate_gpu_memory);
delete this->temp_c;
this->temp_c = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_lhs, allocate_gpu_memory);
delete this->packed_and_lhs;
this->packed_and_lhs = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_rhs, allocate_gpu_memory);
delete this->packed_and_rhs;
this->packed_and_rhs = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_and_out, allocate_gpu_memory);
delete this->packed_and_out;
this->packed_and_out = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_in, allocate_gpu_memory);
delete this->packed_flush_in;
this->packed_flush_in = nullptr;
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
this->packed_flush_out, allocate_gpu_memory);
delete this->packed_flush_out;
this->packed_flush_out = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
/// Top-level Kreyvium buffer: bundles the lookup tables and the state /
/// workspace ciphertexts, and remembers the parameters they were built with.
template <typename Torus> struct int_kreyvium_buffer {
  int_radix_params params;
  bool allocate_gpu_memory;
  uint32_t num_inputs;

  int_kreyvium_lut_buffers<Torus> *luts;
  int_kreyvium_state_workspaces<Torus> *state;

  /// Stores the arguments and builds the lookup tables first, then the state
  /// workspaces. `size_tracker` is forwarded by reference to both.
  int_kreyvium_buffer(CudaStreams streams, const int_radix_params &params,
                      bool allocate_gpu_memory, uint32_t num_inputs,
                      uint64_t &size_tracker)
      : params(params), allocate_gpu_memory(allocate_gpu_memory),
        num_inputs(num_inputs),
        luts(new int_kreyvium_lut_buffers<Torus>(
            streams, params, allocate_gpu_memory, num_inputs, size_tracker)),
        state(new int_kreyvium_state_workspaces<Torus>(
            streams, params, allocate_gpu_memory, num_inputs, size_tracker)) {}

  /// Releases and deletes the lookup tables, then the state workspaces
  /// (using the stored `allocate_gpu_memory` flag), nulls both pointers and
  /// synchronizes stream 0.
  void release(CudaStreams streams) {
    this->luts->release(streams);
    delete this->luts;
    this->luts = nullptr;

    this->state->release(streams, this->allocate_gpu_memory);
    delete this->state;
    this->state = nullptr;

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};
#endif

View File

@@ -429,9 +429,11 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
}
template <typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory, uint32_t base_log);
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(

View File

@@ -13,12 +13,12 @@ void cuda_fourier_polynomial_mul(void *stream, uint32_t gpu_index,
void cuda_convert_lwe_programmable_bootstrap_key_32(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t base_log);
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t base_log);
uint32_t polynomial_size);
void cuda_convert_lwe_programmable_bootstrap_key_128(
void *stream, uint32_t gpu_index, void *dest, void const *src,
@@ -61,13 +61,13 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t base_log);
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,

View File

@@ -30,10 +30,15 @@ template <typename Torus> struct int_trivium_lut_buffers {
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return (a & 1) & (b & 1); };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and =
streams.active_gpu_subset(total_lut_ops, params.pbs_type);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and, {0}, {and_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->and_lut->broadcast_lut(active_streams_and);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
uint32_t total_flush_ops = num_trivium_inputs * BATCH_SIZE * 4;
@@ -45,10 +50,15 @@ template <typename Torus> struct int_trivium_lut_buffers {
return x & 1;
};
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush =
streams.active_gpu_subset(total_flush_ops, params.pbs_type);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush, {0}, {flush_lambda}, LUT_0_FOR_ALL_BLOCKS);
this->flush_lut->broadcast_lut(active_streams_flush);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -14,10 +14,10 @@ uint64_t scratch_cuda_expand_without_verification_64(
uint32_t casting_output_dimension, uint32_t casting_ks_level,
uint32_t casting_ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array, const uint32_t is_boolean_array_len,
uint32_t num_compact_lists, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, KS_TYPE casting_key_type,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
const bool *is_boolean_array, uint32_t num_compact_lists,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
KS_TYPE casting_key_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_expand_without_verification_64(
CudaStreamsFFI streams, void *lwe_array_out,

View File

@@ -118,8 +118,7 @@ template <typename Torus> struct zk_expand_mem {
zk_expand_mem(CudaStreams streams, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
const uint32_t *num_lwes_per_compact_list,
const bool *is_boolean_array,
const uint32_t is_boolean_array_len, uint32_t num_compact_lists,
const bool *is_boolean_array, uint32_t num_compact_lists,
bool allocate_gpu_memory, uint64_t &size_tracker)
: computing_params(computing_params), casting_params(casting_params),
num_compact_lists(num_compact_lists),
@@ -175,6 +174,40 @@ template <typename Torus> struct zk_expand_mem {
message_and_carry_extract_luts = new int_radix_lut<Torus>(
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 0),
message_and_carry_extract_luts->get_degree(0),
message_and_carry_extract_luts->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 1),
message_and_carry_extract_luts->get_degree(1),
message_and_carry_extract_luts->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 2),
message_and_carry_extract_luts->get_degree(2),
message_and_carry_extract_luts->get_max_degree(2),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, message_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_and_carry_extract_luts->get_lut(0, 3),
message_and_carry_extract_luts->get_degree(3),
message_and_carry_extract_luts->get_max_degree(3),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_extract_and_sanitize_bool_lut_f,
gpu_memory_allocated);
// We are always packing two LWEs. We just need to be sure we have enough
// space in the carry part to store a message of the same size as is in the
// message part.
@@ -237,65 +270,29 @@ template <typename Torus> struct zk_expand_mem {
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto lwe_index_in_list = i % num_lwes_in_kth;
PANIC_IF_FALSE(lwe_index < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
lwe_index, num_packed_msgs * num_lwes);
h_indexes_in[lwe_index] = lwe_index_in_list + offset;
h_indexes_out[lwe_index] =
num_packed_msgs * h_indexes_in[lwe_index] + i / num_lwes_in_kth;
PANIC_IF_FALSE(h_indexes_in[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_in[lwe_index], num_packed_msgs * num_lwes);
PANIC_IF_FALSE(h_indexes_out[lwe_index] < num_packed_msgs * num_lwes,
"Cuda error: index %d is beyond the max value %d",
h_indexes_out[lwe_index], num_packed_msgs * num_lwes);
// is_boolean_array tells us which input is a boolean and thus the
// related output needs boolean sanitization. It naturally has
// total_blocks entries, but h_indexes_out reaches
// message_modulus * ceil(total_blocks/2) - 1. When total_blocks is odd,
// the ceiling causes out-of-bounds access. Reading garbage "true" would
// set h_lut_indexes to an invalid index pointing to uninitialized
// memory instead of a real LUT. Rust pads is_boolean_array with FALSE
// to match.
PANIC_IF_FALSE(h_indexes_out[lwe_index] < is_boolean_array_len,
"Cuda error: index %d for is_boolean_array is out of "
"bounds (len is %d)",
h_indexes_out[lwe_index], is_boolean_array_len);
// If the input relates to a boolean, shift the LUT so the correct one
// with sanitization is used
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth;
}
message_and_carry_extract_luts->set_lwe_indexes(
streams.stream(0), streams.gpu_index(0), h_indexes_in, h_indexes_out);
auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
// Index generator for message/carry extraction LUTs
auto index_gen = [num_compact_lists,
num_lwes_per_compact_list =
this->num_lwes_per_compact_list,
num_packed_msgs, is_boolean_array,
h_indexes_out](Torus *h_lut_indexes, uint32_t) {
auto offset = 0;
for (int k = 0; k < num_compact_lists; k++) {
auto num_lwes_in_kth = num_lwes_per_compact_list[k];
for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
auto lwe_index = i + num_packed_msgs * offset;
auto boolean_offset =
is_boolean_array[h_indexes_out[lwe_index]] ? num_packed_msgs : 0;
h_lut_indexes[lwe_index] = i / num_lwes_in_kth + boolean_offset;
}
offset += num_lwes_in_kth;
}
};
message_and_carry_extract_luts->generate_and_broadcast_lut(
active_streams, {0, 1, 2, 3},
{message_extract_lut_f, carry_extract_lut_f,
message_extract_and_sanitize_bool_lut_f,
carry_extract_and_sanitize_bool_lut_f},
index_gen, true, {}, h_lut_indexes);
message_and_carry_extract_luts->broadcast_lut(active_streams);
message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, 2 * num_lwes, size_tracker, allocate_gpu_memory);

View File

@@ -183,10 +183,9 @@ void cuda_packing_keyswitch_lwe_list_to_glwe_128(
base_log, level_count, num_lwes);
}
void cuda_closest_representable_64_async(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log,
uint32_t level_count) {
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count) {
host_cuda_closest_representable(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<const uint64_t *>(input),
static_cast<uint64_t *>(output), base_log,

View File

@@ -10,6 +10,7 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
@@ -350,7 +351,6 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
Torus state =
init_decomposer_state(block_lwe_array_in[i], base_log, level_count);
uint32_t offset = i * level_count * (lwe_dimension_out + 1);
#pragma unroll 1
for (int j = 0; j < level_count; j++) {
KSTorus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
@@ -363,15 +363,16 @@ keyswitch(KSTorus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
lwe_acc_out[shmem_index] = local_lwe_out;
}
for (int offset = blockDim.y / 2; offset > 0; offset /= 2) {
__syncthreads();
if (tid <= lwe_dimension_out && threadIdx.y < offset) {
if (tid <= lwe_dimension_out) {
for (int offset = blockDim.y / 2; offset > 0 && threadIdx.y < offset;
offset /= 2) {
__syncthreads();
lwe_acc_out[shmem_index] +=
lwe_acc_out[shmem_index + offset * blockDim.x];
}
if (threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
if (tid <= lwe_dimension_out && threadIdx.y == 0)
block_lwe_array_out[tid] = -lwe_acc_out[shmem_index];
}
template <typename Torus, typename KSTorus>

View File

@@ -12,9 +12,12 @@
#include "polynomial/polynomial_math.cuh"
#include "torus.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <thread>
#include <vector>
// Integer ceiling division: for a non-negative integer M and a positive
// integer N, evaluates to the smallest integer that is >= M / N.
//
// The whole expansion is parenthesized so that the macro composes correctly
// inside larger expressions: without the outer parentheses,
// `x * CEIL_DIV(a, b)` would expand to `x * (a + b - 1) / b` and
// `y / CEIL_DIV(a, b)` to `y / (a + b - 1) / b`, both of which are wrong.
//
// N is evaluated twice; do not pass arguments with side effects.
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
// Finish the keyswitching operation and prepare GLWEs for accumulation.
// 1. Finish the keyswitching computation partially performed with a GEMM:
// - negate the dot product between the GLWE and KSK polynomial

View File

@@ -6,7 +6,7 @@
#include "helper_multi_gpu.h"
#include "polynomial/parameters.cuh"
#include "types/int128.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
#include <limits>
template <typename T>

View File

@@ -1,5 +1,4 @@
#include "device.h"
#include "utils/helper.cuh"
#include <cstdint>
#include <cuda_runtime.h>
#include <mutex>
@@ -7,27 +6,6 @@
#include <cuda_profiler_api.h>
#endif
void validate_device_ptr_and_gpu_index(const void *ptr, uint32_t gpu_index) {
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
if (attr.device != gpu_index || attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer.")
}
}
int validate_device_ptr(const void *ptr) {
GPU_ASSERT(ptr != nullptr, "Cuda error: null device ptr");
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
if (attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer.")
}
return attr.device;
}
uint32_t cuda_get_device() {
int device;
check_cuda_error(cudaGetDevice(&device));
@@ -269,12 +247,13 @@ void cuda_memcpy_with_size_tracking_async_to_gpu(void *dest, const void *src,
cudaStream_t stream,
uint32_t gpu_index,
bool gpu_memory_allocated) {
GPU_ASSERT(src != nullptr, "Cuda error: null device ptr");
if (size == 0 || !gpu_memory_allocated)
return;
validate_device_ptr_and_gpu_index(dest, gpu_index);
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid device pointer in async copy to GPU.")
}
cuda_set_device(gpu_index);
check_cuda_error(
@@ -301,16 +280,28 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
uint32_t gpu_index, bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
GPU_ASSERT(dest != nullptr,
"Cuda error: trying to copy gpu->gpu to null ptr");
GPU_ASSERT(src != nullptr,
"Cuda error: trying to copy gpu->gpu from null ptr");
int src_gpu_index = validate_device_ptr(src);
int dest_gpu_index = validate_device_ptr(dest);
cudaPointerAttributes attr_dest;
check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
PANIC_IF_FALSE(
attr_dest.type == cudaMemoryTypeDevice,
"Cuda error: invalid dest device pointer in copy from GPU to GPU.");
cudaPointerAttributes attr_src;
check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
PANIC_IF_FALSE(
attr_src.type == cudaMemoryTypeDevice,
"Cuda error: invalid src device pointer in copy from GPU to GPU.");
cuda_set_device(gpu_index);
if (src_gpu_index == dest_gpu_index) {
if (attr_src.device == attr_dest.device) {
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream));
} else {
check_cuda_error(cudaMemcpyPeerAsync(dest, dest_gpu_index, src,
src_gpu_index, size, stream));
check_cuda_error(cudaMemcpyPeerAsync(dest, attr_dest.device, src,
attr_src.device, size, stream));
}
}
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
@@ -358,7 +349,11 @@ void cuda_memset_with_size_tracking_async(void *dest, uint64_t val,
bool gpu_memory_allocated) {
if (size == 0 || !gpu_memory_allocated)
return;
validate_device_ptr_and_gpu_index(dest, gpu_index);
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, dest));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid dest device pointer in cuda memset.")
}
cuda_set_device(gpu_index);
check_cuda_error(cudaMemsetAsync(dest, val, size, stream));
}
@@ -388,7 +383,7 @@ void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
}
cuda_set_device(gpu_index);
int block_size = 256;
int num_blocks = CEIL_DIV(n, block_size);
int num_blocks = (n + block_size - 1) / block_size;
// Launch the kernel
cuda_set_value_kernel<Torus>
@@ -411,10 +406,13 @@ template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
/// so it should be avoided at all costs
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
GPU_ASSERT(dest != nullptr, "Cuda error: null host ptr");
if (size == 0)
return;
validate_device_ptr_and_gpu_index(src, gpu_index);
cudaPointerAttributes attr;
check_cuda_error(cudaPointerGetAttributes(&attr, src));
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
PANIC("Cuda error: invalid src device pointer in copy to CPU async.")
}
cuda_set_device(gpu_index);
check_cuda_error(

View File

@@ -188,7 +188,7 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
}
Index twiddle_shift = 1;
for (Index l = LOG2_DEGREE - 1; l > 5; --l) {
for (Index l = LOG2_DEGREE - 1; l >= 5; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
@@ -221,8 +221,8 @@ __device__ void NSMFFT_direct_2_2_params(double2 *A, double2 *fft_out,
tid = tid + STRIDE;
}
}
__syncthreads();
for (Index l = 5; l >= 1; --l) {
for (Index l = 4; l >= 1; --l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
@@ -425,7 +425,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
}
Index twiddle_shift = DEGREE;
for (Index l = 1; l <= 5; ++l) {
for (Index l = 1; l <= 4; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
@@ -459,7 +459,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
}
}
for (Index l = 6; l <= LOG2_DEGREE - 1; ++l) {
for (Index l = 5; l <= LOG2_DEGREE - 1; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
@@ -467,7 +467,7 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
// at this point registers are ready for the butterfly
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
@@ -495,7 +495,6 @@ __device__ void NSMFFT_inverse_2_2_params(double2 *A, double2 *buffer_regs,
tid = tid + STRIDE;
}
__syncthreads();
}
// last iteration
@@ -541,44 +540,6 @@ __global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
}
}
/*
 * Global batched FFT kernel (specialized variant).
 *
 * - Performs the FFT at half size: each block reads and writes
 *   params::degree / 2 double2 elements, at offset
 *   blockIdx.x * (params::degree / 2) in d_input / d_output.
 * - Unrolling a half-size FFT results in half size + 1 elements.
 * - This function must be instantiated with the actual degree; the half-size
 *   parameters are derived internally through HalfDegree<params>.
 * - The input is expected to be already compressed.
 *
 * Dynamic shared memory layout (sharedMemoryFFT): the first
 * params::degree / 2 entries are handed to the FFT routine as its first
 * argument, and the twiddle factors are stored right after them.
 *
 * NOTE(review): the template parameter SMD and the `buffer` argument are not
 * referenced in this body.
 * NOTE(review): the strided indexing below presumably assumes
 * blockDim.x == params::degree / params::opt -- confirm against the launch
 * site.
 */
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT_classical_specialized(double2 *d_input,
double2 *d_output,
double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
// For the specialized variant we will always have enough shared memory, so
// the FFT area is placed directly in shared memory.
double2 *fft = sharedMemoryFFT;
int tid = threadIdx.x;
// Twiddles live in shared memory right after the params::degree / 2 FFT
// entries.
double2 *shared_twiddles = fft + params::degree / 2;
// Per-thread storage for the params::opt / 2 elements this thread handles.
double2 fft_regs[params::opt / 2];
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
// Stage the twiddle factor into shared memory and load this block's input
// element; tid advances by params::degree / params::opt each iteration.
shared_twiddles[tid] = negtwiddles[tid];
fft_regs[i] = d_input[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Barrier: every thread's twiddle writes must be complete before the FFT
// routine is entered.
__syncthreads();
NSMFFT_direct_2_2_params<HalfDegree<params>>(fft, fft_regs, shared_twiddles);
__syncthreads();
// Write the per-thread results back using the same strided indexing as the
// load above.
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft_regs[i];
tid = tid + params::degree / params::opt;
}
}
/*
* global batch polynomial multiplication
* only used for fft tests

View File

@@ -68,15 +68,9 @@ struct alignas(16) f128 {
auto t = two_sum(a.lo, b.lo);
double hi = s.hi;
#ifdef __CUDA_ARCH__
double lo = __dadd_rn(s.lo, t.hi);
hi = __dadd_rn(hi, lo);
lo = __dsub_rn(lo, __dsub_rn(hi, s.hi));
#else
double lo = s.lo + t.hi;
hi = hi + lo;
lo = lo - (hi - s.hi);
#endif
return f128(hi, lo + t.lo);
}
@@ -110,13 +104,8 @@ struct alignas(16) f128 {
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
auto s = two_diff(a.hi, b.hi);
auto t = two_diff(a.lo, b.lo);
#ifdef __CUDA_ARCH__
s = quick_two_sum(s.hi, __dadd_rn(s.lo, t.hi));
return quick_two_sum(s.hi, __dadd_rn(s.lo, t.lo));
#else
s = quick_two_sum(s.hi, s.lo + t.hi);
return quick_two_sum(s.hi, s.lo + t.lo);
#endif
}
// Multiplication
@@ -231,16 +220,16 @@ struct f128x2 {
// Subtraction
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
const f128x2 &b) {
return f128x2(f128::sub_estimate(a.re, b.re),
f128::sub_estimate(a.im, b.im));
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
}
// Multiplication (complex multiplication)
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
const f128x2 &b) {
const f128 a_im_b_im = f128::mul(a.im, b.im);
f128 real_part =
f128::add(f128::mul(a.re, b.re), f128(-a_im_b_im.hi, -a_im_b_im.lo));
f128::add(f128::mul(a.re, b.re),
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
return f128x2(real_part, imag_part);
}
@@ -254,8 +243,8 @@ struct f128x2 {
// Subtraction-assignment operator
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
re = f128::sub_estimate(re, other.re);
im = f128::sub_estimate(im, other.im);
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
return *this;
}
@@ -272,20 +261,12 @@ struct f128x2 {
};
// Returns the raw 64-bit pattern of the double `d` as an unsigned integer.
//
// Device code uses the __double_as_longlong intrinsic; host code
// reinterprets the storage of `d` in place.
// NOTE(review): the host path reads a double through a uint64_t pointer,
// which is formally a strict-aliasing violation in C++; left unchanged here.
__host__ __device__ inline uint64_t double_to_bits(double d) {
#ifdef __CUDA_ARCH__
  return static_cast<uint64_t>(__double_as_longlong(d));
#else
  return *reinterpret_cast<uint64_t *>(&d);
#endif
}
// Builds a double whose raw 64-bit pattern is `bits`.
//
// Device code uses the __longlong_as_double intrinsic; host code
// reinterprets the storage of `bits` in place.
// NOTE(review): the host path reads a uint64_t through a double pointer,
// which is formally a strict-aliasing violation in C++; left unchanged here.
__host__ __device__ inline double bits_to_double(uint64_t bits) {
#ifdef __CUDA_ARCH__
  return __longlong_as_double(bits);
#else
  return *reinterpret_cast<double *>(&bits);
#endif
}
@@ -294,8 +275,6 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
const double A = ONE << 52;
const double B = ONE << 104;
const double C = ONE << 76;
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
// does not work here
const double D = 340282366920938500000000000000000000000.;
const __uint128_t threshold = (ONE << 104);
@@ -309,20 +288,15 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = A_bits | lower64;
double l_temp = bits_to_double(bits_l);
double l = l_temp - A;
uint64_t B_bits = double_to_bits(B);
uint64_t top64 = static_cast<uint64_t>(x >> 52);
uint64_t bits_h = B_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, A), __dsub_rn(h_temp, B));
#else
double l = l_temp - A;
double h = h_temp - B;
return (l + h);
#endif
} else {
uint64_t C_bits = double_to_bits(C);
@@ -336,20 +310,15 @@ __host__ __device__ inline double u128_to_f64(__uint128_t x) {
uint64_t bits_l = C_bits | lower64 | mask_part;
double l_temp = bits_to_double(bits_l);
double l = l_temp - C;
uint64_t D_bits = double_to_bits(D);
uint64_t top64 = static_cast<uint64_t>(x >> 76);
uint64_t bits_h = D_bits | top64;
double h_temp = bits_to_double(bits_h);
#ifdef __CUDA_ARCH__
return __dadd_rn(__dsub_rn(l_temp, C), __dsub_rn(h_temp, D));
#else
double l = l_temp - C;
double h = h_temp - D;
return (l + h);
#endif
}
}
@@ -420,8 +389,6 @@ __host__ __device__ inline f128 u128_to_signed_to_f128(__uint128_t x) {
__host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
auto x = f128::sub_estimate(a, f128::f128_floor(a));
// NOTE: for some reason __longlong_as_double(0x37f0000000000000ULL)
// does not work here
const double normalization = 340282366920938500000000000000000000000.;
#ifdef __CUDA_ARCH__
x.hi = __dmul_rn(x.hi, normalization);
@@ -431,7 +398,7 @@ __host__ __device__ inline __uint128_t u128_from_torus_f128(const f128 &a) {
x.lo *= normalization;
#endif
x = f128::add_estimate(x, f128(0.5, 0.0));
// TODO has to be round
x = f128::f128_floor(x);
__uint128_t x0 = f64_to_u128(x.hi);

View File

@@ -12,9 +12,8 @@
using Index = unsigned;
#define NEG_TWID(i) \
f128x2( \
f128(__ldg(&neg_twiddles_re_hi[(i)]), __ldg(&neg_twiddles_re_lo[(i)])), \
f128(__ldg(&neg_twiddles_im_hi[(i)]), __ldg(&neg_twiddles_im_lo[(i)])))
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
#define F64x4_TO_F128x2(f128x2_reg, ind) \
f128x2_reg.re.hi = dt_re_hi[ind]; \
@@ -76,11 +75,7 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
tid = tid + STRIDE;
}
__syncthreads();
@@ -91,11 +86,8 @@ __device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
w = NEG_TWID(tid / lane_mask + twiddle_shift);
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
@@ -159,11 +151,7 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
// keep one of the register for next iteration and store another one in sm
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
if (u_stays_in_register) {
F128x2_TO_F64x4(v[i], tid);
} else {
F128x2_TO_F64x4(u[i], tid);
}
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
tid = tid + STRIDE;
}
@@ -177,11 +165,8 @@ __device__ void negacyclic_backward_fft_f128(double *dt_re_hi, double *dt_re_lo,
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
if (u_stays_in_register) {
v[i] = w;
} else {
u[i] = w;
}
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
tid = tid + STRIDE;
}
@@ -233,7 +218,7 @@ __device__ void convert_u128_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re, const __uint128_t *in_im) {
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
@@ -256,7 +241,7 @@ __device__ void convert_u128_on_regs_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re_on_regs, const __uint128_t *in_im_on_regs) {
const double normalization = __longlong_as_double(0x37f0000000000000ULL);
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {

View File

@@ -12,7 +12,7 @@
#include "integer/subtraction.cuh"
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "utils/helper.cuh"
#include "utils/kernel_dimensions.cuh"
// lwe_dimension + 1 threads
// todo: This kernel MUST be refactored to a binary reduction
@@ -98,7 +98,7 @@ __host__ void are_all_comparisons_block_true(
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -134,26 +134,28 @@ __host__ void are_all_comparisons_block_true(
auto is_equal_to_num_blocks_lut_f = [chunk_length](Torus x) -> Torus {
return x == chunk_length;
};
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0),
is_max_value_lut->get_lut(0, 1), is_max_value_lut->get_degree(1),
is_max_value_lut->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
is_equal_to_num_blocks_lut_f, true,
are_all_block_true_buffer->preallocated_h_lut);
auto num_blocks = is_max_value_lut->num_blocks;
Torus *h_lut_indexes = is_max_value_lut->h_lut_indexes;
for (int index = 0; index < num_chunks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else {
h_lut_indexes[index] = 0;
}
}
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_streams =
streams.active_gpu_subset(num_chunks, params.pbs_type);
// Index generator: last chunk uses LUT 1, others use LUT 0
auto index_gen = [num_chunks, num_blocks](Torus *h_lut_indexes,
uint32_t) {
for (uint32_t index = 0; index < num_blocks; index++) {
if (index == num_chunks - 1) {
h_lut_indexes[index] = 1;
} else if (index < num_chunks - 1 || index >= num_chunks) {
h_lut_indexes[index] = 0;
}
}
};
is_max_value_lut->generate_and_broadcast_lut(
active_streams, {1}, {is_equal_to_num_blocks_lut_f}, index_gen,
true, {are_all_block_true_buffer->preallocated_h_lut});
is_max_value_lut->broadcast_lut(active_streams);
}
lut = is_max_value_lut;
}
@@ -165,10 +167,15 @@ __host__ void are_all_comparisons_block_true(
streams, lwe_array_out, accumulator, bsks, ksks, lut, 1);
// Reset max_value_lut_indexes before returning, otherwise if the lut is
// reused the lut indexes will be wrong
memset(is_max_value_lut->h_lut_indexes, 0,
is_max_value_lut->num_blocks * sizeof(Torus));
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0));
auto active_gpu_count_is_max = streams.active_gpu_subset(
is_max_value_lut->num_blocks, params.pbs_type);
is_max_value_lut->set_lut_indexes_and_broadcast_constant(
active_gpu_count_is_max, 0);
is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
@@ -215,7 +222,7 @@ __host__ void is_at_least_one_comparisons_block_true(
uint32_t remaining_blocks = num_radix_blocks;
while (remaining_blocks > 0) {
// Split in max_value chunks
int num_chunks = CEIL_DIV(remaining_blocks, max_value);
int num_chunks = (remaining_blocks + max_value - 1) / max_value;
// Since all blocks encrypt either 0 or 1, we can sum max_value of them
// as in the worst case we will be adding `max_value` ones
@@ -476,11 +483,14 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
y = x;
f = sign_handler_f;
}
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true,
tree_buffer->preallocated_h_lut);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
last_lut->generate_and_broadcast_lut(active_streams, {0}, {f},
LUT_0_FOR_ALL_BLOCKS, true,
{tree_buffer->preallocated_h_lut});
last_lut->broadcast_lut(active_streams);
// Last leaf
integer_radix_apply_univariate_lookup_table<Torus>(streams, lwe_array_out, y,

View File

@@ -10,122 +10,7 @@
#include "integer/integer.cuh"
#include "linearalgebra/multiplication.cuh"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"
/*
* =============================================================================
* GPU Compression/Decompression Algorithm: Overview
* =============================================================================
*
* The compression algorithm transforms standard LWE ciphertexts into a compact
* packed format. Decompression reverses this process.
*
* -----------------------------------------------------------------------------
* COMPRESSION INPUT (lwe_array_in)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | lwe_array_in (GPU memory) |
* +-------------------------------------------------------------------------+
* +---------------------------+---------------------------+-----------------+
* | LWE 0 | LWE 1 | ... |
* | [mask, body] | [mask, body] | |
* +---------------------------+---------------------------+-----------------+
* |<-- lwe_dimension + 1 -->|
*
* Total LWEs: total_lwe_bodies_count (num_radix_blocks)
*
* -----------------------------------------------------------------------------
* COMPRESSION PROCESS
* -----------------------------------------------------------------------------
*
* 1. Message Shift (64-bit only):
* Each LWE is multiplied by message_modulus to shift the message to MSB
*
* 2. Packing Keyswitch (LWE -> GLWE):
* Groups of up to lwe_per_glwe LWEs are packed into a single GLWE:
*
* +--------------------------------------------------------------+
* | lwe_per_glwe LWEs (input batch) |
* | LWE[0], LWE[1], ..., LWE[lwe_per_glwe-1] |
* +--------------------------------------------------------------+
* |
* Packing Keyswitch
* v
* +--------------------------------------------------------------+
* | Single GLWE Ciphertext |
* | [A_0, A_1, ..., A_{k-1}, B] |
* | |<-- k * polynomial_size -->| |<-- polynomial_size -->| |
* +--------------------------------------------------------------+
*
* Number of output GLWEs: num_glwes = ceil(total_lwe_bodies_count /
* lwe_per_glwe)
*
* 3. Modulus Switch:
* Reduce precision from 64-bit torus to storage_log_modulus bits
*
* 4. Bit Packing:
* Pack multiple reduced-precision elements into dense bit representation
*
* -----------------------------------------------------------------------------
* COMPRESSION MEMORY LAYOUT (tmp_glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | tmp_glwe_array_out (intermediate buffer) |
* +-------------------------------------------------------------------------+
* +----------------------------+----------------------------+---------------+
* | GLWE 0 | GLWE 1 | ... |
* | [A_0..A_{k-1}, B_0..B_N] | [A_0..A_{k-1}, B_0..B_N] | |
* +----------------------------+----------------------------+---------------+
* |<-- glwe_accumulator_size = (k+1)*N -->|
*
* Total size needed: num_glwes * glwe_accumulator_size elements
* Where: num_glwes = ceil(total_lwe_bodies_count / lwe_per_glwe)
*
* -----------------------------------------------------------------------------
* PACKED OUTPUT (glwe_array_out)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Packed GLWE Ciphertext List (bit-packed) |
* +-------------------------------------------------------------------------+
* +-------------------------------------------------------------------------+
* | Elements packed with storage_log_modulus bits per original element |
* | Total packed size: ceil(in_len * storage_log_modulus / 64) elements |
* +-------------------------------------------------------------------------+
*
* =============================================================================
* DECOMPRESSION (Extract) Algorithm
* =============================================================================
*
* Decompression receives an array of LWE indexes. For each index, it identifies
* the corresponding GLWE, extracts that GLWE from the packed representation,
* and then sample-extracts the requested LWE from the GLWE.
*
* -----------------------------------------------------------------------------
* EXTRACT OUTPUT LAYOUT (glwe_array_out in host_extract)
* -----------------------------------------------------------------------------
*
* +-------------------------------------------------------------------------+
* | Extracted GLWE Ciphertext |
* +-------------------------------------------------------------------------+
* +---------------------------------------+-----------------+---------------+
* | Mask (A polynomials) | Body (B) | Tail |
* | [A_0, ..., A_{k-1}] | (body_count) | (zeroed) |
* | k * polynomial_size elements | elements | elements |
* +---------------------------------------+-----------------+---------------+
* |<------------------- initial_out_len ------------------->| |
* |<------------------------ glwe_ciphertext_size ------------------------->|
*
* For the last GLWE, body_count may be less than polynomial_size (partial).
* The tail region must be zeroed to ensure defined behavior.
*
* tail_size = glwe_ciphertext_size - initial_out_len
* tail_offset = initial_out_len (NOT 0!)
*
* =============================================================================
*/
#include "utils/kernel_dimensions.cuh"
template <typename Torus>
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
@@ -181,7 +66,7 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto nbits = sizeof(Torus) * 8;
auto out_len = CEIL_DIV(number_bits_to_pack, nbits);
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
int num_blocks = 0, num_threads = 0;
getNumBlocksAndThreads(out_len, 1024, num_blocks, num_threads);
@@ -223,8 +108,6 @@ host_integer_compress(CudaStreams streams,
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
glwe_array_out->lwe_per_glwe - 1) /
glwe_array_out->lwe_per_glwe;
PANIC_IF_FALSE(num_glwes <= mem_ptr->max_num_glwes,
"Invalid number of GLWEs");
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -317,7 +200,8 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
uint32_t num_glwes = CEIL_DIV(total_lwe_bodies_count, polynomial_size);
uint32_t num_glwes =
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
// Compressed length of the compressed GLWE we want to extract
uint32_t body_count = 0;
@@ -334,21 +218,19 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
auto nbits = sizeof(Torus) * 8;
// Calculate how many bits a full-packed GLWE uses, to determine
// the stride between consecutive packed GLWEs in the input buffer
auto number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = CEIL_DIV(number_bits_to_unpack, nbits);
// Calculates how many bits a full-packed GLWE shall use
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Uses that length to set the input pointer
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
// The extract kernel writes initial_out_len elements starting at offset 0.
// We must zero the tail region (from initial_out_len to
// glwe_ciphertext_size)
if (initial_out_len < glwe_ciphertext_size) {
cuda_memset_async(glwe_array_out + initial_out_len, 0,
cuda_memset_async(glwe_array_out, 0,
(glwe_ciphertext_size - initial_out_len) * sizeof(Torus),
stream, gpu_index);
}

View File

@@ -14,6 +14,7 @@
#include "utils/helper.cuh"
#include "utils/helper_multi_gpu.cuh"
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
#include <algorithm>
#include <functional>
@@ -272,7 +273,8 @@ __global__ void device_radix_split_simulators_and_grouping_pgns(
}
}
if ((blockIdx.x / group_size + 1) < CEIL_DIV(blocks_count, group_size)) {
if ((blockIdx.x / group_size + 1) <
(blocks_count + group_size - 1) / group_size) {
size_t src_offset = (blockIdx.x + group_size - 1) * lwe_size;
size_t pgns_offset = (blockIdx.x / group_size) * lwe_size;
for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
@@ -361,7 +363,7 @@ __host__ void host_radix_sum_in_groups(cudaStream_t stream, uint32_t gpu_index,
num_radix_blocks > src1->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks should have more "
"blocks than the number used in sum in groups")
auto num_groups = CEIL_DIV(num_radix_blocks, group_size);
auto num_groups = (num_radix_blocks + group_size - 1) / group_size;
if (src2->num_radix_blocks < num_groups)
PANIC("Cuda error: second input in sum in groups should have at least "
"num_groups blocks")
@@ -542,24 +544,6 @@ __host__ void integer_radix_apply_univariate_lookup_table(
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
// Verify consistency between set_lut_indexes and apply_lookup_table
GPU_ASSERT(
num_radix_blocks <= lut->last_broadcast_num_radix_blocks,
"num_radix_blocks (%u) must match last_broadcast_num_radix_blocks (%u)",
num_radix_blocks, lut->last_broadcast_num_radix_blocks);
GPU_ASSERT(active_streams.count() <= lut->last_broadcast_streams.count(),
"active_streams count (%u) must match last_broadcast_streams "
"count (%u)",
active_streams.count(), lut->last_broadcast_streams.count());
for (uint32_t i = 0; i < active_streams.count(); i++) {
GPU_ASSERT(active_streams.gpu_index(i) ==
lut->last_broadcast_streams.gpu_index(i),
"active_streams gpu_index(%u) = %u must match "
"last_broadcast_streams gpu_index(%u) = %u",
i, active_streams.gpu_index(i), i,
lut->last_broadcast_streams.gpu_index(i));
}
if (active_streams.count() == 1) {
execute_keyswitch_async<Torus>(
streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
@@ -978,9 +962,8 @@ uint64_t generate_many_lookup_table(
template <typename Torus>
void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
uint32_t polynomial_size,
std::function<Torus(Torus)> f) {
std::function<Torus(uint32_t)> f) {
// accumulator number of elements is (glwe_dimension + 1) * polynomial_size
memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
auto body = &acc[glwe_dimension * polynomial_size];
@@ -992,9 +975,9 @@ void generate_lookup_table_no_encoding(Torus *acc, uint32_t glwe_dimension,
template <typename Torus>
void generate_device_accumulator_no_encoding(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t &degree,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t polynomial_size, std::function<Torus(Torus)> f,
uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
bool gpu_memory_allocated) {
Torus *h_lut =
@@ -1003,7 +986,7 @@ void generate_device_accumulator_no_encoding(
generate_lookup_table_no_encoding<Torus>(h_lut, glwe_dimension,
polynomial_size, f);
*degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
degree = (uint64_t)message_modulus * (uint64_t)carry_modulus * 2;
cuda_memcpy_with_size_tracking_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
@@ -1755,9 +1738,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
signs_array_in, 0, num_sign_blocks);
if (num_sign_blocks > 2) {
auto lut = diff_buffer->reduce_signs_lut;
lut->generate_and_broadcast_lut(
lut->active_streams, {0}, {reduce_two_orderings_function},
LUT_0_FOR_ALL_BLOCKS, true, {diff_buffer->preallocated_h_lut1});
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
reduce_two_orderings_function, true, diff_buffer->preallocated_h_lut1);
lut->broadcast_lut(lut->active_streams);
while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
@@ -1783,10 +1769,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = diff_buffer->reduce_signs_lut;
lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
LUT_0_FOR_ALL_BLOCKS, true,
{diff_buffer->preallocated_h_lut2});
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
pack_blocks<Torus>(streams.stream(0), streams.gpu_index(0), signs_b,
signs_a, num_sign_blocks, message_modulus);
@@ -1801,9 +1789,12 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
};
auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
lut->generate_and_broadcast_lut(lut->active_streams, {0}, {final_lut_f},
LUT_0_FOR_ALL_BLOCKS, true,
{diff_buffer->preallocated_h_lut2});
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, final_lut_f, true,
diff_buffer->preallocated_h_lut2);
lut->broadcast_lut(lut->active_streams);
integer_radix_apply_univariate_lookup_table<Torus>(
streams, signs_array_out, signs_a, bsks, ksks, lut, 1);
@@ -2353,7 +2344,7 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
// Since the radix ciphertexts are packed, we have to use the num_radix_blocks
// from the output ct
auto active_streams = streams.active_gpu_subset_u128(
auto active_streams = streams.active_gpu_subset(
lwe_array_out->num_radix_blocks, params.pbs_type);
if (active_streams.count() == 1) {
execute_keyswitch_async<InputTorus>(

Some files were not shown because too many files have changed in this diff Show More