Compare commits

..

2 Commits

Author SHA1 Message Date
Enzo Di Maria
f3f16da856 chore(gpu): some tests on erc20 2025-12-11 14:09:32 +01:00
Enzo Di Maria
368b3c1b87 refactor(gpu): creating benchmarks for match_value 2025-12-11 10:31:44 +01:00
333 changed files with 4647 additions and 15002 deletions

View File

@@ -2,8 +2,6 @@
ignore = [
# Ignoring unmaintained 'paste' advisory as it is a widely used, low-risk build dependency.
"RUSTSEC-2024-0436",
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex on the short term.
"RUSTSEC-2025-0141",
]
[output]

View File

@@ -23,8 +23,6 @@ runs:
echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
sha256sum -c checksum
sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update
sudo apt remove -y unattended-upgrades
sudo apt install -y cmake-format libclang-dev

View File

@@ -66,7 +66,7 @@ jobs:
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'true' # Needed to pull lfs data
token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +80,7 @@ jobs:
- name: Retrieve data from cache
id: retrieve-data-cache
uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor
@@ -109,7 +109,7 @@ jobs:
- name: Store data in cache
if: steps.retrieve-data-cache.outputs.cache-hit != 'true'
continue-on-error: true
uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
utils/tfhe-backward-compat-data/**/*.cbor

View File

@@ -63,7 +63,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -71,7 +71,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
dependencies:
@@ -171,7 +171,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -219,7 +219,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm
@@ -232,7 +232,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -50,7 +50,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -58,7 +58,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
integer:
@@ -112,7 +112,7 @@ jobs:
timeout-minutes: 480 # 8 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -60,7 +60,7 @@ jobs:
timeout-minutes: 1440
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -51,7 +51,7 @@ jobs:
steps.changed-files.outputs.integer_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -59,7 +59,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
integer:
@@ -112,7 +112,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -72,7 +72,7 @@ jobs:
any_file_changed: ${{ env.IS_PULL_REQUEST == 'false' || steps.aggregated-changes.outputs.any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -80,7 +80,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
dependencies:
@@ -182,7 +182,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -64,7 +64,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -80,7 +80,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm
@@ -93,7 +93,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |

View File

@@ -1,8 +1,6 @@
# Run benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: benchmark_cpu
run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.op_flavor }}, ${{ inputs.precisions_set }}, ${{ inputs.params_type }})
on:
workflow_dispatch:
inputs:

View File

@@ -149,7 +149,7 @@ jobs:
params_type: ${{ fromJSON(needs.prepare-matrix.outputs.params_type) }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -223,13 +223,13 @@ jobs:
results_type: ${{ inputs.additional_results_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -49,7 +49,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -99,13 +99,13 @@ jobs:
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_ct_key_sizes
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -8,13 +8,8 @@ on:
description: "Run CPU benchmarks"
type: boolean
default: true
# GPU benchmarks are split because of resource scarcity.
run-gpu-integer-benchmarks:
description: "Run GPU integer benchmarks"
type: boolean
default: true
run-gpu-core-crypto-benchmarks:
description: "Run GPU core-crypto benchmarks"
run-gpu-benchmarks:
description: "Run GPU benchmarks"
type: boolean
default: true
run-hpu-benchmarks:
@@ -57,7 +52,7 @@ jobs:
run-benchmarks-gpu-integer:
name: benchmark_documentation/run-benchmarks-gpu-integer
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-integer-benchmarks
if: inputs.run-gpu-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -118,7 +113,7 @@ jobs:
run-benchmarks-gpu-core-crypto:
name: benchmark_documentation/run-benchmarks-gpu-core-crypto
uses: ./.github/workflows/benchmark_gpu_common.yml
if: inputs.run-gpu-core-crypto-benchmarks
if: inputs.run-gpu-benchmarks
with:
profile: multi-h100-sxm5
hardware_name: n3-H100-SXM5x8
@@ -138,7 +133,7 @@ jobs:
generate-svgs-with-benchmarks-run:
name: benchmark-documentation/generate-svgs-with-benchmarks-run
if: ${{ always() &&
(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks ||inputs.run-hpu-benchmarks) &&
(inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks ||inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
needs: [
run-benchmarks-cpu-integer, run-benchmarks-gpu-integer, run-benchmarks-hpu-integer,
@@ -148,7 +143,7 @@ jobs:
with:
time_span_days: 5
generate-cpu-svgs: ${{ inputs.run-cpu-benchmarks }}
generate-gpu-svgs: ${{ inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks }}
generate-gpu-svgs: ${{ inputs.run-gpu-benchmarks }}
generate-hpu-svgs: ${{ inputs.run-hpu-benchmarks }}
secrets:
DATA_EXTRACTOR_DATABASE_USER: ${{ secrets.DATA_EXTRACTOR_DATABASE_USER }}
@@ -157,7 +152,7 @@ jobs:
generate-svgs-without-benchmarks-run:
name: benchmark-documentation/generate-svgs-without-benchmarks-run
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-integer-benchmarks || inputs.run-gpu-core-crypto-benchmarks || inputs.run-hpu-benchmarks) &&
if: ${{ !(inputs.run-cpu-benchmarks || inputs.run-gpu-benchmarks || inputs.run-hpu-benchmarks) &&
inputs.generate-svgs }}
uses: ./.github/workflows/generate_svgs.yml
with:
@@ -180,12 +175,12 @@ jobs:
PATH_TO_DOC_ASSETS: tfhe/docs/.gitbook/assets
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
- name: Download SVG tables
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
path: svg_tables
merge-multiple: 'true'
@@ -203,7 +198,7 @@ jobs:
echo "date=$(date '+%g_%m_%d_%Hh%Mm%Ss')" >> "${GITHUB_OUTPUT}"
- name: Create pull-request
uses: peter-evans/create-pull-request@98357b18bf14b5342f975ff684046ec3b2a07725 # v8.0.0
uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
with:
sign-commits: true # Commit will be signed by github-actions bot
add-paths: ${{ env.PATH_TO_DOC_ASSETS }}/*.svg

View File

@@ -1,8 +1,6 @@
# Run CUDA benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: benchmark_gpu
run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.profile }}, ${{ inputs.op_flavor }}, ${{ inputs.precisions_set }}, ${{ inputs.params_type }})
on:
workflow_dispatch:
inputs:

View File

@@ -40,7 +40,7 @@ jobs:
timeout-minutes: 1440 # 24 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -63,7 +63,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab
@@ -89,7 +89,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_integer_multi_bit_gpu_default
path: ${{ env.RESULTS_FILENAME }}
@@ -123,7 +123,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -146,7 +146,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab
@@ -173,7 +173,7 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_core_crypto
path: ${{ env.RESULTS_FILENAME }}

View File

@@ -175,7 +175,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -209,7 +209,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -281,13 +281,13 @@ jobs:
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_${{ matrix.command }}_${{ matrix.op_flavor }}_${{ inputs.profile }}_${{ matrix.bench_type }}_${{ matrix.params_type }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -130,7 +130,7 @@ jobs:
git lfs install
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
path: tfhe-rs
persist-credentials: false
@@ -141,7 +141,7 @@ jobs:
ls
- name: Checkout fhevm
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
repository: zama-ai/fhevm
persist-credentials: 'false'
@@ -192,10 +192,10 @@ jobs:
cargo install sqlx-cli
- name: Install foundry
uses: foundry-rs/foundry-toolchain@8b0419c685ef46cb79ec93fbdc131174afceb730
uses: foundry-rs/foundry-toolchain@50d5a8956f2e319df19e6b57539d7e2acb9f8c1e
- name: Cache cargo
uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
with:
path: |
~/.cargo/registry
@@ -223,7 +223,7 @@ jobs:
working-directory: fhevm/coprocessor/fhevm-engine/tfhe-worker
- name: Use Node.js
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: 20.x
@@ -262,7 +262,7 @@ jobs:
- name: Upload profile artifact
env:
REPORT_NAME: ${{ steps.nsys_profile_name.outputs.profile }}
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ env.REPORT_NAME }}
path: fhevm/coprocessor/fhevm-engine/tfhe-worker/${{ env.REPORT_NAME }}
@@ -293,13 +293,13 @@ jobs:
working-directory: fhevm/
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${COMMIT_SHA}_${BENCHMARKS}_${{ needs.parse-inputs.outputs.profile }}
path: fhevm/$${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -1,8 +1,6 @@
# Run benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: benchmark_hpu
run-name: ${{ inputs.command }}::${{ inputs.bench_type}} (${{ inputs.op_flavor }}, ${{ inputs.precisions_set }})
on:
workflow_dispatch:
inputs:

View File

@@ -126,7 +126,7 @@ jobs:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -185,13 +185,13 @@ jobs:
BENCH_TYPE: ${{ matrix.bench_type }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_${{ matrix.bench_type }}_${{ matrix.command }}_benchmarks
name: ${{ github.sha }}_${{ matrix.bench_type }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -50,7 +50,7 @@ jobs:
pull-requests: write # Needed to write a comment in a pull-request
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -164,7 +164,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -191,7 +191,7 @@ jobs:
command: ${{ fromJson(needs.prepare-benchmarks.outputs.commands) }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0 # Needed to get commit hash
persist-credentials: 'false'
@@ -245,7 +245,7 @@ jobs:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab
@@ -280,7 +280,7 @@ jobs:
BENCH_TYPE: ${{ env.__TFHE_RS_BENCH_TYPE }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_regression_${{ env.RESULTS_FILE_SHA }} # RESULT_FILE_SHA is needed to avoid collision between matrix.command runs
path: ${{ env.RESULTS_FILENAME }}
@@ -305,7 +305,7 @@ jobs:
REF_NAME: ${{ github.head_ref || github.ref_name }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -96,13 +96,13 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_fft
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -55,7 +55,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -96,13 +96,13 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_ntt
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -39,7 +39,7 @@ jobs:
wasm_bench: ${{ steps.changed-files.outputs.wasm_bench_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -47,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
wasm_bench:
@@ -91,7 +91,7 @@ jobs:
browser: [ chrome, firefox ]
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -119,7 +119,7 @@ jobs:
- name: Node cache restoration
id: node-cache
uses: actions/cache/restore@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
with:
path: |
~/.nvm
@@ -132,7 +132,7 @@ jobs:
make install_node
- name: Node cache save
uses: actions/cache/save@9255dc7a253b0ccc959486e2bca901246202afeb #v5.0.1
uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 #v4.3.0
if: steps.node-cache.outputs.cache-hit != 'true'
with:
path: |
@@ -153,12 +153,6 @@ jobs:
env:
BROWSER: ${{ matrix.browser }}
- name: Run benchmarks (unsafe coop)
run: |
make bench_web_js_api_unsafe_coop_"${BROWSER}"_ci
env:
BROWSER: ${{ matrix.browser }}
- name: Parse results
run: |
make parse_wasm_benchmarks
@@ -175,13 +169,13 @@ jobs:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_wasm_${{ matrix.browser }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: zama-ai/slab
path: slab

View File

@@ -26,7 +26,7 @@ jobs:
name: cargo_audit/audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
outputs:
matrix_command: ${{ steps.set-pcc-commands-matrix.outputs.commands }}
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -140,7 +140,7 @@ jobs:
result: ${{ steps.set_builds_result.outputs.result }}
steps:
- name: Checkout tfhe-rs repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -26,7 +26,7 @@ jobs:
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -24,7 +24,7 @@ jobs:
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -29,7 +29,7 @@ jobs:
fft_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.fft_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -37,7 +37,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
fft:
@@ -56,7 +56,7 @@ jobs:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -92,7 +92,7 @@ jobs:
if: needs.should-run.outputs.fft_test == 'true'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -31,7 +31,7 @@ jobs:
ntt_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.ntt_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: "false"
@@ -39,7 +39,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
ntt:
@@ -87,7 +87,7 @@ jobs:
os: ${{fromJson(needs.setup-instance.outputs.matrix_os)}}
fail-fast: false
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -50,7 +50,7 @@ jobs:
version: ${{ steps.get_zizmor.outputs.version }}
- name: Ensure SHA pinned actions
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@6124774845927d14c601359ab8138699fa5b70c3 # v4.0.1
uses: zgosalvez/github-actions-ensure-sha-pinned-actions@9e9574ef04ea69da568d6249bd69539ccc704e74 # v4.0.0
with:
allowlist: |
slsa-framework/slsa-github-generator

View File

@@ -50,7 +50,7 @@ jobs:
timeout-minutes: 5760 # 4 days
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
@@ -62,7 +62,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
tfhe:
@@ -92,7 +92,7 @@ jobs:
make test_shortint_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}
@@ -106,7 +106,7 @@ jobs:
make test_integer_cov
- name: Upload tfhe coverage to Codecov
uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de
uses: codecov/codecov-action@5a1091511ad55cbe89839c7260b706298ca349f7
if: steps.changed-files.outputs.tfhe_any_changed == 'true'
with:
token: ${{ secrets.CODECOV_TOKEN }}

View File

@@ -62,7 +62,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -43,7 +43,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
@@ -75,22 +75,13 @@ jobs:
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
- name: Upload tables
if: inputs.backend_comparison == false
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
# This will upload all the file generated
path: ${{ inputs.output_filename }}*.svg
retention-days: 60
- name: Produce backends comparison table from database
if: inputs.backend_comparison == true
run: |
python3 -m pip install -r ci/data_extractor/requirements.txt
python3 ci/data_extractor/src/data_extractor.py "${OUTPUT_FILENAME}" \
--generate-svg \
--backends-comparison \
--backend-comparison\
--time-span-days "${TIME_SPAN}"
env:
OUTPUT_FILENAME: ${{ inputs.output_filename }}
@@ -99,11 +90,10 @@ jobs:
DATA_EXTRACTOR_DATABASE_HOST: ${{ secrets.DATA_EXTRACTOR_DATABASE_HOST }}
DATA_EXTRACTOR_DATABASE_PASSWORD: ${{ secrets.DATA_EXTRACTOR_DATABASE_PASSWORD }}
- name: Upload comparison tables
if: inputs.backend_comparison == true
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
- name: Upload tables
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4
with:
name: ${{ github.sha }}_backends_comparison_tables
name: ${{ github.sha }}_${{ inputs.backend }}_${{ inputs.layer }}_${{ inputs.pbs_kind }}_${{ inputs.bench_type }}_tables
# This will upload all the file generated
path: ${{ inputs.output_filename }}*.svg
retention-days: 60

View File

@@ -41,7 +41,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -23,8 +23,8 @@ on:
# Allows you to run this workflow manually from the Actions tab as an alternative.
workflow_dispatch:
schedule:
# every month
- cron: "0 0 1 * *"
# every 3 months
- cron: "0 0 1 */3 *"
permissions:
contents: read
@@ -50,7 +50,7 @@ jobs:
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: hyperstack
profile: single-h100
profile: gpu-test
# This instance will be spawned especially for pull-request from forked repository
- name: Start GitHub instance
@@ -79,7 +79,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -39,7 +39,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -47,7 +47,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -114,7 +114,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -68,7 +68,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -65,7 +65,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -78,7 +78,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -74,7 +74,7 @@ jobs:
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -116,7 +116,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -40,7 +40,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -48,7 +48,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -129,7 +129,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
gpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.gpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -49,7 +49,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
gpu:
@@ -117,7 +117,7 @@ jobs:
gcc: 11
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -32,7 +32,7 @@ jobs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -40,7 +40,7 @@ jobs:
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@e0021407031f5be11a464abee9a0776171c79891 # v47.0.1
uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0
with:
files_yaml: |
hpu:
@@ -83,7 +83,7 @@ jobs:
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -53,7 +53,7 @@ jobs:
timeout-minutes: 4320 # 72 hours
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

View File

@@ -41,7 +41,7 @@ jobs:
timeout-minutes: 720
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: "false"
token: ${{ env.CHECKOUT_TOKEN }}

View File

@@ -52,7 +52,7 @@ jobs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -62,7 +62,7 @@ jobs:
PACKAGE: ${{ inputs.package-name }}
run: |
cargo package -p "${PACKAGE}"
- uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
- uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: crate-${{ inputs.package-name }}
path: target/package/*.crate
@@ -93,14 +93,14 @@ jobs:
id-token: write # Needed for OIDC token exchange on crates.io
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Download artifact
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: crate-${{ inputs.package-name }}
path: target/package

View File

@@ -64,7 +64,7 @@ jobs:
CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }}
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
persist-credentials: "false"
@@ -104,7 +104,7 @@ jobs:
run: |
cargo package -p tfhe-cuda-backend
- uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
- uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
with:
name: crate-tfhe-cuda-backend
path: target/package/*.crate
@@ -174,7 +174,7 @@ jobs:
GCC_VERSION: ${{ matrix.gcc }}
- name: Download artifact
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
with:
name: crate-tfhe-cuda-backend
path: target/package

View File

@@ -41,7 +41,6 @@ jobs:
make-release:
name: make_release_tfhe/make-release
uses: ./.github/workflows/make_release_common.yml
if: ${{ inputs.push_to_crates }}
with:
package-name: "tfhe"
dry-run: ${{ inputs.dry_run }}
@@ -60,7 +59,6 @@ jobs:
make-release-js:
name: make_release_tfhe/make-release-js
needs: make-release
if: ${{ always() && needs.make-release.result != 'failure' }}
runs-on: ubuntu-latest
# For provenance of npmjs publish
permissions:
@@ -68,7 +66,7 @@ jobs:
id-token: write # also needed for OIDC token exchange on crates.io and npmjs.com
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
with:
fetch-depth: 0
persist-credentials: 'false'
@@ -85,9 +83,9 @@ jobs:
make build_web_js_api_parallel
- name: Authenticate on NPM
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # v6.1.0
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # v6.0.0
with:
node-version: '24'
node-version: '22'
registry-url: 'https://registry.npmjs.org'
- name: Publish web package

View File

@@ -60,7 +60,7 @@ jobs:
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
@@ -71,7 +71,7 @@ jobs:
toolchain: stable
- name: Checkout lattice-estimator
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3
with:
repository: malb/lattice-estimator
path: lattice_estimator

View File

@@ -1,67 +0,0 @@
name: pr_milestone_check
on:
pull_request:
types: [opened, edited, synchronize, reopened, milestoned, demilestoned]
permissions: {}
# zizmor: ignore[concurrency-limits] only Zama organization members can trigger this workflow
# external contributors workflows are manually approved
jobs:
check-empty-milestone:
name: pr_milestone_check/check-empty-milestone
runs-on: ubuntu-latest
if: github.event.pull_request.milestone == null
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone Missing
Please assign a milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is missing. This check is failing."
exit 1
check-milestone-open:
name: pr_milestone_check/check-milestone-open
runs-on: ubuntu-latest
if: github.event.pull_request.milestone != null && github.event.pull_request.milestone.state == 'closed'
permissions:
pull-requests: write # Need write access on pull requests to post comment
steps:
- name: Post Reminder Comment
uses: octokit/request-action@dad4362715b7fb2ddedf9772c8670824af564f0d # v2.4.0
with:
route: POST /repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments
body: |
'### ❌ Milestone is closed
Please assign an open milestone to this pull request. If your PR targets the next version of
TFHE-rs please use the current quarter milestone, e.g. "Q1 26".
If your PR targets a patch version for previous releases: consider creating a dedicated
milestone e.g. v1.5.1 if it does not exist yet.'
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Check Final Status
run: |
echo "::error::Milestone is closed. This check is failing."
exit 1

View File

@@ -17,7 +17,7 @@ jobs:
issues: read # Needed to fetch all issues
pull-requests: write # Needed to write message and close the PR
steps:
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
with:
stale-pr-message: 'This PR is unverified and has been open for 2 days, it will now be closed. If you want to contribute please sign the CLA as indicated by the bot.'
days-before-stale: 2

1
.gitignore vendored
View File

@@ -10,7 +10,6 @@ target/
**/*.rmeta
**/Cargo.lock
**/*.bin
**/.DS_Store
# Some of our bench outputs
/tfhe/benchmarks_parameters

View File

@@ -11,7 +11,7 @@
/tfhe/src/core_crypto/gpu @agnesLeroy
/tfhe/src/core_crypto/hpu @zama-ai/hardware
/tfhe/src/shortint/ @mayeul-zama @nsarlin-zama
/tfhe/src/shortint/ @mayeul-zama
/tfhe/src/integer/ @tmontaigu
/tfhe/src/integer/gpu @agnesLeroy
@@ -19,12 +19,8 @@
/tfhe/src/high_level_api/ @tmontaigu
/tfhe-zk-pok/ @nsarlin-zama
/tfhe-benchmark/ @soonum
/utils/ @nsarlin-zama
/Makefile @IceTDrinker @soonum
/mockups/tfhe-hpu-mockup @zama-ai/hardware

View File

@@ -27,7 +27,7 @@ rust-version = "1.91.1"
[workspace.dependencies]
aligned-vec = { version = "0.6", default-features = false }
bytemuck = "1.24"
dyn-stack = { version = "0.13", default-features = false }
dyn-stack = { version = "0.11", default-features = false }
itertools = "0.14"
num-complex = "0.4"
pulp = { version = "0.22", default-features = false }
@@ -36,8 +36,6 @@ rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.101"
getrandom = "0.2.8"
# The project maintainers consider that this is the last version of the 1.3 branch, any newer version should not be trusted
bincode = "=1.3.3"
[profile.bench]
lto = "fat"

View File

@@ -20,7 +20,7 @@ BENCH_TYPE?=latency
BENCH_PARAM_TYPE?=classical
BENCH_PARAMS_SET?=default
BENCH_CUSTOM_COMMAND:=
NODE_VERSION=24.12
NODE_VERSION=22.6
BACKWARD_COMPAT_DATA_DIR=utils/tfhe-backward-compat-data
BACKWARD_COMPAT_DATA_GEN_VERSION:=$(TFHE_VERSION)
TEST_VECTORS_DIR=apps/test-vectors
@@ -996,15 +996,6 @@ test_noise_check:
--features=boolean,shortint,integer -p tfhe -- noise_check \
--test-threads=1 --nocapture
.PHONY: test_noise_check_gpu # Run dedicated noise and pfail check tests on gpu backend
test_noise_check_gpu:
@# First run the sanity checks to make sure the atomic patterns are correct
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=boolean,shortint,integer,gpu -p tfhe -- gpu_sanity_check
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
--features=boolean,shortint,integer,gpu -p tfhe -- gpu_noise_check \
--test-threads=1 --nocapture
.PHONY: test_safe_serialization # Run the tests for safe serialization
test_safe_serialization: install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo test --profile $(CARGO_PROFILE) \
@@ -1300,14 +1291,13 @@ run_web_js_api_parallel: build_web_js_api_parallel setup_venv
--browser-path $(browser_path) \
--driver-path $(driver_path) \
--browser-kind $(browser_kind) \
--server-cmd $(server_cmd) \
--server-cmd "npm run server" \
--server-workdir "$(WEB_SERVER_DIR)" \
--id-pattern $(filter)
test_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
test_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
test_web_js_api_parallel_chrome: browser_kind = chrome
test_web_js_api_parallel_chrome: server_cmd = "npm run server:multithreaded"
test_web_js_api_parallel_chrome: filter = Test
.PHONY: test_web_js_api_parallel_chrome # Run tests for the web wasm api on Chrome
@@ -1323,7 +1313,6 @@ test_web_js_api_parallel_chrome_ci: setup_venv
test_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
test_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
test_web_js_api_parallel_firefox: browser_kind = firefox
test_web_js_api_parallel_firefox: server_cmd = "npm run server:multithreaded"
test_web_js_api_parallel_firefox: filter = Test
.PHONY: test_web_js_api_parallel_firefox # Run tests for the web wasm api on Firefox
@@ -1354,6 +1343,7 @@ dieharder_csprng: install_dieharder build_tfhe_csprng
.PHONY: clippy_bench # Run clippy lints on tfhe-benchmark
clippy_bench: install_rs_check_toolchain
! (grep --recursive "trivial" tfhe-benchmark && echo "trivial found in benches")
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=boolean,shortint,integer,internal-keycache,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
@@ -1454,13 +1444,6 @@ bench_integer_aes256_gpu: install_rs_check_toolchain
--bench integer-aes256 \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_trivium_gpu # Run benchmarks for trivium on GPU backend
bench_integer_trivium_gpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-trivium \
--features=integer,internal-keycache,gpu, -p tfhe-benchmark --profile release_lto_off --
.PHONY: bench_integer_multi_bit # Run benchmarks for unsigned integer using multi-bit parameters
bench_integer_multi_bit: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_PARAM_TYPE=MULTI_BIT __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1580,7 +1563,6 @@ bench_pbs128_gpu: install_rs_check_toolchain
bench_web_js_api_parallel_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
bench_web_js_api_parallel_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
bench_web_js_api_parallel_chrome: browser_kind = chrome
bench_web_js_api_parallel_chrome: server_cmd = "npm run server:multithreaded"
bench_web_js_api_parallel_chrome: filter = Bench
.PHONY: bench_web_js_api_parallel_chrome # Run benchmarks for the web wasm api
@@ -1596,7 +1578,6 @@ bench_web_js_api_parallel_chrome_ci: setup_venv
bench_web_js_api_parallel_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
bench_web_js_api_parallel_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
bench_web_js_api_parallel_firefox: browser_kind = firefox
bench_web_js_api_parallel_firefox: server_cmd = "npm run server:multithreaded"
bench_web_js_api_parallel_firefox: filter = Bench
.PHONY: bench_web_js_api_parallel_firefox # Run benchmarks for the web wasm api
@@ -1609,38 +1590,6 @@ bench_web_js_api_parallel_firefox_ci: setup_venv
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_parallel_firefox
bench_web_js_api_unsafe_coop_chrome: browser_path = "$(WEB_RUNNER_DIR)/chrome/chrome-linux64/chrome"
bench_web_js_api_unsafe_coop_chrome: driver_path = "$(WEB_RUNNER_DIR)/chrome/chromedriver-linux64/chromedriver"
bench_web_js_api_unsafe_coop_chrome: browser_kind = chrome
bench_web_js_api_unsafe_coop_chrome: server_cmd = "npm run server:unsafe-coop"
bench_web_js_api_unsafe_coop_chrome: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
.PHONY: bench_web_js_api_unsafe_coop_chrome # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_chrome: run_web_js_api_parallel
.PHONY: bench_web_js_api_unsafe_coop_chrome_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_chrome_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_unsafe_coop_chrome
bench_web_js_api_unsafe_coop_firefox: browser_path = "$(WEB_RUNNER_DIR)/firefox/firefox/firefox"
bench_web_js_api_unsafe_coop_firefox: driver_path = "$(WEB_RUNNER_DIR)/firefox/geckodriver"
bench_web_js_api_unsafe_coop_firefox: browser_kind = firefox
bench_web_js_api_unsafe_coop_firefox: server_cmd = "npm run server:unsafe-coop"
bench_web_js_api_unsafe_coop_firefox: filter = ZeroKnowledgeBench # Only bench zk with unsafe coop
.PHONY: bench_web_js_api_unsafe_coop_firefox # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_firefox: run_web_js_api_parallel
.PHONY: bench_web_js_api_unsafe_coop_firefox_ci # Run benchmarks for the web wasm api without cross-origin isolation
bench_web_js_api_unsafe_coop_firefox_ci: setup_venv
source ~/.nvm/nvm.sh && \
nvm install $(NODE_VERSION) && \
nvm use $(NODE_VERSION) && \
$(MAKE) bench_web_js_api_unsafe_coop_firefox
.PHONY: bench_hlapi # Run benchmarks for integer operations
bench_hlapi: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_BIT_SIZES_SET=$(BIT_SIZES_SET) \

View File

@@ -79,7 +79,7 @@ tfhe = { version = "*", features = ["boolean", "shortint", "integer"] }
```
> [!Note]
> Note: You need Rust version 1.91.1 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
> Note: You need Rust version 1.84 or newer to compile TFHE-rs. You can check your version with `rustc --version`.
> [!Note]
> Note: AArch64-based machines are not supported for Windows as it's currently missing an entropy source to be able to seed the [CSPRNGs](https://en.wikipedia.org/wiki/Cryptographically_secure_pseudorandom_number_generator) used in TFHE-rs.

View File

@@ -1,32 +1,24 @@
08f31a47c29cc4d72ad32c0b5411fa20b3deef5b84558dd2fb892d3cdf90528a data/toy_params/glwe_after_id_br_karatsuba.cbor
29b6e3e7d27700004b70dca24d225816500490e2d6ee49b9af05837fd421896b data/valid_params_128/lwe_after_spec_pbs.cbor
2c70d1d78cc3760733850a353ace2b9c4705e840141b75841739e90e51247e18 data/valid_params_128/small_lwe_secret_key.cbor
2fb4bb45c259b8383da10fc8f9459c40a6972c49b1696eb107f0a75640724be5 data/toy_params/lwe_after_id_pbs_karatsuba.cbor
36c9080b636475fcacca503ce041bbfeee800fd3e1890dee559ea18defff9fe8 data/toy_params/glwe_after_id_br.cbor
377761beeb4216cf5aa2624a8b64b8259f5a75c32d28e850be8bced3a0cdd6f5 data/toy_params/ksk.cbor
59dba26d457f96478eda130cab5301fce86f23c6a8807de42f2a1e78c4985ca7 data/valid_params_128/lwe_ks.cbor
5d80dd93fefae4f4f89484dfcd65bbe99cc32e7e3b0a90c33dd0d77516c0a023 data/valid_params_128/glwe_after_id_br_karatsuba.cbor
656f0009c7834c5bcb61621e222047516054b9bc5d0593d474ab8f1c086b67a6 data/valid_params_128/lwe_after_id_pbs.cbor
699580ca92b9c2f9e1f57fb1e312c9e8cb29714f7acdef9d2ba05f798546751f data/toy_params/lwe_sum.cbor
6e54ab41056984595b077baff70236d934308cf5c0c33b4482fbfb129b3756c6 data/valid_params_128/glwe_after_id_br.cbor
70f5e5728822de05b49071efb5ec28551b0f5cc87aa709a455d8e7f04b9c96ee data/toy_params/lwe_after_id_pbs.cbor
76a5c52cab7fec1dc167da676c6cd39479cda6b2bb9f4e0573cb7d99c2692faa data/valid_params_128/lwe_after_id_pbs_karatsuba.cbor
7cc6803f5fbc3d5a1bf597f2b979ce17eecd3d6baca12183dea21022a7b65c52 data/toy_params/bsk.cbor
7f3c40a134623b44779a556212477fea26eaed22450f3b6faeb8721d63699972 data/valid_params_128/lwe_sum.cbor
837b3bd3245d4d0534ed255fdef896fb4fa6998a258a14543dfdadd0bfc9b6dd data/toy_params/lwe_prod.cbor
9ece8ca9c1436258b94e8c5e629b8722f9b18fdd415dd5209b6167a9dde8491c data/toy_params/glwe_after_spec_br_karatsuba.cbor
aa44aea29efd6d9e4d35a21a625d9cba155672e3f7ed3eddee1e211e62ad146b data/valid_params_128/lwe_ms.cbor
b7a037b9eaa88d6385167579b93e26a0cb6976d9b8967416fd1173e113bda199 data/valid_params_128/large_lwe_secret_key.cbor
b7b8e3586128887bd682120f3e3a43156139bce5e3fe0b03284f8753a864d647 data/toy_params/lwe_after_spec_pbs_karatsuba.cbor
bd00a8ae7494e400de5753029552ee1647efe7e17409b863a26a13b081099b8c data/toy_params/lwe_after_spec_pbs.cbor
c6df98676de04fe54b5ffc2eb30a82ebb706c9d7d5a4e0ed509700fec88761f7 data/toy_params/lwe_ms.cbor
c7d5a864d5616a7d8ad50bbf40416e41e6c9b60c546dc14d4aa8fc40a418baa7 data/toy_params/large_lwe_secret_key.cbor
c806533b325b1009db38be2f9bef5f3b2fad6b77b4c71f2855ccc9d3b4162e98 data/valid_params_128/lwe_b.cbor
c9eb75bd2993639348a679cf48c06e3c38d1a513f48e5b0ce0047cea8cff6bbc data/toy_params/lwe_a.cbor
d3391969acf26dc69de0927ba279139d8d79999944069addc8ff469ad6c5ae2d data/valid_params_128/lwe_after_spec_pbs_karatsuba.cbor
d6da5baef0e787f6be56e218d8354e26904652602db964844156fdff08350ce6 data/toy_params/lwe_ks.cbor
e591ab9af1b6a0aede273f9a3abb65a4c387feb5fa06a6959e9314058ca0f7e5 data/valid_params_128/ksk.cbor
e59b002df3a9b01ad321ec51cf076fa35131ab9dbef141d1c54b717d61426c92 data/valid_params_128/glwe_after_spec_br_karatsuba.cbor
e628354c81508a2d888016e8282df363dd12f1e19190b6475d4eb9d7ab8ae007 data/valid_params_128/glwe_after_spec_br.cbor
e69d2d2c064fc8c0460b39191ca65338146990349954f5ec5ebd01d93610e7eb data/valid_params_128/lwe_a.cbor
e76c24b2a0c9a842ad13dda35473c2514f9e7d20983b5ea0759c4521a91626d9 data/valid_params_128/lwe_prod.cbor

View File

@@ -1,46 +1,43 @@
# Test vectors for TFHE
These test vectors are generated using [TFHE-rs](https://github.com/zama-ai/tfhe-rs), with the git tag `tfhe-test-vectors-0.2.0`.
They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). These can be deserialized using any cbor library for any programming languages. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, the command to run is: `cbor2 --pretty toy_params/lwe_a.cbor`.
They are TFHE-rs objects serialized in the [cbor format](https://cbor.io/). You can deserialize them using any cbor library for the language of your choice. For example, using the [cbor2](https://pypi.org/project/cbor2/) program, run: `cbor2 --pretty toy_params/lwe_a.cbor`.
There are 2 folders with test vectors for different parameter sets:
- `valid_params_128`: valid classical PBS parameters using a Gaussian noise distribution, providing 128-bits of security in the IND-CPA model (i.e., the probability of failure is smaller than 2^{-64}).
- `toy_params`: insecure parameters that yield smaller values to simplify the bit comparison of the results.
You will find 2 folders with test vectors for different parameter sets:
- `valid_params_128`: valid classical PBS parameters using a gaussian noise distribution, providing 128bits of security in the IND-CPA model and a bootstrapping probability of failure of 2^{-64}.
- `toy_params`: insecure parameters that yield smaller values
The values are generated to compute a keyswitch (KS) followed by a bootstrap (PBS). The cleartext inputs are 2 values, A and B defined below.
The values are generated for the keyswitch -> bootstrap (KS-PBS) atomic pattern. The cleartext inputs are 2 values, A and B defined below.
All the random values are generated from a fixed seed, that can be found in the `RAND_SEED` constant below. The PRNG used is the one based on the AES block cipher in counter mode, from tfhe `tfhe-csprng` crate.
The bootstrap is applied twice, with 2 different lut, the identity lut and a specific one computing the double of the input value (i.e., f(x) = 2*x).
The programmable bootstrap is applied twice, with 2 different lut, the identity lut and a specific one (currently a x2 operation)
## Vectors
The following values are generated:
### Keys
| name | description | TFHE-rs type |
|------------------------|-----------------------------------------------------------------------------------------|-----------------------------|
| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS | `LweSecretKey<Vec<u64>>` |
| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS | `LweSecretKey<Vec<u64>>` |
| `ksk` | The keyswitching key to convert a ct from the large key to the small one | `LweKeyswitchKey<Vec<u64>>` |
| name | description | TFHE-rs type |
|------------------------|---------------------------------------------------------------------------------------|-----------------------------|
| `large_lwe_secret_key` | Encryption secret key, before the KS and after the PBS | `LweSecretKey<Vec<u64>>` |
| `small_lwe_secret_key` | Secret key encrypting ciphertexts between the KS and the PBS | `LweSecretKey<Vec<u64>>` |
| `ksk` | The keyswitching key to convert a ct from the large key to the small one | `LweKeyswitchKey<Vec<u64>>` |
| `bsk` | the bootstrapping key to perform a programmable bootstrap on the keyswitched ciphertext | `LweBootstrapKey<Vec<u64>>` |
### Ciphertexts
| name | description | TFHE-rs type | Cleartext |
|----------------------|-----------------------------------------------------------------------------------------------------|----------------------------|----------------------|
| `lwe_a` | LWE Ciphertext encrypting A | `LweCiphertext<Vec<u64>>` | `A` |
| `lwe_b` | LWE Ciphertext encrypting B | `LweCiphertext<Vec<u64>>` | `B` |
| `lwe_sum` | LWE Ciphertext encrypting A plus lwe encryption of B | `LweCiphertext<Vec<u64>>` | `A+B` |
| `lwe_prod` | LWE Ciphertext encrypting A times cleartext B | `LweCiphertext<Vec<u64>>` | `A*B` |
| `lwe_ms` | LWE Ciphertext encrypting A after a Modulus Switch from q to 2*N ([note](#non-native-encoding)) | `LweCiphertext<Vec<u64>>` | `A` |
| `lwe_ks` | LWE Ciphertext encrypting A after a keyswitch from `large_lwe_secret_key` to `small_lwe_secret_key` | `LweCiphertext<Vec<u64>>` | `A` |
| `glwe_after_id_br` | GLWE Ciphertext encrypting A after the application of the identity blind rotation on `lwe_ms` | `GlweCiphertext<Vec<u64>>` | rotation of id LUT |
| `lwe_after_id_pbs` | LWE Ciphertext encrypting A after the sample extract operation on `glwe_after_id_br` | `LweCiphertext<Vec<u64>>` | `A` |
| `glwe_after_spec_br` | GLWE Ciphertext encrypting spec(A) after the application of the spec blind rotation on `lwe_ms` | `GlweCiphertext<Vec<u64>>` | rotation of spec LUT |
| `lwe_after_spec_pbs` | LWE Ciphertext encrypting spec(A) after the sample extract operation on `glwe_after_spec_br` | `LweCiphertext<Vec<u64>>` | `spec(A)` |
Ciphertexts with the `_karatsuba` suffix are generated using the Karatsuba polynomial multiplication algorithm in the blind rotation, while default ciphertexts are generated using an FFT multiplication.
This makes it easier to reproduce bit exact results.
| name | description | TFHE-rs type | Cleartext |
|----------------------|--------------------------------------------------------------------------------------------------------------|----------------------------|--------------|
| `lwe_a` | Lwe encryption of A | `LweCiphertext<Vec<u64>>` | `A` |
| `lwe_b` | Lwe encryption of B | `LweCiphertext<Vec<u64>>` | `B` |
| `lwe_sum` | Lwe encryption of A plus lwe encryption of B | `LweCiphertext<Vec<u64>>` | `A+B` |
| `lwe_prod` | Lwe encryption of A times cleartext B | `LweCiphertext<Vec<u64>>` | `A*B` |
| `lwe_ms` | The lwe ciphertext after the modswitch part of the PBS ([note](#non-native-encoding)) | `LweCiphertext<Vec<u64>>` | `A` |
| `lwe_ks` | The lwe ciphertext after the keyswitch | `LweCiphertext<Vec<u64>>` | `A` |
| `glwe_after_id_br` | The glwe returned by the application of the identity blind rotation on the mod switched ciphertexts. | `GlweCiphertext<Vec<u64>>` | rot id LUT |
| `lwe_after_id_pbs` | The lwe returned by the application of the sample extract operation on the output of the id blind rotation | `LweCiphertext<Vec<u64>>` | `A` |
| `glwe_after_spec_br` | The glwe returned by the application of the spec blind rotation on the mod switched ciphertexts. | `GlweCiphertext<Vec<u64>>` | rot spec LUT |
| `lwe_after_spec_pbs` | The lwe returned by the application of the sample extract operation on the output of the spec blind rotation | `LweCiphertext<Vec<u64>>` | `spec(A)` |
### Encodings
#### Non native encoding

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:08f31a47c29cc4d72ad32c0b5411fa20b3deef5b84558dd2fb892d3cdf90528a
size 4679

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ece8ca9c1436258b94e8c5e629b8722f9b18fdd415dd5209b6167a9dde8491c
size 4679

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2fb4bb45c259b8383da10fc8f9459c40a6972c49b1696eb107f0a75640724be5
size 2365

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b7b8e3586128887bd682120f3e3a43156139bce5e3fe0b03284f8753a864d647
size 2365

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d80dd93fefae4f4f89484dfcd65bbe99cc32e7e3b0a90c33dd0d77516c0a023
size 36935

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e59b002df3a9b01ad321ec51cf076fa35131ab9dbef141d1c54b717d61426c92
size 36935

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76a5c52cab7fec1dc167da676c6cd39479cda6b2bb9f4e0573cb7d99c2692faa
size 18493

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d3391969acf26dc69de0927ba279139d8d79999944069addc8ff469ad6c5ae2d
size 18493

View File

@@ -265,7 +265,6 @@ fn generate_test_vectors<P: AsRef<Path>>(
let mut id_lut = encoding.encode_lut(glwe_dimension, polynomial_size, ID_LUT);
assert_data_not_zero(&id_lut);
let mut id_lut_karatsuba = id_lut.clone();
blind_rotate_assign(&modswitched, &mut id_lut, &fourier_bsk);
assert_data_not_zero(&id_lut);
@@ -288,32 +287,8 @@ fn generate_test_vectors<P: AsRef<Path>>(
assert_data_not_zero(&lwe_pbs_id);
store_data(path, &lwe_pbs_id, "lwe_after_id_pbs");
blind_rotate_karatsuba_assign(&modswitched, &mut id_lut_karatsuba, &bsk);
store_data(path, &id_lut_karatsuba, "glwe_after_id_br_karatsuba");
let mut lwe_pbs_karatsuba_id = LweCiphertext::new(
0u64,
glwe_dimension
.to_equivalent_lwe_dimension(polynomial_size)
.to_lwe_size(),
encoding.ciphertext_modulus,
);
extract_lwe_sample_from_glwe_ciphertext(
&id_lut_karatsuba,
&mut lwe_pbs_karatsuba_id,
MonomialDegree(0),
);
let decrypted_pbs_id = decrypt_lwe_ciphertext(&large_lwe_secret_key, &lwe_pbs_karatsuba_id);
let res = encoding.decode(decrypted_pbs_id);
assert_eq!(res, MSG_A);
store_data(path, &lwe_pbs_karatsuba_id, "lwe_after_id_pbs_karatsuba");
let mut spec_lut = encoding.encode_lut(glwe_dimension, polynomial_size, SPEC_LUT);
assert_data_not_zero(&spec_lut);
let mut spec_lut_karatsuba = spec_lut.clone();
blind_rotate_assign(&modswitched, &mut spec_lut, &fourier_bsk);
assert_data_not_zero(&spec_lut);
@@ -335,33 +310,6 @@ fn generate_test_vectors<P: AsRef<Path>>(
assert_eq!(res, SPEC_LUT(MSG_A));
assert_data_not_zero(&lwe_pbs_spec);
store_data(path, &lwe_pbs_spec, "lwe_after_spec_pbs");
blind_rotate_karatsuba_assign(&modswitched, &mut spec_lut_karatsuba, &bsk);
store_data(path, &spec_lut_karatsuba, "glwe_after_spec_br_karatsuba");
let mut lwe_pbs_karatsuba_spec = LweCiphertext::new(
0u64,
glwe_dimension
.to_equivalent_lwe_dimension(polynomial_size)
.to_lwe_size(),
encoding.ciphertext_modulus,
);
extract_lwe_sample_from_glwe_ciphertext(
&spec_lut_karatsuba,
&mut lwe_pbs_karatsuba_spec,
MonomialDegree(0),
);
let decrypted_pbs_spec = decrypt_lwe_ciphertext(&large_lwe_secret_key, &lwe_pbs_karatsuba_spec);
let res = encoding.decode(decrypted_pbs_spec);
assert_eq!(res, SPEC_LUT(MSG_A));
store_data(
path,
&lwe_pbs_karatsuba_spec,
"lwe_after_spec_pbs_karatsuba",
);
}
fn rm_dir_except_readme<P: AsRef<Path>>(dir: P) {

View File

@@ -1,6 +1,6 @@
[package]
name = "tfhe-cuda-backend"
version = "0.13.0"
version = "0.12.0"
edition = "2021"
authors = ["Zama team"]
license = "BSD-3-Clause-Clear"

View File

@@ -86,7 +86,6 @@ fn main() {
"cuda/include/integer/integer.h",
"cuda/include/integer/rerand.h",
"cuda/include/aes/aes.h",
"cuda/include/trivium/trivium.h",
"cuda/include/zk/zk.h",
"cuda/include/keyswitch/keyswitch.h",
"cuda/include/keyswitch/ks_enums.h",

View File

@@ -29,13 +29,14 @@ template <typename Torus> struct int_aes_lut_buffers {
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus, Torus)> and_lambda =
[](Torus a, Torus b) -> Torus { return a & b; };
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), this->and_lut->get_lut(0, 0),
this->and_lut->get_degree(0), this->and_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, and_lambda, allocate_gpu_memory);
auto active_streams_and_lut = streams.active_gpu_subset(
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
params.pbs_type);
this->and_lut->generate_and_broadcast_bivariate_lut(
active_streams_and_lut, {0}, {and_lambda}, allocate_gpu_memory);
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
@@ -44,11 +45,14 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
return x & 1;
};
auto active_streams_flush_lut = streams.active_gpu_subset(
AES_STATE_BITS * num_aes_inputs, params.pbs_type);
this->flush_lut->generate_and_broadcast_lut(
active_streams_flush_lut, {0}, {flush_lambda}, allocate_gpu_memory);
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->flush_lut->get_lut(0, 0),
this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, flush_lambda, allocate_gpu_memory);
auto active_streams_flush_lut =
streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->carry_lut = new int_radix_lut<Torus>(
@@ -56,11 +60,13 @@ template <typename Torus> struct int_aes_lut_buffers {
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
};
auto active_streams_carry_lut =
streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
this->carry_lut->generate_and_broadcast_lut(
active_streams_carry_lut, {0}, {carry_lambda}, allocate_gpu_memory);
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0), this->carry_lut->get_lut(0, 0),
this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, carry_lambda, allocate_gpu_memory);
auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
this->carry_lut->broadcast_lut(active_streams_carry_lut);
this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
}

View File

@@ -35,9 +35,17 @@ void cuda_centered_modulus_switch_64(void *stream, uint32_t gpu_index,
uint32_t lwe_dimension,
uint32_t log_modulus);
void cuda_improve_noise_modulus_switch_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *lwe_array_indexes,
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
uint32_t log_modulus);
void cuda_glwe_sample_extract_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *glwe_array_in, uint32_t const *nth_array, uint32_t num_nths,
uint32_t lwe_per_glwe, uint32_t glwe_dimension, uint32_t polynomial_size);
}
#endif

View File

@@ -8,8 +8,7 @@
extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
extern const int THRESHOLD_MULTI_GPU;
extern "C" {
int32_t cuda_setup_multi_gpu(int device_0_id);
@@ -40,8 +39,7 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
PBS_TYPE pbs_type);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
@@ -75,10 +73,9 @@ public:
// Returns a subset of this set as an active subset. An active subset is one
// that is temporarily used to perform some computation
CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
return CudaStreams(
_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
CudaStreams active_gpu_subset(int num_radix_blocks) {
return CudaStreams(_streams, _gpu_indexes,
get_active_gpu_count(num_radix_blocks, _gpu_count));
}
// Returns a CudaStreams struct containing only the ith stream

View File

@@ -20,8 +20,7 @@ template <typename Torus> struct boolean_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
auto active_streams =
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
this->unchecked = is_unchecked;
switch (op) {
case BITAND:
@@ -65,8 +64,14 @@ template <typename Torus> struct boolean_bitop_buffer {
return x % params.message_modulus;
};
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
message_extract_lut->broadcast_lut(active_streams);
}
tmp_lwe_left = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -114,8 +119,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
switch (op) {
case BITAND:
case BITOR:
@@ -136,8 +140,12 @@ template <typename Torus> struct int_bitop_buffer {
}
};
lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {lut_bivariate_f}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(active_streams);
}
break;
default:
@@ -146,8 +154,6 @@ template <typename Torus> struct int_bitop_buffer {
num_radix_blocks, allocate_gpu_memory,
size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
@@ -163,13 +169,14 @@ template <typename Torus> struct int_bitop_buffer {
return x ^ rhs;
}
};
lut_funcs.push_back(lut_univariate_scalar_f);
lut_indices.push_back(i);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(active_streams);
}
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funcs,
gpu_memory_allocated);
}
}
@@ -202,11 +209,15 @@ template <typename Torus> struct boolean_bitnot_buffer {
return x % message_modulus;
};
auto active_streams =
streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
message_extract_lut->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0),
message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message_extract, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
message_extract_lut->broadcast_lut(active_streams);
}
}

View File

@@ -28,17 +28,20 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut->generate_and_broadcast_lut(
active_streams, {0}, {[msg_modulus, bits_per_block](Torus x) {
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
[msg_modulus, bits_per_block](Torus x) {
const auto xm = x % msg_modulus;
const auto sign_bit = (xm >> (bits_per_block - 1)) & 1;
return (Torus)((msg_modulus - 1) * sign_bit);
}},
},
allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
lut->broadcast_lut(active_streams);
this->last_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -14,8 +14,7 @@ template <typename Torus> struct int_zero_out_if_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
tmp = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -85,6 +84,24 @@ template <typename Torus> struct int_cmux_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),
predicate_lut->get_degree(0), predicate_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, inverted_lut_f, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 1),
predicate_lut->get_degree(1), predicate_lut->get_max_degree(1),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
message_extract_lut->get_lut(0, 0), message_extract_lut->get_degree(0),
message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
message_extract_lut_f, gpu_memory_allocated);
Torus *h_lut_indexes = predicate_lut->h_lut_indexes;
for (int index = 0; index < 2 * num_radix_blocks; index++) {
if (index < num_radix_blocks) {
@@ -97,18 +114,10 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams_pred =
streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams_pred, {0, 1}, {inverted_lut_f, lut_f},
gpu_memory_allocated);
auto active_streams_msg =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
message_extract_lut->generate_and_broadcast_lut(
active_streams_msg, {0}, {message_extract_lut_f}, gpu_memory_allocated);
auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
predicate_lut->broadcast_lut(active_streams_pred);
auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
message_extract_lut->broadcast_lut(active_streams_msg);
}
void release(CudaStreams streams) {

View File

@@ -39,21 +39,21 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(max_chunks, params.pbs_type);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_max_value->get_lut(0, 0),
is_max_value->get_degree(0), is_max_value->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);
is_max_value->generate_and_broadcast_lut(
active_streams, {0}, {is_max_value_f}, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(max_chunks);
is_max_value->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {
@@ -102,10 +102,14 @@ template <typename Torus> struct int_comparison_eq_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
is_non_zero_lut->generate_and_broadcast_lut(
active_streams, {0}, {is_non_zero_lut_f}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
is_non_zero_lut->broadcast_lut(active_streams);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
@@ -123,28 +127,32 @@ template <typename Torus> struct int_comparison_eq_buffer {
return (lhs == rhs);
}
};
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < total_modulus; i++) {
auto lut_f = [i, operator_f](Torus x) -> Torus {
return operator_f(i, x);
};
lut_funcs.push_back(lut_f);
lut_indices.push_back(i);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
scalar_comparison_luts->get_lut(0, i),
scalar_comparison_luts->get_degree(i),
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
scalar_comparison_luts->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
scalar_comparison_luts->broadcast_lut(active_streams);
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
operator_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
operator_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {operator_f}, gpu_memory_allocated);
// operator_lut->broadcast_lut(active_streams);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);
operator_lut->broadcast_lut(active_streams);
} else {
operator_lut = nullptr;
}
@@ -211,6 +219,9 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
streams.stream(0), streams.gpu_index(0), tmp_y, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tree_last_leaf_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
@@ -221,14 +232,14 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
tree_inner_leaf_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {block_selector_f}, allocate_gpu_memory);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
tree_inner_leaf_lut->get_lut(0, 0), tree_inner_leaf_lut->get_degree(0),
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
tree_inner_leaf_lut->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {
@@ -379,8 +390,7 @@ template <typename Torus> struct int_comparison_buffer {
this->op = op;
this->is_signed = is_signed;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
identity_lut_f = [](Torus x) -> Torus { return x; };
@@ -412,8 +422,12 @@ template <typename Torus> struct int_comparison_buffer {
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
identity_lut->generate_and_broadcast_lut(
active_streams, {0}, {identity_lut_f}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(active_streams);
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {
@@ -423,8 +437,13 @@ template <typename Torus> struct int_comparison_buffer {
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
is_zero_lut->generate_and_broadcast_lut(active_streams, {0}, {is_zero_f},
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);
is_zero_lut->broadcast_lut(active_streams);
switch (op) {
case COMPARISON_TYPE::MAX:
@@ -499,9 +518,13 @@ template <typename Torus> struct int_comparison_buffer {
PANIC("Cuda error: sign_lut creation failed due to wrong function.")
};
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
signed_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {signed_lut_f}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), signed_lut->get_lut(0, 0),
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1);
signed_lut->broadcast_lut(active_streams);
}
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));

View File

@@ -65,16 +65,6 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
void cuda_integer_extract_glwe_128(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
void cuda_integer_extract_glwe_64(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
}
#endif

View File

@@ -116,8 +116,7 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(
num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
decompression_rescale_lut->broadcast_lut(active_streams);
}
}

View File

@@ -283,9 +283,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
zero_out_if_not_1_lut_2};
size_t lut_gpu_indexes[2] = {0, 3};
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_1_lut_f},
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
}
luts[0] = zero_out_if_not_2_lut_1;
@@ -293,9 +296,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
lut_gpu_indexes[0] = 1;
lut_gpu_indexes[1] = 2;
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(streams.get_ith(lut_gpu_indexes[j]),
{0}, {zero_out_if_not_2_lut_f},
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(lut_gpu_indexes[j]),
streams.gpu_index(lut_gpu_indexes[j]), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
}
quotient_lut_1 =
@@ -315,12 +321,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
quotient_lut_1->generate_and_broadcast_lut(
streams.get_ith(2), {0}, {quotient_lut_1_f}, gpu_memory_allocated);
quotient_lut_2->generate_and_broadcast_lut(
streams.get_ith(1), {0}, {quotient_lut_2_f}, gpu_memory_allocated);
quotient_lut_3->generate_and_broadcast_lut(
streams.get_ith(0), {0}, {quotient_lut_3_f}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(2), streams.gpu_index(2), quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(1), streams.gpu_index(1), quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
message_extract_lut_1 = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
@@ -335,12 +350,14 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
luts[0] = message_extract_lut_1;
luts[1] = message_extract_lut_2;
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_blocks);
luts[j]->broadcast_lut(active_streams);
}
}
@@ -989,14 +1006,23 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
masking_luts_2[i] = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
masking_luts_1[i]->generate_and_broadcast_lut(
active_streams_1, {0}, {lut_f_masking}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_1[i]->get_lut(0, 0), masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_1 = streams.active_gpu_subset(1);
masking_luts_1[i]->broadcast_lut(active_streams_1);
auto active_streams_2 =
streams.active_gpu_subset(num_blocks, params.pbs_type);
masking_luts_2[i]->generate_and_broadcast_lut(
active_streams_2, {0}, {lut_f_masking}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
masking_luts_2[i]->get_lut(0, 0), masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_streams_2 = streams.active_gpu_subset(num_blocks);
masking_luts_2[i]->broadcast_lut(active_streams_2);
}
// create and generate message_extract_lut_1 and message_extract_lut_2
@@ -1014,12 +1040,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_blocks);
for (int j = 0; j < 2; j++) {
luts[j]->generate_and_broadcast_lut(
active_streams, {0}, {lut_f_message_extract}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(active_streams);
}
// Give name to closures to improve readability
@@ -1100,8 +1128,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
auto active_gpu_count_for_bits =
streams.active_gpu_subset(1, params.pbs_type);
auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
@@ -1110,8 +1137,14 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
merge_overflow_flags_luts[i]->generate_and_broadcast_bivariate_lut(
active_gpu_count_for_bits, {0}, {lut_f_bit}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
merge_overflow_flags_luts[i]->get_lut(0, 0),
merge_overflow_flags_luts[i]->get_degree(0),
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(active_gpu_count_for_bits);
}
}
@@ -1119,8 +1152,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto active_streams =
streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(2 * num_blocks);
this->params = params;
if (params.message_modulus == 4 && params.carry_modulus == 4 &&
@@ -1441,8 +1473,7 @@ template <typename Torus> struct int_div_rem_memory {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->params = params;
this->is_signed = is_signed;
@@ -1520,12 +1551,16 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
compare_signed_bits_lut->get_lut(0, 0),
compare_signed_bits_lut->get_degree(0),
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
compare_signed_bits_lut->generate_and_broadcast_bivariate_lut(
active_gpu_count_cmp, {0}, {f_compare_extracted_signed_bits},
gpu_memory_allocated);
streams.active_gpu_subset(1); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
}
}

View File

@@ -20,8 +20,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
this->univ_lut_mem =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -53,8 +52,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return count;
};
univ_lut_mem->generate_and_broadcast_lut(
active_streams, {0}, {generate_uni_lut_lambda}, allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), univ_lut_mem->get_lut(0, 0),
univ_lut_mem->get_degree(0), univ_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);
univ_lut_mem->broadcast_lut(active_streams);
auto generate_bi_lut_lambda =
[num_bits](Torus block_num_bit_count,
@@ -65,8 +69,13 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
return 0;
};
biv_lut_mem->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {generate_bi_lut_lambda}, allocate_gpu_memory);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), biv_lut_mem->get_lut(0, 0),
biv_lut_mem->get_degree(0), biv_lut_mem->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);
biv_lut_mem->broadcast_lut(active_streams);
this->tmp_ct = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -222,7 +231,7 @@ template <typename Torus> struct int_ilog2_buffer {
this->sum_output_not_propagated, counter_num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
lut_message_not =
this->lut_message_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> lut_message_lambda =
@@ -230,11 +239,15 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t message = x % this->params.message_modulus;
return (~message) % this->params.message_modulus;
};
auto active_streams =
streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
lut_message_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_message_lambda}, allocate_gpu_memory);
generate_device_accumulator(streams.stream(0), streams.gpu_index(0),
this->lut_message_not->get_lut(0, 0),
this->lut_message_not->get_degree(0),
this->lut_message_not->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus,
lut_message_lambda, allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(counter_num_blocks);
lut_message_not->broadcast_lut(active_streams);
this->lut_carry_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
@@ -244,8 +257,13 @@ template <typename Torus> struct int_ilog2_buffer {
uint64_t carry = x / this->params.message_modulus;
return (~carry) % this->params.message_modulus;
};
lut_carry_not->generate_and_broadcast_lut(
active_streams, {0}, {lut_carry_lambda}, allocate_gpu_memory);
generate_device_accumulator(
streams.stream(0), streams.gpu_index(0),
this->lut_carry_not->get_lut(0, 0), this->lut_carry_not->get_degree(0),
this->lut_carry_not->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_carry_lambda, allocate_gpu_memory);
lut_carry_not->broadcast_lut(active_streams);
this->message_blocks_not = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -883,10 +883,6 @@ void cuda_unchecked_first_index_in_clears_64(
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
int8_t *mem, void *const *bsks, void *const *ksks);
void cuda_small_scalar_multiplication_integer_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array, uint64_t scalar,
const uint32_t message_modulus, const uint32_t carry_modulus);
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

View File

@@ -9,7 +9,6 @@
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
#include <map>
#include <queue>
#include <stdio.h>
@@ -44,8 +43,6 @@ public:
"parameters"); \
} else if ((msg_mod) == 0 && (carry_mod) == 0) { \
break; \
} else if ((msg_mod) == 4 && (carry_mod) == 32) { \
break; \
} else { \
PANIC("Invalid message modulus or carry modulus") \
} \
@@ -372,8 +369,7 @@ struct int_radix_lut_custom_input_output {
this->num_input_blocks = num_input_blocks;
this->gpu_memory_allocated = allocate_gpu_memory;
this->active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_radix_blocks);
}
void setup_degrees() {
@@ -384,18 +380,14 @@ struct int_radix_lut_custom_input_output {
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
for (uint i = 0; i < active_streams.count(); i++) {
cuda_set_device(active_streams.gpu_index(i));
int8_t *gpu_pbs_buffer;
auto num_blocks_on_gpu = std::min(
(int)num_radix_blocks,
std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
active_streams.count())));
auto num_blocks_on_gpu =
std::min((int)num_radix_blocks,
std::max(THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(num_radix_blocks, i,
active_streams.count())));
uint64_t size = 0;
execute_scratch_pbs<OutputTorus>(
@@ -430,22 +422,18 @@ struct int_radix_lut_custom_input_output {
/// back to the original indexing
multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
size_tracker, allocate_gpu_memory);
multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
size_tracker, allocate_gpu_memory);
if (num_many_lut > 1) {
multi_gpu_alloc_lwe_many_lut_output_async(
active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
allocate_gpu_memory);
params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
} else {
multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
size_tracker, allocate_gpu_memory);
}
multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker,
@@ -461,14 +449,12 @@ struct int_radix_lut_custom_input_output {
}
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
auto inputs_on_gpu = std::min(
(int)num_input_blocks,
std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
active_streams.count())));
auto inputs_on_gpu =
std::min((int)num_input_blocks,
std::max(THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(num_input_blocks, 0,
active_streams.count())));
if (inputs_on_gpu >= get_threshold_ks_gemm()) {
for (auto i = 0; i < active_streams.count(); ++i) {
@@ -810,20 +796,16 @@ struct int_radix_lut_custom_input_output {
void allocate_lwe_vector_for_non_trivial_indexes(
CudaStreams streams, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
// We need to create the auxiliary array only in GPU 0
if (active_streams.count() > 1) {
lwe_aligned_vec.resize(active_streams.count());
for (uint i = 0; i < active_streams.count(); i++) {
uint64_t size_tracker_on_array_i = 0;
auto inputs_on_gpu = std::min(
(int)max_num_radix_blocks,
std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
active_streams.count())));
auto inputs_on_gpu =
std::min((int)max_num_radix_blocks,
std::max(THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(max_num_radix_blocks, i,
active_streams.count())));
InputTorus *d_array =
(InputTorus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) *
@@ -836,56 +818,6 @@ struct int_radix_lut_custom_input_output {
}
}
void generate_and_broadcast_lut(
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
std::vector<std::function<OutputTorus(OutputTorus)>> f,
bool gpu_memory_allocated) {
// streams should be a subset of active_streams
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
generate_device_accumulator<OutputTorus>(
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f[i], gpu_memory_allocated);
}
//broadcast_lut(streams);
}
void generate_and_broadcast_bivariate_lut(
const CudaStreams &streams, std::vector<uint32_t> lut_indexes,
std::vector<std::function<OutputTorus(OutputTorus, OutputTorus)>> f,
bool gpu_memory_allocated) {
// streams should be a subset of active_streams
/* for (int fidx = 0; fidx < f.size(); ++fidx) {
__int128_t f_hash = 0;
uint32_t bits_per_lut_val = 5;
uint32_t input_modulus_sup =
params.message_modulus * params.carry_modulus;
for (uint32_t i = 0; i < input_modulus_sup; ++i) {
OutputTorus f_eval =
f[fidx](i / params.message_modulus, i % params.message_modulus);
GPU_ASSERT(f_eval < (1 << bits_per_lut_val),
"LUT value expected bitwidth overflow");
f_hash |= f_eval;
f_hash <<= bits_per_lut_val;
}
printf("%016llX%016llX\n",
(unsigned long long)((f_hash >> 64) & 0xFFFFFFFFFFFFFFFF),
(unsigned long long)(f_hash & 0xFFFFFFFFFFFFFFFF));
}
*/
for (uint32_t i = 0; i < lut_indexes.size(); ++i) {
generate_device_accumulator_bivariate<InputTorus>(
streams.stream(0), streams.gpu_index(0), get_lut(0, lut_indexes[i]),
get_degree(lut_indexes[i]), get_max_degree(lut_indexes[i]),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f[i], gpu_memory_allocated);
}
//broadcast_lut(streams);
}
void release(CudaStreams streams) {
PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
"Lut vec and Lut vec indexes must have the same size");
@@ -1036,15 +968,18 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funs;
std::vector<uint32_t> lut_indices;
for (int i = 0; i < bits_per_block; i++) {
auto operator_f = [i, final_offset](Torus x) -> Torus {
Torus y = (x >> i) & 1;
return y << final_offset;
};
lut_funs.push_back(operator_f);
lut_indices.push_back(i);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
operator_f, gpu_memory_allocated);
}
/**
@@ -1061,12 +996,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
lut->generate_and_broadcast_lut(active_streams, lut_indices, lut_funs,
gpu_memory_allocated);
// lut->broadcast_lut(active_streams);
auto active_streams =
streams.active_gpu_subset(bits_per_block * num_radix_blocks);
lut->broadcast_lut(active_streams);
/**
* the input indexes should take the first bits_per_block PBS to target
@@ -1142,6 +1074,24 @@ template <typename Torus> struct int_fullprop_buffer {
};
//
Torus *lut_buffer_message = lut->get_lut(0, 0);
uint64_t *message_degree = lut->get_degree(0);
uint64_t *message_max_degree = lut->get_max_degree(0);
Torus *lut_buffer_carry = lut->get_lut(0, 1);
uint64_t *carry_degree = lut->get_degree(1);
uint64_t *carry_max_degree = lut->get_max_degree(1);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_message,
message_degree, message_max_degree, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
carry_max_degree, params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_carry,
gpu_memory_allocated);
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
@@ -1151,15 +1101,9 @@ template <typename Torus> struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
//
// No broadcast is needed because full prop is done on 1 single GPU.
// By passing a single-GPU CudaStreams with streams.get_ith(0) the LUT is
// not broadcast.
//
lut->generate_and_broadcast_lut(streams.get_ith(0), {0, 1},
{lut_f_message, lut_f_carry},
gpu_memory_allocated);
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -1277,10 +1221,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
allocated_luts_message_carry = true;
luts_message_carry = new int_radix_lut<Torus>(
streams, params, 2, pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
uint64_t message_modulus_bits =
(uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
@@ -1296,9 +1239,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
streams, upper_bound_num_blocks, size_tracker, true);
}
}
if (allocated_luts_message_carry) {
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
@@ -1310,11 +1251,21 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
return x / message_modulus;
};
auto active_gpu_count_mc =
streams.active_gpu_subset(pbs_count, params.pbs_type);
luts_message_carry->generate_and_broadcast_lut(
active_gpu_count_mc, {0, 1}, {lut_f_message, lut_f_carry},
gpu_memory_allocated);
// generate accumulators
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
}
int_sum_ciphertexts_vec_memory(
@@ -1449,6 +1400,10 @@ template <typename Torus> struct int_seq_group_prop_memory {
uint32_t group_size, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
grouping_size = group_size;
group_resolved_carries = new CudaRadixCiphertextFFI;
@@ -1458,33 +1413,31 @@ template <typename Torus> struct int_seq_group_prop_memory {
allocate_gpu_memory);
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_indices;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
for (int index = 0; index < num_seq_luts; index++) {
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
return (propa_cum_sum_block >> (index + 1)) & 1;
};
lut_funcs.push_back(f_lut_sequential);
auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), seq_lut,
lut_sequential_algorithm->get_degree(index),
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
gpu_memory_allocated);
h_seq_lut_indexes[index] = index;
lut_indices.push_back(index);
}
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_seq_luts, params.pbs_type);
lut_sequential_algorithm->generate_and_broadcast_lut(
active_streams, lut_indices, lut_funcs, gpu_memory_allocated);
// lut_sequential_algorithm->broadcast_lut(active_streams);
auto active_streams = streams.active_gpu_subset(num_seq_luts);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
}
};
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
group_resolved_carries,
@@ -1506,6 +1459,10 @@ template <typename Torus> struct int_hs_group_prop_memory {
uint32_t num_groups, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
if (msb == 2) {
@@ -1525,11 +1482,15 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele = new int_radix_lut<Torus>(
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
auto active_streams =
streams.active_gpu_subset(num_groups, params.pbs_type);
lut_hillis_steele->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_lut_hillis_steele}, gpu_memory_allocated);
}
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_groups);
lut_hillis_steele->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
lut_hillis_steele->release(streams);
@@ -1704,8 +1665,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
luts_array_first_step->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -1819,6 +1779,112 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
num_extra_luts = 1;
}
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// luts for first group inner propagation
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
auto f_first_grouping_inner_propagation =
[lut_id](Torus propa_cum_sum_block) -> Torus {
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
if (carry != 0) {
return 2ull; // Generates Carry
} else {
return 0ull; // Does not generate carry
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_inner_propagation, gpu_memory_allocated);
}
auto f_first_grouping_outer_propagation =
[num_bits_in_block](Torus block) -> Torus {
return (block >> (num_bits_in_block - 1)) & 1;
};
int lut_id = grouping_size - 1;
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_outer_propagation, gpu_memory_allocated);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
uint32_t lut_id = index + grouping_size;
auto f_other_groupings_inner_propagation =
[index](Torus propa_cum_sum_block) -> Torus {
uint64_t mask = (2 << index) - 1;
if (propa_cum_sum_block >= (2 << index)) {
return 2ull; // Generates
} else if ((propa_cum_sum_block & mask) == mask) {
return 1ull; // Propagate
} else {
return 0ull; // Nothing
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_other_groupings_inner_propagation, gpu_memory_allocated);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
for (int index = 0; index < grouping_size - 1; index++) {
uint32_t lut_id = index + 2 * grouping_size;
auto f_group_propagation = [index, block_modulus,
num_bits_in_block](Torus block) -> Torus {
if (block == (block_modulus - 1)) {
return 0ull;
} else {
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_group_propagation, gpu_memory_allocated);
}
} else {
uint32_t lut_id = 2 * grouping_size;
auto f_group_propagation = [block_modulus](Torus block) {
if (block == (block_modulus - 1)) {
return 2ull;
} else {
return UINT64_MAX % (block_modulus * 2ull);
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
gpu_memory_allocated);
}
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
for (int index = 0; index < num_radix_blocks; index++) {
@@ -1854,11 +1920,6 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
}
}
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// copy the indexes to the gpu
Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1869,92 +1930,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
std::vector<std::function<Torus(Torus)>> lut_funcs;
std::vector<uint32_t> lut_ids;
// luts for first group inner propagation
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
auto f_first_grouping_inner_propagation =
[lut_id](Torus propa_cum_sum_block) -> Torus {
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
if (carry != 0) {
return 2ull; // Generates Carry
} else {
return 0ull; // Does not generate carry
}
};
lut_funcs.push_back(f_first_grouping_inner_propagation);
lut_ids.push_back(lut_id);
}
auto f_first_grouping_outer_propagation =
[num_bits_in_block](Torus block) -> Torus {
return (block >> (num_bits_in_block - 1)) & 1;
};
int lut_id = grouping_size - 1;
lut_funcs.push_back(f_first_grouping_outer_propagation);
lut_ids.push_back(lut_id);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
uint32_t lut_id = index + grouping_size;
auto f_other_groupings_inner_propagation =
[index](Torus propa_cum_sum_block) -> Torus {
uint64_t mask = (2 << index) - 1;
if (propa_cum_sum_block >= (2 << index)) {
return 2ull; // Generates
} else if ((propa_cum_sum_block & mask) == mask) {
return 1ull; // Propagate
} else {
return 0ull; // Nothing
}
};
lut_funcs.push_back(f_other_groupings_inner_propagation);
lut_ids.push_back(lut_id);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
for (int index = 0; index < grouping_size - 1; index++) {
uint32_t lut_id = index + 2 * grouping_size;
auto f_group_propagation = [index, block_modulus,
num_bits_in_block](Torus block) -> Torus {
if (block == (block_modulus - 1)) {
return 0ull;
} else {
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
}
};
lut_funcs.push_back(f_group_propagation);
lut_ids.push_back(lut_id);
}
} else {
uint32_t lut_id = 2 * grouping_size;
auto f_group_propagation = [block_modulus](Torus block) {
if (block == (block_modulus - 1)) {
return 2ull;
} else {
return UINT64_MAX % (block_modulus * 2ull);
}
};
lut_funcs.push_back(f_group_propagation);
lut_ids.push_back(lut_id);
}
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_second_step->generate_and_broadcast_lut(
active_streams, lut_ids, lut_funcs, gpu_memory_allocated);
// luts_array_second_step->broadcast_lut(active_streams);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
luts_array_second_step->broadcast_lut(active_streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
@@ -1978,8 +1955,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
auto new_active_streams = streams.active_gpu_subset(
new_num_blocks, luts_array_second_step->params.pbs_type);
auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(new_active_streams, false);
@@ -2042,28 +2018,12 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t requested_flag;
bool gpu_memory_allocated;
void setup_message_extract_indices_for_carry_async(CudaStreams streams,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
@@ -2086,6 +2046,24 @@ template <typename Torus> struct int_sc_prop_memory {
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
// This store a single block that with be used to store the overflow or
// carry results
output_flag = new CudaRadixCiphertextFFI;
@@ -2136,30 +2114,22 @@ template <typename Torus> struct int_sc_prop_memory {
return output1 << 3 | output2 << 2;
};
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut_overflow_flag_prep->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_overflow_fp}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_overflow_flag_prep->get_lut(0, 0),
lut_overflow_flag_prep->get_degree(0),
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1);
lut_overflow_flag_prep->broadcast_lut(active_streams);
}
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
auto active_streams =
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
// For the final cleanup in case of overflow or carry (it seems that I can)
// It seems that this lut could be apply together with the other one but for
// now we won't do it
switch (requested_flag) {
case outputFlag::FLAG_OVERFLOW: { // Overflow case
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
auto f_overflow_last = [num_radix_blocks,
requested_flag_in](Torus block) -> Torus {
uint32_t position = (num_radix_blocks == 1 &&
@@ -2171,38 +2141,61 @@ template <typename Torus> struct int_sc_prop_memory {
Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
if (input_carry == outputFlag::FLAG_OVERFLOW) {
return does_overflow_if_carry_is_1;
} else {
return does_overflow_if_carry_is_0;
}
return does_overflow_if_carry_is_0;
};
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
allocate_gpu_memory);
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0, 1}, {f_message_extract, f_overflow_last},
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
gpu_memory_allocated);
break;
}
case outputFlag::FLAG_CARRY: { // Carry case
setup_message_extract_indices_for_carry_async(streams, num_radix_blocks,
allocate_gpu_memory);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
auto f_carry_last = [](Torus block) -> Torus {
return ((block >> 2) & 1);
};
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0, 1}, {f_message_extract, f_carry_last},
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last,
gpu_memory_allocated);
break;
}
default:
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
break;
}
// lut_message_extract->broadcast_lut(active_streams);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
lut_message_extract->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
@@ -2398,8 +2391,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
luts_array_first_step->broadcast_lut(active_streams);
};
@@ -2410,8 +2402,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
auto new_active_streams = streams.active_gpu_subset(
new_num_blocks, luts_array_first_step->params.pbs_type);
auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(new_active_streams, false);
}
@@ -2500,11 +2491,15 @@ template <typename Torus> struct int_borrow_prop_memory {
return (block >> 1) % message_modulus;
};
active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_streams = streams.active_gpu_subset(num_radix_blocks);
lut_message_extract->generate_and_broadcast_lut(
active_streams, {0}, {f_message_extract}, gpu_memory_allocated);
lut_message_extract->broadcast_lut(active_streams);
if (compute_overflow) {
lut_borrow_flag =
@@ -2515,12 +2510,15 @@ template <typename Torus> struct int_borrow_prop_memory {
return ((block >> 2) & 1);
};
lut_borrow_flag->generate_and_broadcast_lut(
active_streams, {0}, {f_borrow_flag}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(active_streams);
}
active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
active_streams = streams.active_gpu_subset(num_radix_blocks);
internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
2);
};

View File

@@ -37,14 +37,16 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_predicate_lut->get_lut(0, 0),
zero_out_predicate_lut->get_degree(0),
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
zero_out_predicate_lut->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {zero_out_predicate_lut_f},
gpu_memory_allocated);
// zero_out_predicate_lut->broadcast_lut(active_streams);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
@@ -52,7 +54,10 @@ template <typename Torus> struct int_mul_memory {
return;
}
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
// 'vector_result_lsb' contains blocks from all possible shifts of
// radix_lwe_left excluding zero ciphertext blocks
@@ -96,6 +101,18 @@ template <typename Torus> struct int_mul_memory {
return (x * y) / message_modulus;
};
// generate accumulators
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), lsb_acc,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_lsb, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), msb_acc,
luts_array->get_degree(1), luts_array->get_max_degree(1),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_msb, gpu_memory_allocated);
// lut_indexes_vec for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
@@ -105,12 +122,8 @@ template <typename Torus> struct int_mul_memory {
streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_streams =
streams.active_gpu_subset(total_block_count, params.pbs_type);
luts_array->generate_and_broadcast_bivariate_lut(
active_streams, {0, 1}, {lut_f_lsb, lut_f_msb}, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(total_block_count);
luts_array->broadcast_lut(active_streams);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, params, num_radix_blocks, 2 * num_radix_blocks,

View File

@@ -126,11 +126,9 @@ template <typename Torus> struct int_grouped_oprf_memory {
luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks_to_process * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
luts->broadcast_lut(active_streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_corrections);
}

View File

@@ -6,8 +6,6 @@ void release_radix_ciphertext_async(cudaStream_t const stream,
CudaRadixCiphertextFFI *data,
const bool gpu_memory_allocated);
void release_cpu_radix_ciphertext_async(CudaRadixCiphertextFFI *data);
void reset_radix_ciphertext_blocks(CudaRadixCiphertextFFI *data,
uint32_t new_num_blocks);

View File

@@ -85,11 +85,14 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
@@ -168,10 +171,15 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
// right shift
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cur_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {shift_lut_f}, gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
cur_lut_bivariate->get_lut(0, 0), cur_lut_bivariate->get_degree(0),
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
cur_lut_bivariate->broadcast_lut(active_streams);
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}
}
@@ -212,7 +220,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
auto active_streams = streams.active_gpu_subset(1);
// In the arithmetic shift, a PBS has to be applied to the last rotated
// block twice: once to shift it, once to compute the padding block to be
// copied onto all blocks to the left of the last rotated block
@@ -261,11 +269,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return shifted | padding;
};
auto active_streams_shift_last =
streams.active_gpu_subset(1, params.pbs_type);
shift_last_block_lut_univariate->generate_and_broadcast_lut(
active_streams_shift_last, {0}, {last_block_lut_f},
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_last_block_lut_univariate->get_lut(0, 0),
shift_last_block_lut_univariate->get_degree(0),
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_streams_shift_last = streams.active_gpu_subset(1);
shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
@@ -283,8 +295,15 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return (params.message_modulus - 1) * x_sign_bit;
};
padding_block_lut_univariate->generate_and_broadcast_lut(
active_streams, {0}, {padding_block_lut_f}, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
padding_block_lut_univariate->get_lut(0, 0),
padding_block_lut_univariate->get_degree(0),
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
// auto active_streams = streams.active_gpu_subset(1);
padding_block_lut_univariate->broadcast_lut(active_streams);
lut_buffers_univariate.push_back(padding_block_lut_univariate);
@@ -317,11 +336,16 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
return message_of_current_block + carry_of_previous_block;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
shift_blocks_lut_bivariate->get_lut(0, 0),
shift_blocks_lut_bivariate->get_degree(0),
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_streams_shift_blocks =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
shift_blocks_lut_bivariate->generate_and_broadcast_bivariate_lut(
active_streams_shift_blocks, {0}, {blocks_lut_f},
gpu_memory_allocated);
streams.active_gpu_subset(num_radix_blocks);
shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}

View File

@@ -113,21 +113,27 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
else
return current_bit;
};
;
auto active_gpu_count_mux = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
mux_lut->generate_and_broadcast_lut(active_gpu_count_mux, {0}, {mux_lut_f},
gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), mux_lut->get_lut(0, 0),
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux =
streams.active_gpu_subset(bits_per_block * num_radix_blocks);
mux_lut->broadcast_lut(active_gpu_count_mux);
auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), cleaning_lut->get_lut(0, 0),
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
cleaning_lut->generate_and_broadcast_lut(
active_gpu_count_cleaning, {0}, {cleaning_lut_f}, gpu_memory_allocated);
streams.active_gpu_subset(num_radix_blocks);
cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
}
void release(CudaStreams streams) {

View File

@@ -74,26 +74,44 @@ template <typename Torus> struct int_overflowing_sub_memory {
luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_does_block_generate_carry,
luts_array->get_degree(0), luts_array->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_carry, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_does_block_generate_or_propagate, luts_array->get_degree(1),
luts_array->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate,
gpu_memory_allocated);
if (allocate_gpu_memory)
cuda_set_value_async<Torus>(streams.stream(0), streams.gpu_index(0),
luts_array->get_lut_indexes(0, 1), 1,
num_radix_blocks - 1);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_borrow_propagation_sum->generate_and_broadcast_bivariate_lut(
active_streams, {0}, {f_luts_borrow_propagation_sum},
gpu_memory_allocated);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_borrow_propagation_sum->get_lut(0, 0),
luts_borrow_propagation_sum->get_degree(0),
luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_luts_borrow_propagation_sum, gpu_memory_allocated);
luts_array->generate_and_broadcast_lut(
active_streams, {0, 1},
{f_lut_does_block_generate_carry,
f_lut_does_block_generate_or_propagate},
gpu_memory_allocated);
// generate luts (aka accumulators)
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc->get_lut(0, 0),
message_acc->get_degree(0), message_acc->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);
message_acc->generate_and_broadcast_lut(
active_streams, {0}, {f_message_acc}, gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
luts_array->broadcast_lut(active_streams);
luts_borrow_propagation_sum->broadcast_lut(active_streams);
message_acc->broadcast_lut(active_streams);
}
void release(CudaStreams streams) {

View File

@@ -38,8 +38,7 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();

View File

@@ -40,8 +40,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -155,8 +154,7 @@ template <typename Torus> struct int_possible_results_buffer {
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -209,8 +207,7 @@ template <typename Torus> struct int_possible_results_buffer {
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->broadcast_lut(
streams.active_gpu_subset(1, params.pbs_type));
current_lut->broadcast_lut(streams.active_gpu_subset(1));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -285,8 +282,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams);
@@ -298,10 +294,13 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {id_fn},
allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
this->stream_identity_luts[i] = lut;
}
@@ -314,17 +313,27 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->message_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
this->message_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {msg_fn},
allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->message_extract_lut->get_lut(0, 0),
this->message_extract_lut->get_degree(0),
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
this->carry_extract_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_blocks, params.pbs_type), {0}, {carry_fn},
allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->carry_extract_lut->get_lut(0, 0),
this->carry_extract_lut->get_degree(0),
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -619,8 +628,7 @@ template <typename Torus> struct int_unchecked_contains_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -695,8 +703,7 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1087,8 +1094,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1171,9 +1177,14 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, allocate_gpu_memory);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1183,9 +1194,13 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
}
void release(CudaStreams streams) {
@@ -1277,8 +1292,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1351,9 +1365,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->prefix_sum_lut = new int_radix_lut<Torus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
this->prefix_sum_lut->generate_and_broadcast_bivariate_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{prefix_sum_fn}, allocate_gpu_memory);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
this->prefix_sum_lut->get_lut(0, 0),
this->prefix_sum_lut->get_degree(0),
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
prefix_sum_fn, allocate_gpu_memory);
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
Torus val = x % params.message_modulus;
@@ -1363,9 +1382,13 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
};
this->cleanup_lut = new int_radix_lut<Torus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
this->cleanup_lut->generate_and_broadcast_lut(
streams.active_gpu_subset(num_inputs, params.pbs_type), {0},
{cleanup_fn}, allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
cleanup_fn, allocate_gpu_memory);
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
}
void release(CudaStreams streams) {
@@ -1439,8 +1462,7 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
@@ -1501,8 +1523,7 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams =
streams.active_gpu_subset(num_blocks, params.pbs_type);
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);

View File

@@ -5,14 +5,21 @@
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_64_64(
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64_32(
void cuda_keyswitch_gemm_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
@@ -24,20 +31,6 @@ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_lwes, bool allocate_gpu_memory);
void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
void cuda_keyswitch_gemm_lwe_ciphertext_vector_64_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
uint64_t scratch_cuda_keyswitch_gemm_64(void *stream, uint32_t gpu_index,
void **ks_tmp_memory,
uint32_t lwe_dimension_in,
@@ -72,10 +65,6 @@ void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream,
uint32_t gpu_index,
int8_t **fp_ks_buffer,
bool gpu_memory_allocated);
void cuda_closest_representable_64(void *stream, uint32_t gpu_index,
void const *input, void *output,
uint32_t base_log, uint32_t level_count);
}
#endif // CNCRT_KS_H_

View File

@@ -1,24 +0,0 @@
#ifndef TRIVIUM_H
#define TRIVIUM_H
#include "../integer/integer.h"
extern "C" {
uint64_t scratch_cuda_trivium_64(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type, uint32_t num_inputs);
void cuda_trivium_generate_keystream_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *keystream_output,
const CudaRadixCiphertextFFI *key, const CudaRadixCiphertextFFI *iv,
uint32_t num_inputs, uint32_t num_steps, int8_t *mem_ptr, void *const *bsks,
void *const *ksks);
void cleanup_cuda_trivium_64(CudaStreamsFFI streams, int8_t **mem_ptr_void);
}
#endif

Some files were not shown because too many files have changed in this diff Show More