Compare commits

..

3 Commits

Author SHA1 Message Date
Agnes Leroy
84af5e983a revert optim flags 2025-05-13 10:05:23 +02:00
Agnes Leroy
069d47309d fix 2025-05-12 17:35:31 +02:00
Beka Barbakadze
7389494c45 draft 2025-05-12 15:38:19 +02:00
364 changed files with 1248 additions and 46640 deletions

View File

@@ -6,7 +6,6 @@ self-hosted-runner:
- large_windows_16_latest
- large_ubuntu_16
- large_ubuntu_16-22.04
- v80-desktop
# Configuration variables in array of strings defined in your repository or
# organization. `null` means disabling configuration variables check.
# Empty array means no configuration variable is allowed.

View File

@@ -23,8 +23,8 @@ on:
workflow_dispatch:
pull_request:
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -24,8 +24,8 @@ on:
workflow_dispatch:
pull_request:
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -30,8 +30,8 @@ on:
branches:
- main
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -30,8 +30,8 @@ on:
branches:
- main
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -27,8 +27,8 @@ on:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -23,8 +23,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -93,7 +93,7 @@ jobs:
- name: Parse key sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/boolean_key_sizes.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/boolean_key_sizes.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results

View File

@@ -97,13 +97,13 @@ jobs:
- name: Parse swap request PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/dex_swap_request_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results
- name: Parse swap claim PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/dex_swap_claim_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results

View File

@@ -98,7 +98,7 @@ jobs:
- name: Parse PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/erc20_pbs_count.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/erc20_pbs_count.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results

View File

@@ -10,14 +10,13 @@ on:
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100x8-SXM5)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
command:
description: "Benchmark command to run"
type: choice

View File

@@ -22,8 +22,8 @@ on:
# Weekly benchmarks will be triggered each Friday at 9p.m.
- cron: "0 21 * * 5"
permissions:
contents: read
permissions: {}
jobs:
cuda-integer-benchmarks:

View File

@@ -275,11 +275,11 @@ jobs:
- name: Run benchmarks
run: |
make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${BENCH_PARAMS_TYPE}" bench_"${BENCH_COMMAND}"_gpu
make BENCH_OP_FLAVOR="${OP_FLAVOR}" BENCH_TYPE="${BENCH_TYPE}" BENCH_PARAM_TYPE="${PARAMS_TYPE}" bench_"${COMMAND}"_gpu
env:
OP_FLAVOR: ${{ matrix.op_flavor }}
BENCH_TYPE: ${{ matrix.bench_type }}
BENCH_PARAMS_TYPE: ${{ matrix.params_type }}
BENCH_PARAM_TYPE: ${{ matrix.params_type }}
BENCH_COMMAND: ${{ matrix.command }}
- name: Parse results

View File

@@ -10,8 +10,6 @@ on:
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"

View File

@@ -10,8 +10,6 @@ on:
type: choice
options:
- "l40 (n3-L40x1)"
- "4-l40 (n3-L40x4)"
- "multi-a100-nvlink (n3-A100x8-NVLink)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"

View File

@@ -1,88 +0,0 @@
# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot.
name: Hpu Integer Benchmarks
on:
workflow_dispatch:
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
permissions: {}
jobs:
integer-benchmarks-hpu:
name: Execute integer & erc20 benchmarks for HPU backend
runs-on: v80-desktop
concurrency:
group: ${{ github.workflow }}_${{ github.ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
timeout-minutes: 1440 # 24 hours
steps:
# Needed as long as hw_regmap repository is private
- name: Configure SSH
uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make bench_integer_hpu
make bench_hlapi_erc20_hpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "hpu_x1" \
--backend hpu \
--project-version "${COMMIT_HASH}" \
--branch "${REF_NAME}" \
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--walk-subdirs
env:
REF_NAME: ${{ github.ref_name }}
- name: Upload parsed results artifact
uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08
with:
name: ${{ github.sha }}_integer_benchmarks
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"

View File

@@ -137,7 +137,7 @@ jobs:
- name: Parse key sizes results
if: matrix.op_flavor == 'default'
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/shortint_key_sizes.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results

View File

@@ -146,7 +146,7 @@ jobs:
- name: Parse results
run: |
make parse_wasm_benchmarks
python3 ./ci/benchmark_parser.py tfhe-benchmark/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/wasm_pk_gen.csv "${RESULTS_FILENAME}" \
--database tfhe_rs \
--hardware "m6i.4xlarge" \
--project-version "${COMMIT_HASH}" \
@@ -154,7 +154,7 @@ jobs:
--commit-date "${COMMIT_DATE}" \
--bench-date "${BENCH_DATE}" \
--key-gen
rm tfhe-benchmark/wasm_pk_gen.csv
rm tfhe/wasm_pk_gen.csv
env:
REF_NAME: ${{ github.ref_name }}
@@ -167,7 +167,7 @@ jobs:
- name: Parse key and ciphertext sizes results
if: matrix.browser == 'chrome'
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/hlapi_cpk_and_cctl_sizes.csv "${RESULTS_FILENAME}" \
--key-gen \
--append-results

View File

@@ -184,7 +184,7 @@ jobs:
- name: Parse CRS sizes results
run: |
python3 ./ci/benchmark_parser.py tfhe-benchmark/pke_zk_crs_sizes.csv "${RESULTS_FILENAME}" \
python3 ./ci/benchmark_parser.py tfhe/pke_zk_crs_sizes.csv "${RESULTS_FILENAME}" \
--object-sizes \
--append-results

View File

@@ -14,8 +14,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
cargo-builds:
@@ -94,10 +94,5 @@ jobs:
run: |
make build_tfhe_coverage
- name: Run Hpu pcc checks
if: ${{ contains(matrix.os, 'ubuntu') }}
run: |
make pcc_hpu
# The wasm build check is a bit annoying to set-up here and is done during the tests in
# aws_tfhe_tests.yml

View File

@@ -12,8 +12,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
cargo-builds-fft:

View File

@@ -12,8 +12,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
cargo-builds-ntt:

View File

@@ -16,8 +16,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
should-run:
@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
runner_type: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -82,7 +82,7 @@ jobs:
runs-on: ${{ matrix.runner_type }}
strategy:
matrix:
runner_type: [ ubuntu-latest, macos-latest, windows-latest ]
runner_type: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -16,8 +16,8 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
should-run:
@@ -51,7 +51,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
@@ -77,7 +77,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ ubuntu-latest, macos-latest, windows-latest ]
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:

View File

@@ -3,9 +3,8 @@ name: Check commit and PR compliance
on:
pull_request:
permissions:
contents: read
pull-requests: read # Permission needed to scan commits in a pull-request
permissions: {}
jobs:
check-commit-pr:

View File

@@ -9,8 +9,7 @@ env:
ACTIONLINT_CHECKSUM: "023070a287cd8cccd71515fedc843f1985bf96c436b7effaecce67290e7e0757"
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
permissions:
contents: read
permissions: {}
jobs:
lint-check:

View File

@@ -17,8 +17,8 @@ on:
workflow_dispatch:
# Code coverage workflow is only run via workflow_dispatch event since execution duration is not stabilized yet.
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -21,8 +21,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -22,8 +22,8 @@ on:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read
permissions: {}
jobs:
cuda-tests-linux:

View File

@@ -25,8 +25,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -24,8 +24,8 @@ on:
workflow_dispatch:
pull_request:
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -25,8 +25,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -19,8 +19,8 @@ on:
# Nightly tests will be triggered each evening 8p.m.
- cron: "0 20 * * *"
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -23,8 +23,8 @@ env:
on:
pull_request:
permissions:
contents: read
permissions: {}
jobs:
setup-instance:

View File

@@ -25,8 +25,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -25,8 +25,9 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -29,8 +29,8 @@ on:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -25,8 +25,9 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -25,8 +25,8 @@ on:
pull_request:
types: [ labeled ]
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -29,8 +29,8 @@ on:
# Nightly tests @ 1AM after each work day
- cron: "0 1 * * MON-FRI"
permissions:
contents: read
permissions: {}
jobs:
should-run:

View File

@@ -1,73 +0,0 @@
# Test tfhe-fft
name: Cargo Test HLAPI HPU
on:
pull_request:
push:
branches:
- main
env:
CARGO_TERM_COLOR: always
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
permissions: { }
jobs:
should-run:
runs-on: ubuntu-latest
permissions:
pull-requests: read
outputs:
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }}
steps:
- name: Checkout tfhe-rs
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Check for file changes
id: changed-files
uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5
with:
files_yaml: |
hpu:
- tfhe/Cargo.toml
- Makefile
- backends/tfhe-hpu-backend/**
- mockups/tfhe-hpu-mockup/**
cargo-tests-hpu:
needs: should-run
if: needs.should-run.outputs.hpu_test == 'true'
runs-on: large_ubuntu_16
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
persist-credentials: 'false'
token: ${{ env.CHECKOUT_TOKEN }}
- name: Install Rust
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af
with:
toolchain: stable
override: true
- name: Install Just
run: |
cargo install just
- name: Test HLAPI HPU
run: |
source setup_hpu.sh
just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup &
make HPU_CONFIG=sim test_high_level_api_hpu

View File

@@ -27,8 +27,7 @@ concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: true
permissions:
contents: read
permissions: {}
jobs:
cargo-builds-m1:

View File

@@ -110,10 +110,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Generate hash
id: published_hash

View File

@@ -159,10 +159,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe-cuda-backend --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Generate hash
id: published_hash

View File

@@ -1,105 +0,0 @@
name: Publish HPU release
on:
workflow_dispatch:
inputs:
dry_run:
description: "Dry-run"
type: boolean
default: true
env:
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
permissions: {}
jobs:
verify_tag:
uses: ./.github/workflows/verify_tagged_commit.yml
secrets:
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }}
READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }}
package:
runs-on: ubuntu-latest
needs: verify_tag
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Prepare package
run: |
cargo package -p tfhe-hpu-backend
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: crate
path: target/package/*.crate
- name: generate hash
id: hash
run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
provenance:
if: ${{ !inputs.dry_run }}
needs: [package]
uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0
permissions:
# Needed to detect the GitHub Actions environment
actions: read
# Needed to create the provenance via GitHub OIDC
id-token: write
# Needed to upload assets/artifacts
contents: write
with:
# SHA-256 hashes of the Crate package.
base64-subjects: ${{ needs.package.outputs.hash }}
publish_release:
name: Publish tfhe-hpu-backend Release
runs-on: ubuntu-latest
needs: [verify_tag, package] # for comparing hashes
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Publish crate.io package
env:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN}
- name: Generate hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"
- name: Slack notification (hashes comparison)
if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: failure
SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})"
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})"

View File

@@ -84,10 +84,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe-csprng --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Generate hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

View File

@@ -80,10 +80,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe-fft --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Generate hash
id: published_hash

View File

@@ -80,10 +80,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe-ntt --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Generate hash
id: published_hash

View File

@@ -81,10 +81,7 @@ jobs:
CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }}
run: |
# DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish
# would fail. This is safe since DRY_RUN is handled in the env section above.
# shellcheck disable=SC2086
cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" ${DRY_RUN}
cargo publish -p tfhe-zk-pok --token "${CRATES_TOKEN}" "${DRY_RUN}"
- name: Verify hash
id: published_hash
run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}"

View File

@@ -1,2 +0,0 @@
[lfs]
fetchexclude = *

View File

@@ -9,12 +9,10 @@ members = [
"tasks",
"tfhe-csprng",
"backends/tfhe-cuda-backend",
"backends/tfhe-hpu-backend",
"utils/tfhe-versionable",
"utils/tfhe-versionable-derive",
"utils/param_dedup",
"tests",
"mockups/tfhe-hpu-mockup",
]
exclude = [

View File

@@ -2,7 +2,6 @@ SHELL:=$(shell /usr/bin/env which bash)
OS:=$(shell uname)
RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n')
CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN)
CARGO_BUILD_JOBS=default
CPU_COUNT=$(shell ./scripts/cpu_count.sh)
RS_BUILD_TOOLCHAIN:=stable
CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN)
@@ -56,9 +55,6 @@ REGEX_PATTERN?=''
TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build
# tfhe-hpu-backend
HPU_CONFIG=v80
# Exclude these files from coverage reports
define COVERAGE_EXCLUDED_FILES
--exclude-files apps/trivium/src/trivium/* \
@@ -305,13 +301,6 @@ check_gpu: install_rs_check_toolchain
--all-targets \
-p $(TFHE_SPEC)
.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled
clippy_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \
--features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \
--all-targets \
-p $(TFHE_SPEC) -- --no-deps -D warnings
.PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant
fix_newline: check_linelint_installed
linelint -a .
@@ -484,11 +473,6 @@ clippy_cuda_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-cuda-backend -- --no-deps -D warnings
.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend
clippy_hpu_backend: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
-p tfhe-hpu-backend -- --no-deps -D warnings
.PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend
check_rust_bindings_did_not_change:
cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \
@@ -718,28 +702,6 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n
--cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \
--signed-only --tfhe-package "$(TFHE_SPEC)"
.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend
test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest
cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu
.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup
test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --release --bin hpu_mockup; \
coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup.
test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest
source ./setup_hpu.sh --config sim ; \
cargo build --profile devo --bin hpu_mockup; \
coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \
HPU_TEST_ITER=1 \
cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32 && \
kill %1
.PHONY: test_boolean # Run the tests of the boolean module
test_boolean: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -895,22 +857,6 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
--features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*gpu.*/)"
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
else
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
--build-jobs=$(CARGO_BUILD_JOBS) \
--test-threads=1 \
--features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \
-E "test(/high_level_api::.*hpu.*/)"
endif
.PHONY: test_strings # Run the tests for strings ci
test_strings: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
@@ -1154,12 +1100,6 @@ clippy_bench_gpu: install_rs_check_toolchain
--features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark
clippy_bench_hpu: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \
--features=hpu,shortint,integer,internal-keycache,pbs-stats\
-p tfhe-benchmark -- --no-deps -D warnings
.PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks
print_doc_bench_parameters:
RUSTFLAGS="" cargo run --example print_doc_bench_parameters \
@@ -1193,14 +1133,6 @@ bench_signed_integer_gpu: install_rs_check_toolchain
--bench integer-signed-bench \
--features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark --
.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend
bench_integer_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench integer-bench \
--features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression
bench_integer_compression: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \
@@ -1392,14 +1324,6 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain
--bench hlapi-dex \
--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark --
.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ECR20 operations on HPU
bench_hlapi_erc20_hpu: install_rs_check_toolchain
source ./setup_hpu.sh --config $(HPU_CONFIG) ; \
RUSTFLAGS="$(RUSTFLAGS)" \
cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
--bench hlapi-erc20 \
--features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick
.PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
bench_tfhe_zk_pok: install_rs_check_toolchain
RUSTFLAGS="$(RUSTFLAGS)" \
@@ -1499,9 +1423,6 @@ tfhe_lints
pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \
clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu
.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation
pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast
.PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast
fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \
check_md_docs_are_tested clippy_fast check_compile_tests

View File

@@ -11,13 +11,11 @@ extend-ignore-identifiers-re = [
# Example with string replacing "hello" with "herlo"
"herlo",
# Example in trivium
"C9217BA0D762ACA1",
"0x[0-9a-fA-F]+"
"C9217BA0D762ACA1"
]
[files]
extend-exclude = [
"backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu",
"backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu",
"backends/tfhe-hpu-backend/config_store/**/*.link_summary",
]

View File

@@ -1116,14 +1116,32 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;
int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
int_radix_params params;
uint32_t active_gpu_count;
size_t chunk_size;
size_t max_pbs_count;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
uint32_t *d_pbs_counters;
// additional streams
cudaStream_t *helper_streams;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
bool gpu_memory_allocated;
@@ -1137,100 +1155,139 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->max_pbs_count =
num_blocks_in_radix * max_num_radix_in_vec * 2 / chunk_size;
this->active_gpu_count = get_active_gpu_count(2 * max_pbs_count, gpu_count);
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
size_t max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
uint32_t message_modulus = params.message_modulus;
printf("max_total_blocks_in_vec: %d\n", max_total_blocks_in_vec);
// process streams
helper_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
helper_streams[j] = cuda_create_stream(gpu_indexes[j]);
}
// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
auto setup_columns = [num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes, size_tracker, allocate_gpu_memory](
uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], allocate_gpu_memory);
// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
columns, h_columns, num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};
d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
setup_columns(d_columns, d_columns_data, d_columns_counter);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter);
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
allocate_gpu_memory, size_tracker);
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, allocate_gpu_memory);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(helper_streams[i], gpu_indexes[i]);
}
free(helper_streams);
if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete luts_message_carry;
delete current_blocks;
delete small_lwe_vector;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};
// For sequential algorithm in group propagation
@@ -2604,8 +2661,7 @@ template <typename Torus> struct int_mul_memory {
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
allocate_gpu_memory, size_tracker);
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(

View File

@@ -212,6 +212,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {
printf("pbs_type: %d\n", pbs_type);
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -234,11 +235,6 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(

View File

@@ -20,6 +20,7 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
@@ -123,6 +124,173 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}
}
__global__ inline void radix_vec_to_columns(
uint32_t *const *const columns, uint32_t *const columns_counter,
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t total_blocks_in_vec) {
const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= total_blocks_in_vec)
return;
const uint64_t degree = degrees[idx];
if (degree == 0)
return;
const uint32_t column_id = idx % num_radix_blocks;
const uint32_t out_idx = atomicAdd(&columns_counter[column_id], 1);
columns[column_id][out_idx] = idx;
}
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, uint32_t *const pbs_counters,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size) {
__shared__ uint32_t counter, sharedOr;
if (threadIdx.x == 0) {
counter = 0;
sharedOr = 0;
}
__syncthreads();
const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];
uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}
__syncthreads();
uint32_t message_count = counter;
if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];
for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}
const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}
new_columns_counter[base_id] = ct_count;
if (ct_count > chunk_size) {
atomicOr(&sharedOr, 1);
}
__syncthreads();
if (threadIdx.x == 0) {
pbs_counters[0] = counter;
pbs_counters[1] = message_count;
pbs_counters[2] = sharedOr;
}
}
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;
for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}
input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {
const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;
if (coef_id >= block_size)
return;
const uint32_t column_len = columns_counter[base_id];
Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];
for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
Torus *msb_blocks,
@@ -167,6 +335,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}
inline bool at_least_one_column_needs_processing(
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
std::vector<uint32_t> columns_count(num_radix_blocks, 0);
for (size_t column = 0; column < num_radix_blocks; ++column) {
for (size_t block = 0; block < num_radix_in_vec; ++block) {
const size_t block_index = block * num_radix_blocks + column;
if (degrees[block_index]) {
columns_count[column]++;
if (columns_count[column] > chunk_size) {
return true;
}
}
}
}
return false;
}
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}
for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}
for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -181,6 +408,70 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
return size_tracker;
}
void static DEBUG_PRINT_COLUMNS(uint32_t *d_column_data,
uint32_t *d_columns_count,
uint64_t *d_pbs_indexes_in,
uint64_t *d_pbs_indexes_out,
uint64_t *d_lut_indexes, int L, int N,
int pbs_cnt) {
cudaDeviceSynchronize(); // Ensure all device work is done
std::vector<uint64_t> h_pbs_indexes_in(pbs_cnt);
std::vector<uint64_t> h_pbs_indexes_out(pbs_cnt);
std::vector<uint64_t> h_lut_indexes(pbs_cnt);
check_cuda_error(cudaMemcpy(h_pbs_indexes_in.data(), d_pbs_indexes_in,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_pbs_indexes_out.data(), d_pbs_indexes_out,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_lut_indexes.data(), d_lut_indexes,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
std::vector<uint32_t> h_columns_count(L);
check_cuda_error(cudaMemcpy(h_columns_count.data(), d_columns_count,
L * sizeof(uint32_t), cudaMemcpyDeviceToHost));
std::vector<uint32_t> h_column_data(L * N);
check_cuda_error(cudaMemcpy(h_column_data.data(), d_column_data,
L * N * sizeof(uint32_t),
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize(); // Ensure all device work is done
std::cout << "column_counters: ";
for (auto a : h_columns_count) {
std::cout << a << " ";
}
std::cout << std::endl;
for (int col = 0; col < L; ++col) {
std::cout << "Column[" << col << "]: ";
uint32_t count = h_columns_count[col];
for (uint32_t i = 0; i < count; ++i) {
std::cout << h_column_data[col * N + i] << " ";
}
std::cout << "\n";
}
printf("pbs_indexes %d\n", pbs_cnt);
for (auto a : h_pbs_indexes_in) {
printf("%d ", a);
}
printf("\n");
for (auto a : h_pbs_indexes_out) {
printf("%d ", a);
}
printf("\n");
for (auto a : h_lut_indexes) {
printf("%d ", a);
}
printf("\n");
printf("=========================================================\n");
}
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -199,22 +490,30 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;
auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_pbs_counters = mem_ptr->d_pbs_counters;
auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;
auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto helper_streams = mem_ptr->helper_streams;
auto chunk_size = mem_ptr->chunk_size;
size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;
// In the case of extracting a single LWE this parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +527,153 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}
if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}
size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;
size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];
/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));
// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);
// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
int number_of_threads = 512;
int number_of_blocks =
(total_blocks_in_vec + number_of_threads - 1) / number_of_threads;
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);
radix_vec_to_columns<<<number_of_blocks, number_of_threads, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
total_blocks_in_vec);
cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
DEBUG_PRINT_COLUMNS(mem_ptr->d_columns_data, d_columns_counter,
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, 0);
bool needs_processing = at_least_one_column_needs_processing(
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);
check_cuda_error(cudaGetLastError());
number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);
size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;
// h_pbs_counters[0] - total ciphertexts
// h_pbs_counters[1] - message ciphertexts
// h_pbs_counters[2] - at_leaast_one_column_needs_processing
uint32_t *h_pbs_counters = (uint32_t *)malloc(3 * sizeof(uint32_t));
generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
int DEBUG_I = 0;
while (needs_processing) {
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);
size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0,
helper_streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
d_pbs_counters, d_columns, d_columns_counter, chunk_size);
// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
3 * sizeof(uint32_t), helper_streams[0],
gpu_indexes[0]);
if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);
cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
const uint32_t total_ciphertexts = h_pbs_counters[0];
const uint32_t total_messages = h_pbs_counters[1];
needs_processing = (h_pbs_counters[2] != 0);
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;
auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
if (DEBUG_I % 2 == 0) {
DEBUG_PRINT_COLUMNS(
mem_ptr->d_new_columns_data, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, total_ciphertexts);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);
/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
DEBUG_PRINT_COLUMNS(
mem_ptr->d_columns_data, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, total_ciphertexts);
}
int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
cudaDeviceSynchronize();
if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
printf("total_messages: %d\n", total_messages);
printf("total_ct: %d\n", total_ciphertexts);
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, total_messages);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
++DEBUG_I;
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);
prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, helper_streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);
cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);
print_debug<Torus>("indexes_in", d_pbs_indexes_in, 2 * num_radix_blocks);
print_debug<Torus>("indexes_out", d_pbs_indexes_out, 2 * num_radix_blocks);
print_debug<Torus>("lut_indexes", luts_message_carry->get_lut_indexes(0, 0),
2 * num_radix_blocks);
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
(Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr, d_pbs_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice, num_radix_blocks);
}
template <typename Torus, class params>

View File

@@ -118,7 +118,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
}
__syncthreads();
auto accumulator = accumulator_rotated;
if (blockIdx.z == 0) {

View File

@@ -357,16 +357,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
input_lwe_ciphertext_count, max_shared_memory)) {
printf("it is cg\n");
return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
else
} else {
printf("it is default\n");
return scratch_cuda_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}
}
template <typename Torus>

View File

@@ -1,3 +0,0 @@
*.xclbin filter=lfs diff=lfs merge=lfs -text
*.pdi filter=lfs diff=lfs merge=lfs -text
python/lib/example.json filter=lfs diff=lfs merge=lfs -text

View File

@@ -1,3 +0,0 @@
ngt_*
config
kogge_cfg.toml

View File

@@ -1,88 +0,0 @@
[package]
name = "tfhe-hpu-backend"
version = "0.1.0"
edition = "2021"
license = "BSD-3-Clause-Clear"
description = "HPU implementation on FPGA of TFHE-rs primitives."
homepage = "https://www.zama.ai/"
documentation = "https://docs.zama.ai/tfhe-rs"
repository = "https://github.com/zama-ai/tfhe-rs"
readme = "README.md"
keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography", "hardware", "fpga"]
[features]
hw-xrt = []
hw-v80 = []
io-dump = ["num-traits"]
rtl_graph = ["dot2"]
utils = ["clap", "clap-num", "bitvec", "serde_json"]
[build-dependencies]
cxx-build = "1.0"
[dependencies]
cxx = "1.0"
hw_regmap = "0.1.0"
strum = { version = "0.26.2", features = ["derive"] }
strum_macros = "0.26.2"
enum_dispatch = "0.3.13"
tracing = "0.1.40"
tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
serde = { version = "1", features = ["derive"] }
toml = { version = "0.8.*", features = [] }
paste = "1.0.15"
thiserror = "1.0.61"
bytemuck = "1.16.0"
anyhow = "1.0.82"
lazy_static = "1.4.0"
rand = "0.8.5"
regex = "1.10.4"
bitflags = { version = "2.5.0", features = ["serde"] }
itertools = "0.11.0"
lru = "0.12.3"
bitfield-struct = "0.10.0"
crossbeam = { version = "0.8.4", features = ["crossbeam-queue"] }
rayon = { workspace = true }
# Dependencies used for Sim feature
ipc-channel = "0.18.3"
# Dependencies used for debug feature
num-traits = { version = "*", optional = true }
clap = { version = "4.4.4", features = ["derive"], optional = true }
clap-num = { version = "1.1.1", optional = true }
nix = { version = "0.29.0", features = ["ioctl", "uio"] }
# Dependencies used for rtl_graph features
dot2 = { version = "*", optional = true }
bitvec = { version = "*", optional = true }
serde_json = { version = "*", optional = true }
# Binary for manual debugging
# Enable to access Hpu register and drive some custom sequence by hand
[[bin]]
name = "hputil"
path = "src/utils/hputil.rs"
required-features = ["utils"]
# Binary for asm manipulation
# Enable to convert back and forth between asm/hex format
[[bin]]
name = "dop_fmt"
path = "src/utils/dop_fmt.rs"
required-features = ["utils"]
# Enable to convert back and forth between asm/hex format
[[bin]]
name = "iop_fmt"
path = "src/utils/iop_fmt.rs"
required-features = ["utils"]
# Firmware generation
# Enable to expand IOp in list of Dop for inspection
[[bin]]
name = "fw"
path = "src/utils/fw.rs"
required-features = ["utils"]

View File

@@ -1,28 +0,0 @@
BSD 3-Clause Clear License
Copyright © 2025 ZAMA.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or other
materials provided with the distribution.
3. Neither the name of ZAMA nor the names of its contributors may be used to endorse
or promote products derived from this software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE.
THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -1,261 +0,0 @@
# TFHE-hpu-backend
## Brief
The `tfhe-hpu-backend` holds the code to interface with the HPU accelerator of TFHE.
It contains a `HpuDevice` abstraction that enables easy configuration and dispatching of TFHE operations on the HPU accelerator.
The user API exposes the following functions for hardware setup:
- `HpuDevice::new`, `HpuDevice::from_config`: Instantiates abstraction device from configuration file.
- `HpuDevice::init`: Configures and uploads the required public material.
- `new_var_from`: Creates a HPU ciphertext from `tfhe-rs` ciphertext.
HPU device could also be used from `integer` with the help of the following function:
- `tfhe::integer::hpu::init_device`: Init given HPU device with server key.
- `tfhe::integer::hpu::ciphertext::HpuRadixCiphertext::from_radix_ciphertext`: Convert a CpuRadixCiphertext in it's HPU counterpart.
HPU device could also be used seamlessly from `hl-api` by setting up a thread-local HPU server key:
- `tfhe::Config::from_hpu_device`: Extract hl-api configuration from HpuDevice.
- `tfhe::set_server_key`: Register the Hpu server key in the current thread.
HPU variables could also be created from a `high-level-api` object, with the help of the `hw-xfer` feature.
This implements a trait that enables `clone_on`, `mv_on` `FheUint` object on the HPU accelerator, and cast back `from` them.
These objects implement the `std::ops` trait and could be used to dispatch operations on HPU hardware.
### Backend structure
`tfhe-hpu-backend` is split in various modules:
- `entities`: Defines structure handled by HPU accelerator. Conversion traits from/into those objects are implemented in `tfhe-rs`.
- `asm`: Describes assembly-like language for the HPU. It enables abstract HPU behavior and easily updates it through micro-code.
- `fw`: Abstraction to help the micro-code designer. Uses a simple rust program for describing new HPU operations. Helps with register/heap management.
- `interface`:
+ `device`: High-level structure that exposes the User API.
+ `backend`: Inner private structure that contains HPU modules
+ `variable`: Wraps HPU ciphertexts. It enables to hook an hardware object lifetime within the `rust` borrow-checker.
+ `memory`: Handles on-board memory allocation and synchronization
+ `config`: Helps to configure HPU accelerator through a TOML configuration file
+ `cmd`: Translates operation over `variable` in concrete HPU commands
+ `regmap`: Communicates with the HPU internal register with ease.
+ `rtl`: Defines concrete `rust` structure populated from HPU's status/configuration registers
Below is an overview of the internal structure of the Backend.
![HPU backend structure](./figures/tfhe-hpu-backend.excalidraw.png)
This picture depicts the internal modules of `tfhe-hpu-backend`, Device is the main entry point for the user. Its lifecycle is as follows:
1. Create HpuDevice, open link with the associated FPGA. Configure associated drivers and upload the bitstream. Read FPGA registers to extract supported configuration and features. Build Firmware conversion table (IOp -> DOps stream).
2. Allocate required memory chunks in the on-board memory. Upload public material required by TFHE computation.
3. Create HPU variables that handle TFHE Ciphertexts. It wraps TFHE Ciphertext with required internal resources and enforces the correct lifetime management. This abstraction enforces that during the variable lifecycle all required resources are valid.
4. Users could trigger HPU operation from the HPU variable.
Variable abstraction enforces that required objects are correctly synced on the hardware and converts each operation in a concrete HPU command.
When HPU operation is acknowledged by the hardware, the internal state of the associated variable is updated.
This mechanism enables asynchronous operation and minimal amount of Host to/from HW memory transfer.
This mechanism also enables offloading a computation graph to the HPU and requires a synchronization only on the final results.
## Example
### Configuration file
HPU configuration knobs are gathered in a TOML configuration file. This file describes the targeted FPGA with its associated configuration:
```toml
[fpga] # FPGA target
# Register layout in the FPGA
regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"]
polling_us=10
[fpga.ffi.V80] # Hardware properties
ami_dev="/dev/ami1" # Name of ami device
qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-0" # QDma host to card device
qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-1" # QDma card to host device
[rtl] # RTL option
bpip_used = true # BPIP/IPIP mode
bpip_use_opportunism = false # Use strict flush paradigm
bpip_timeout = 100_000 # BPIP timeout in clock `cycles`
[board] # Board configuration
ct_mem = 32768 # Number of allocated ciphertext
ct_pc = [ # Memory used for ciphertext
{Hbm= {pc=32}},
{Hbm= {pc=33}},
]
heap_size = 16384 # Number of slots reserved for heap
lut_mem = 256 # Number of allocated LUT table
lut_pc = {Hbm={pc=34}} # Memory used for LUT
fw_size= 16777216 # Size in byte of the Firmware translation table
fw_pc = {Ddr= {offset= 0x3900_0000}} # Memory used for firmware translation table
bsk_pc = [ # Memory used for Bootstrapping key
{Hbm={pc=8}},
{Hbm={pc=12}},
{Hbm={pc=24}},
{Hbm={pc=28}},
{Hbm={pc=40}},
{Hbm={pc=44}},
{Hbm={pc=56}},
{Hbm={pc=60}}
]
ksk_pc = [ # Memory used for Keyswitching key
{Hbm={pc=0}},
{Hbm={pc=1}},
{Hbm={pc=2}},
{Hbm={pc=3}},
{Hbm={pc=4}},
{Hbm={pc=5}},
{Hbm={pc=6}},
{Hbm={pc=7}},
{Hbm={pc=16}},
{Hbm={pc=17}},
{Hbm={pc=18}},
{Hbm={pc=19}},
{Hbm={pc=20}},
{Hbm={pc=21}},
{Hbm={pc=22}},
{Hbm={pc=23}}
]
trace_pc = {Hbm={pc=35}} # Memory used for trace log
trace_depth = 32 # Size of Memory in MiB allocated for trace log
[firmware] # Firmware properties
implementation = "Llt" # Firmware flavor to use
integer_w=[4,6,8,10,12,14,16,32,64,128] # List of supported IOp width
min_batch_size = 11 # Minimum batch size for maximum throughput
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
# Default firmware configuration. Could be edited on per-IOp basis
[firmware.op_cfg.default]
fill_batch_fifo = true
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
```
### Device setup
Following code snippet shows how to instantiate and configure a `HpuDevice`:
```rust
// Following code snippets used the HighLevelApi abstraction
// Instantiate HpuDevice --------------------------------------------------
let hpu_device = HpuDevice::from_config(&args.config.expand());
// Generate keys ----------------------------------------------------------
let config = Config::from_hpu_device(&hpu_device);
let cks = ClientKey::generate(config);
let csks = CompressedServerKey::new(&cks);
// Register HpuDevice and key as thread-local engine
set_server_key((hpu_device, csks));
```
### Clone CPU ciphertext on HPU
Following code snippet shows how to convert CPU ciphertext in HPU one:
``` rust
// Draw random value as input
let a = rand::thread_rng().gen_range(0..u8::MAX);
// Encrypt them on Cpu side
let a_fhe = FheUint8::encrypt(a, &cks);
// Clone a ciphertext and move them in HpuWorld
// NB: Data doesn't move over Pcie at this stage
// Data are only arranged in Hpu ordered an copy in the host internal buffer
let a_hpu = a_fhe.clone_on(&hpu_device);
```
### Dispatch operation on HPU
Once registered as thread-local engine, HighLevel FheUint are converted in Hpu format.
Following code snippets show how to start operation on HPU:
``` rust
// Sum -------------------------------------------------------------
// Generate random inputs value and compute expected result
let in_a = rng.gen_range(0..u64::max_value());
let in_b = rng.gen_range(0..u64::max_value());
let clear_sum_ab = in_a.wrapping_add(in_b);
// Encrypt input value
let fhe_a = FheUint64::encrypt(in_a, cks);
let fhe_b = FheUint64::encrypt(in_b, cks);
// Triggered operation on HPU through hl_api
let fhe_sum_ab = fhe_a+fhe_b;
// Decrypt values
let dec_sum_ab: u64 = fhe_sum_ab.decrypt(cks);
```
## Pre-made Examples
There are some example applications already available in `tfhe/examples/hpu`:
* hpu_hlapi: Depict the used of HPU device through HighLevelApi.
* hpu_bench: Depict the used of HPU device through Integer abstraction level.
In order to run those applications on hardware, user must build from the project root (i.e `tfhe-rs-internal`) with `hpu-v80` features:
> NB: Running examples required to have correctly pulled the `.pdi` files. Those files, due to their size, are backed by git-lfs and disabled by default.
> In order to retrieve them, use the following command:
> ```bash
> git lfs pull --include="*" --exclude=""
> ```
``` bash
cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench
# Correctly setup environment with setup_hpu.sh script
source setup_hpu.sh --config v80 --init-qdma
./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10
./target/release/examples/hpu_hlapi
```
## Test framework
There is also a set of tests backed in tfhe-rs. Tests are gather in testbundle over various integer width.
Those tests have 5 sub-kind:
* `alu`: Run and check all ct x ct IOp
* `alus`: Run and check all ct x scalar IOp
* `bitwise`: Run and check all bitwise IOp
* `cmp`: Run and check all comparison IOp
* `ternary`: Run and check ternary operation
* `algo`: Run and check IOp dedicated to offload small algorithms
Snippets below give some example of command that could be used for testing:
``` bash
# Correctly setup environment with setup_hpu.sh script
source setup_hpu.sh --config v80 --init-qdma
# Run all sub-kind for 64b integer width
cargo test --release --features="hpu-v80" --test hpu -- u64
# Run only `bitwise` sub-kind for all integer width IOp
cargo test --release --features="hpu-v80" --test hpu -- bitwise
```
## Benches framework
HPU is completely integrated in tfhe benchmark system. Performances results could be extracted from HighLevelApi or Integer Api.
Three benchmarks could be started, through the following Makefile target for simplicity:
``` bash
# Do not forget to correctly set environment before hand
source setup_hpu.sh --config v80 --init-qdma
# Run hlapi benches
make test_high_level_api_hpu
# Run hlapi erc20 benches
make bench_hlapi_erc20_hpu
# Run integer level benches
make bench_integer_hpu
```
## Eager to start without real Hardware ?
You are still waiting your FPGA board and are frustrated by lead time ?
Don't worry, you have backed-up. A dedicated simulation infrastructure with accurate performance estimation is available in tfhe-rs.
You can use it on any linux/MacOs to test HPU integration within tfhe-rs and optimized your application for HPU target.
Simply through an eye to [Hpu mockup](../../mockups/tfhe-hpu-mockup/Reaadme.md), and follow the instruction.

View File

@@ -1,26 +0,0 @@
fn main() {
if cfg!(feature = "hw-xrt") {
println!("cargo:rustc-link-search=/opt/xilinx/xrt/lib");
println!("cargo:rustc-link-lib=dylib=stdc++");
println!("cargo:rustc-link-lib=dl");
println!("cargo:rustc-link-lib=rt");
println!("cargo:rustc-link-lib=uuid");
println!("cargo:rustc-link-lib=dylib=xrt_coreutil");
cxx_build::bridge("src/ffi/xrt/mod.rs")
.file("src/ffi/xrt/cxx/hpu_hw.cc")
.file("src/ffi/xrt/cxx/mem_zone.cc")
.flag_if_supported("-std=c++23")
.include("/opt/xilinx/xrt/include") // Enhance: support parsing bash env instead of hard path
.flag("-fmessage-length=0")
.compile("hpu-hw-ffi");
println!("cargo:rerun-if-changed=src/ffi/xrt/mod.rs");
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.cc");
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.h");
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.cc");
println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.h");
} else {
// Simulation ffi -> nothing to do
}
}

View File

@@ -1,15 +0,0 @@
# CUST_0
# Simple IOp to check the xfer between Hpu/Cpu
# Construct constant in dest slot -> 249 (0xf9)
SUB R0 R0 R0
ADDS R0 R0 1
ST TD[0].0 R0
SUB R1 R1 R1
ADDS R1 R1 2
ST TD[0].1 R1
SUB R2 R2 R2
ADDS R2 R2 3
ST TD[0].2 R2
SUB R3 R3 R3
ADDS R3 R3 3
ST TD[0].3 R3

View File

@@ -1,11 +0,0 @@
# CUST_1
# Simple IOp to check the xfer between Hpu/Cpu
# Dest <- Src_a
LD R0 TS[0].0
LD R1 TS[0].1
LD R2 TS[0].2
LD R3 TS[0].3
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,25 +0,0 @@
; CUST_8
; Simple IOp to check the ALU operation
; Dst[0].0 <- Src[0].0 + Src[1].0
LD R1 TS[0].0
LD R2 TS[1].0
ADD R0 R1 R2
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 + Src[1].1
LD R5 TS[0].1
LD R6 TS[1].1
ADD R4 R5 R6
ST TD[0].2 R4
; Dst[0].2 <- Src[0].2 + Src[1].2
LD R9 TS[0].2
LD R10 TS[1].2
ADD R8 R9 R10
ST TD[0].2 R8
; Dst[0].3 <- Src[0].3 + Src[1].3
LD R13 TS[0].3
LD R14 TS[1].3
ADD R12 R13 R14
ST TD[0].3 R0

View File

@@ -1,6 +0,0 @@
# CUST_16
# Simple IOp to check PBS behavior
# Dest <- PBSNone(Src_a.0)
LD R0 TS[0].0
PBS_F R0 R0 PbsNone
ST TD[0].0 R0

View File

@@ -1,15 +0,0 @@
# CUST_17
# Simple IOp to check PBS behavior
# Dest <- PBSNone(Src_a)
LD R0 TS[0].0
PBS R0 R0 PbsNone
ST TD[0].0 R0
LD R1 TS[0].1
PBS R1 R1 PbsNone
ST TD[0].1 R1
LD R2 TS[0].2
PBS R2 R2 PbsNone
ST TD[0].2 R2
LD R3 TS[0].3
PBS_F R3 R3 PbsNone
ST TD[0].3 R3

View File

@@ -1,23 +0,0 @@
; CUST_18
; Simple IOp to check extraction pattern
; Correct result:
; * Dst[0,1] <- Src[0][0,1]
; * Dst[2,3] <- Src[1][0,1]
; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1]
LD R0 TS[0].0
LD R1 TS[0].1
MAC R3 R1 R0 4
PBS R4 R3 PbsMsgOnly
PBS R5 R3 PbsCarryInMsg
ST TD[0].0 R4
ST TD[0].1 R5
; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3]
LD R10 TS[1].0
LD R11 TS[1].1
MAC R13 R11 R10 4
PBS R14 R13 PbsMsgOnly
PBS R15 R13 PbsCarryInMsg
ST TD[0].2 R14
ST TD[0].3 R15

View File

@@ -1,19 +0,0 @@
; CUST_19
; Simple IOp to check PbsMl2
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- 0
; * Dst[0][2] <- Src[0][0] +1
; * Dst[0][3] <- 0
; i.e Cust_19(0x2) => 0x32
; Construct a 0 for destination padding
SUB R16 R16 R16
; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded)
LD R0 TS[0].0
PBS_ML2_F R0 R0 PbsTestMany2
ST TD[0].0 R0
ST TD[0].1 R16
ST TD[0].2 R1
ST TD[0].3 R16

View File

@@ -1,11 +0,0 @@
# CUST_2
# Simple IOp to check the xfer between Hpu/Cpu
# Dest <- Src_b
LD R0 TS[1].0
LD R1 TS[1].1
LD R2 TS[1].2
LD R3 TS[1].3
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,22 +0,0 @@
; CUST_20
; Simple IOp to check PbsMl4
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- Src[0][0] +1
; * Dst[0][2] <- Src[0][0] +2
; * Dst[0][3] <- Src[0][0] +3
; i.e Cust_20(0x0) => 0xe4
SUB R16 R16 R16
ST TD[0].0 R0
ST TD[0].1 R0
ST TD[0].2 R0
ST TD[0].3 R0
; Apply PbsMl4 on Src[0] result goes in dest[0][0-3]
LD R0 TS[0].0
PBS_ML4_F R0 R0 PbsTestMany4
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,24 +0,0 @@
; CUST_21
; Simple IOp to check PbsMl8
; WARN: This operation required 16b ct width
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- Src[0][0] +1
; * Dst[0][2] <- Src[0][0] +2
; * Dst[0][3] <- Src[0][0] +3
; * Dst[0][4] <- Src[0][0] +4
; * Dst[0][5] <- Src[0][0] +5
; * Dst[0][6] <- Src[0][0] +6
; * Dst[0][7] <- Src[0][0] +7
; Apply PbsMl8 on Src[0] result goes in dest[0][0-7]
LD R0 TS[0].0
PBS_ML8_F R0 R0 PbsTestMany8
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3
ST TD[0].4 R4
ST TD[0].5 R5
ST TD[0].6 R6
ST TD[0].7 R7

View File

@@ -1,16 +0,0 @@
# CUST_3
# Simple IOp to check isc behavior
# Generate obvious deps and check that isc correctly issued the dop
# Correct result must bu Dest <- Src[0]
LD R0 TS[0].0
LD R1 TS[0].1
LD R2 TS[0].2
LD R3 TS[0].3
PBS R4 R0 PbsNone
ST TD[0].0 R4
PBS R4 R1 PbsNone
ST TD[0].1 R4
PBS R4 R2 PbsNone
ST TD[0].2 R4
PBS_F R4 R3 PbsNone
ST TD[0].3 R4

View File

@@ -1,19 +0,0 @@
; CUST_8
; Simple IOp to check the ALU operation
; Dst[0].0 <- Src[0].0 + Src[1].0
LD R1 TS[0].0
LD R2 TS[1].0
ADD R0 R1 R2
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 - Src[1].1
LD R5 TS[0].1
LD R6 TS[1].1
SUB R4 R5 R6
ST TD[0].1 R4
; Dst[0].2 <- Src[0].2 + (Src[1].2 *4)
LD R9 TS[0].2
LD R10 TS[1].2
MAC R8 R9 R10 4
ST TD[0].2 R8

View File

@@ -1,21 +0,0 @@
; CUST_9
; Simple IOp to check the ALU Scalar operation
; Dst[0].0 <- Src[0].0 + Imm[0].0
LD R1 TS[0].0
ADDS R0 R1 TI[0].0
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 - Imm[0].1
LD R5 TS[0].1
SUBS R4 R5 TI[0].1
ST TD[0].1 R4
; Dst[0].2 <- Imm[0].2 - Src[0].2
LD R9 TS[0].2
SSUB R8 R9 TI[0].2
ST TD[0].2 R8
; Dst[0].3 <- Src[0].3 * Imm[0].3
LD R13 TS[0].3
MULS R12 R13 TI[0].3
ST TD[0].3 R12

View File

@@ -1,108 +0,0 @@
[fpga]
regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml",
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/tb_hpu_regif_dummy.toml"]
polling_us=100000
[fpga.ffi.Sim]
ipc_name="/tmp/${USER}/hpu_mockup_ipc"
[rtl]
bpip_use = true
bpip_use_opportunism = true
bpip_timeout = 100_000
[board]
ct_mem = 32768
ct_pc = [
{Hbm= {pc=32}},
{Hbm= {pc=33}},
]
heap_size = 16384
lut_mem = 256
lut_pc = {Hbm={pc=34}}
fw_size= 16777215 # i.e. 16 MiB
fw_pc = {Ddr= {offset= 0x3900_0000}} # NB: Allocation must take place in the Discret DDR
bsk_pc = [
{Hbm={pc=8}},
{Hbm={pc=12}},
{Hbm={pc=24}},
{Hbm={pc=28}},
{Hbm={pc=40}},
{Hbm={pc=44}},
{Hbm={pc=56}},
{Hbm={pc=60}}
]
ksk_pc = [
{Hbm={pc=0}},
{Hbm={pc=1}},
{Hbm={pc=2}},
{Hbm={pc=3}},
{Hbm={pc=4}},
{Hbm={pc=5}},
{Hbm={pc=6}},
{Hbm={pc=7}},
{Hbm={pc=16}},
{Hbm={pc=17}},
{Hbm={pc=18}},
{Hbm={pc=19}},
{Hbm={pc=20}},
{Hbm={pc=21}},
{Hbm={pc=22}},
{Hbm={pc=23}}
]
trace_pc = {Hbm={pc=35}}
trace_depth = 32 # In MB
[firmware]
implementation = "Llt"
integer_w=[2,4,6,8,10,12,14,16,32,64,128]
min_batch_size = 11
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
custom_iop.'IOP[2]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm"
custom_iop.'IOP[3]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm"
custom_iop.'IOP[8]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm"
custom_iop.'IOP[9]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm"
custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm"
custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm"
custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm"
custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm"
custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm"
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
fill_batch_fifo = true
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.MUL]
fill_batch_fifo = false
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.MULS]
fill_batch_fifo = false
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.ERC_20]
fill_batch_fifo = true
min_batch_size = false
use_tiers = true
flush_behaviour = "Patient"
flush = true

View File

@@ -1,256 +0,0 @@
module_name="hpu_regif_core_cfg_1in3"
description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
word_size_b = 32
offset = 0x00
range = 0x10000
ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
# =====================================================================================================================
[section.entry_cfg_1in3]
description="entry_cfg_1in3 section with known value used for debug."
offset= 0x0
[section.entry_cfg_1in3.register.dummy_val0]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x01010101}
[section.entry_cfg_1in3.register.dummy_val1]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x11111111}
[section.entry_cfg_1in3.register.dummy_val2]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x21212121}
[section.entry_cfg_1in3.register.dummy_val3]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x31313131}
# =====================================================================================================================
[section.info]
description="RTL architecture parameters"
offset= 0x10
[section.info.register.version]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="VERSION"}
[section.info.register.ntt_architecture]
description="NTT architecture"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="NTT_CORE_ARCH"}
[section.info.register.ntt_structure]
description="NTT structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.radix = { size_b=8, offset_b=0 , default={Param="R"}, description="NTT radix"}
field.psi = { size_b=8, offset_b=8 , default={Param="PSI"}, description="NTT psi"}
field.div = { size_b=8, offset_b=16, default={Param="BWD_PSI_DIV"}, description="NTT backward div"}
field.delta = { size_b=8, offset_b=24, default={Param="DELTA"}, description="NTT network delta (for wmm arch)"}
[section.info.register.ntt_rdx_cut]
description="NTT radix cuts, in log2 unit (for gf64 arch)"
owner="Parameter"
read_access="Read"
write_access="None"
field.radix_cut0 = { size_b=4, offset_b=0 , default={Param="NTT_RDX_CUT_S_0"}, description="NTT radix cut #0"}
field.radix_cut1 = { size_b=4, offset_b=4 , default={Param="NTT_RDX_CUT_S_1"}, description="NTT radix cut #1"}
field.radix_cut2 = { size_b=4, offset_b=8 , default={Param="NTT_RDX_CUT_S_2"}, description="NTT radix cut #2"}
field.radix_cut3 = { size_b=4, offset_b=12, default={Param="NTT_RDX_CUT_S_3"}, description="NTT radix cut #3"}
field.radix_cut4 = { size_b=4, offset_b=16, default={Param="NTT_RDX_CUT_S_4"}, description="NTT radix cut #4"}
field.radix_cut5 = { size_b=4, offset_b=20, default={Param="NTT_RDX_CUT_S_5"}, description="NTT radix cut #5"}
field.radix_cut6 = { size_b=4, offset_b=24, default={Param="NTT_RDX_CUT_S_6"}, description="NTT radix cut #6"}
field.radix_cut7 = { size_b=4, offset_b=28, default={Param="NTT_RDX_CUT_S_7"}, description="NTT radix cut #7"}
[section.info.register.ntt_pbs]
description="Maximum number of PBS in the NTT pipeline"
owner="Parameter"
read_access="Read"
write_access="None"
field.batch_pbs_nb = { size_b=8, offset_b=0 , default={Param="BATCH_PBS_NB"}, description="Maximum number of PBS in the NTT pipe"}
field.total_pbs_nb = { size_b=8, offset_b=8 , default={Param="TOTAL_PBS_NB"}, description="Maximum number of PBS stored in PEP buffer"}
[section.info.register.ntt_modulo]
description="Code associated to the NTT prime"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="MOD_NTT_NAME"}
[section.info.register.application]
description="Code associated with the application"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="APPLICATION_NAME"}
[section.info.register.ks_structure]
description="Key-switch structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.x = { size_b=8, offset_b=0 , default={Param="LBX"}, description="Number of coefficients on X dimension"}
field.y = { size_b=8, offset_b=8 , default={Param="LBY"}, description="Number of coefficients on Y dimension"}
field.z = { size_b=8, offset_b=16, default={Param="LBZ"}, description="Number of coefficients on Z dimension"}
[section.info.register.ks_crypto_param]
description="Key-switch crypto parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.mod_ksk_w = { size_b=8, offset_b=0 , default={Param="MOD_KSK_W"}, description="Width of KSK modulo"}
field.ks_l = { size_b=8, offset_b=8 , default={Param="KS_L"}, description="Number of KS decomposition level"}
field.ks_b = { size_b=8, offset_b=16, default={Param="KS_B_W"}, description="Width of KS decomposition base"}
[section.info.register.regf_structure]
description="Register file structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.reg_nb = { size_b=8, offset_b=0 , default={Param="REGF_REG_NB"}, description="Number of registers in regfile"}
field.coef_nb = { size_b=8, offset_b=8 , default={Param="REGF_COEF_NB"}, description="Number of coefficients at regfile interface"}
[section.info.register.isc_structure]
description="Instruction scheduler structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.depth = { size_b=8, offset_b=0 , default={Param="ISC_DEPTH"}, description="Number of slots in ISC lookahead buffer."}
field.min_iop_size = { size_b=8, offset_b=8 , default={Param="MIN_IOP_SIZE"}, description="Minimum number of DOp per IOp to prevent sync_id overflow."}
[section.info.register.pe_properties]
description="Processing elements parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.alu_nb = { size_b=8, offset_b=24 , default={Param="PEA_ALU_NB"}, description="Number of coefficients processed in parallel in pe_alu"}
field.pep_regf_period = { size_b=8, offset_b=16 , default={Param="PEP_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEP and regfile"}
field.pem_regf_period = { size_b=8, offset_b=8 , default={Param="PEM_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEM and regfile"}
field.pea_regf_period = { size_b=8, offset_b=0 , default={Param="PEA_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEA and regfile"}
[section.info.register.bsk_structure]
description="BSK manager structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.bsk_cut_nb = { size_b=8, offset_b=8 , default={Param="BSK_CUT_NB"}, description="BSK cut nb"}
[section.info.register.ksk_structure]
description="KSK manager structure parameters"
owner="Parameter"
read_access="Read"
write_access="None"
field.ksk_cut_nb = { size_b=8, offset_b=8 , default={Param="KSK_CUT_NB"}, description="KSK cut nb"}
[section.info.register.hbm_axi4_nb]
description="Number of AXI4 connections to HBM"
owner="Parameter"
read_access="Read"
write_access="None"
field.bsk_pc = { size_b=8, offset_b=0 , default={Param="BSK_PC"}, description="Number of HBM connections for BSK"}
field.ksk_pc = { size_b=8, offset_b=8, default={Param="KSK_PC"}, description="Number of HBM connections for KSK"}
field.pem_pc = { size_b=8, offset_b=16, default={Param="PEM_PC"}, description="Number of HBM connections for ciphertexts (PEM)"}
field.glwe_pc = { size_b=8, offset_b=24, default={Param="GLWE_PC"}, description="Number of HBM connections for GLWE"}
[section.info.register.hbm_axi4_dataw_pem]
description="Ciphertext HBM AXI4 connection data width"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="AXI4_PEM_DATA_W"}
[section.info.register.hbm_axi4_dataw_glwe]
description="GLWE HBM AXI4 connection data width"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="AXI4_GLWE_DATA_W"}
[section.info.register.hbm_axi4_dataw_bsk]
description="BSK HBM AXI4 connection data width"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="AXI4_BSK_DATA_W"}
[section.info.register.hbm_axi4_dataw_ksk]
description="KSK HBM AXI4 connection data width"
owner="Parameter"
read_access="Read"
write_access="None"
default={Param="AXI4_KSK_DATA_W"}
# =====================================================================================================================
[section.hbm_axi4_addr_1in3]
offset= 0x1000
description="HBM AXI4 connection address offset"
[section.hbm_axi4_addr_1in3.register.ct]
description="Address offset for each ciphertext HBM AXI4 connection"
owner="User"
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb","_pc1_lsb", "_pc1_msb"]
[section.hbm_axi4_addr_1in3.register.glwe]
description="Address offset for each GLWE HBM AXI4 connection"
owner="User"
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb"]
[section.hbm_axi4_addr_1in3.register.ksk]
description="Address offset for each KSK HBM AXI4 connection"
owner="User"
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]
[section.hbm_axi4_addr_1in3.register.trc]
description="Address offset for each trace HBM AXI4 connection"
owner="User"
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb"]
# =====================================================================================================================
[section.bpip]
offset= 0x2000
description="BPIP configuration"
[section.bpip.register.use]
description="(1) Use BPIP mode, (0) use IPIP mode (default)"
owner="User"
read_access="Read"
write_access="Write"
field.use_bpip = { size_b=1, offset_b=0 , default={Cst=1}, description="use"}
field.use_opportunism = { size_b=1, offset_b=1 , default={Cst=0}, description="use opportunistic PBS flush"}
[section.bpip.register.timeout]
description="Timeout for BPIP mode"
owner="User"
read_access="Read"
write_access="Write"
default={Cst=0xffffffff}

View File

@@ -1,51 +0,0 @@
module_name="hpu_regif_core_cfg_3in3"
description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
word_size_b = 32
offset = 0x20000
range = 0x10000
ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
# =====================================================================================================================
[section.entry_cfg_3in3]
description="entry_cfg_3in3 section with known value used for debug."
offset= 0x0
[section.entry_cfg_3in3.register.dummy_val0]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x03030303}
[section.entry_cfg_3in3.register.dummy_val1]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x13131313}
[section.entry_cfg_3in3.register.dummy_val2]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x23232323}
[section.entry_cfg_3in3.register.dummy_val3]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x33333333}
# =====================================================================================================================
[section.hbm_axi4_addr_3in3]
description="HBM AXI4 connection address offset"
offset= 0x10
[section.hbm_axi4_addr_3in3.register.bsk]
description="Address offset for each BSK HBM AXI4 connection"
owner="User"
read_access="Read"
write_access="Write"
duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"]

View File

@@ -1,336 +0,0 @@
module_name="hpu_regif_core_prc_1in3"
description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
word_size_b = 32
offset = 0x10000
range = 0x10000
ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
# =====================================================================================================================
[section.entry_prc_1in3]
description="entry_prc_1in3 section with known value used for debug."
offset= 0x0
[section.entry_prc_1in3.register.dummy_val0]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x02020202}
[section.entry_prc_1in3.register.dummy_val1]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x12121212}
[section.entry_prc_1in3.register.dummy_val2]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x22222222}
[section.entry_prc_1in3.register.dummy_val3]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x32323232}
# =====================================================================================================================
[section.status_1in3]
description="HPU status of part 1in3"
offset= 0x10
[section.status_1in3.register.error]
description="Error register (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 1in3"}
# =====================================================================================================================
[section.ksk_avail]
description="KSK availability configuration"
offset= 0x1000
[section.ksk_avail.register.avail]
description="KSK available bit"
owner="User"
read_access="Read"
write_access="Write"
field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"}
[section.ksk_avail.register.reset]
description="KSK reset sequence"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
# =====================================================================================================================
[section.runtime_1in3]
description="Runtime information"
offset= 0x2000
[section.runtime_1in3.register.pep_cmux_loop]
description="PEP: CMUX iteration loop number"
owner="Kernel"
read_access="Read"
write_access="None"
field.br_loop = { size_b=15, offset_b=0 , default={Cst=0}, description="PBS current BR-loop"}
field.br_loop_c = { size_b=1, offset_b=15 , default={Cst=0}, description="PBS current BR-loop parity"}
field.ks_loop = { size_b=15, offset_b=16 , default={Cst=0}, description="KS current KS-loop"}
field.ks_loop_c = { size_b=1, offset_b=31 , default={Cst=0}, description="KS current KS-loop parity"}
[section.runtime_1in3.register.pep_pointer_0]
description="PEP: pointers (part 1)"
owner="Kernel"
read_access="Read"
write_access="None"
field.pool_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pool_rp"}
field.pool_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pool_wp"}
field.ldg_pt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ldg_pt"}
field.ldb_pt = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ldb_pt"}
[section.runtime_1in3.register.pep_pointer_1]
description="PEP: pointers (part 2)"
owner="Kernel"
read_access="Read"
write_access="None"
field.ks_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP ks_in_rp"}
field.ks_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP ks_in_wp"}
field.ks_out_rp = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ks_out_rp"}
field.ks_out_wp = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ks_out_wp"}
[section.runtime_1in3.register.pep_pointer_2]
description="PEP: pointers (part 3)"
owner="Kernel"
read_access="Read"
write_access="None"
field.pbs_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pbs_in_rp"}
field.pbs_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pbs_in_wp"}
field.ipip_flush_last_pbs_in_loop = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP IPIP flush last pbs_in_loop"}
[section.runtime_1in3.register.isc_latest_instruction]
description="ISC: 4 latest instructions received ([0] is the most recent)"
owner="Kernel"
read_access="Read"
write_access="None"
duplicate=["_0","_1","_2","_3"]
[section.runtime_1in3.register.pep_seq_bpip_batch_cnt]
description="PEP: BPIP batch counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_bpip_batch_flush_cnt]
description="PEP: BPIP batch triggered by a flush counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_bpip_batch_timeout_cnt]
description="PEP: BPIP batch triggered by a timeout counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_bpip_waiting_batch_cnt]
description="PEP: BPIP batch that waits the trigger counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_bpip_batch_filling_cnt]
description="PEP: Count batch with filled with a given number of CT (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
duplicate=["_1","_2","_3","_4","_5","_6","_7","_8","_9","_10","_11","_12","_13","_14","_15","_16"]
[section.runtime_1in3.register.pep_seq_ld_ack_cnt]
description="PEP: load BLWE ack counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_cmux_not_full_batch_cnt]
description="PEP: not full batch CMUX counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_seq_ipip_flush_cnt]
description="PEP: IPIP flush CMUX counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_ldb_rcp_dur]
description="PEP: load BLWE reception max duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_ldg_req_dur]
description="PEP: load GLWE request max duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_ldg_rcp_dur]
description="PEP: load GLWE reception max duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_load_ksk_rcp_dur]
description="PEP: load KSK slice reception max duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"]
[section.runtime_1in3.register.pep_mmacc_sxt_rcp_dur]
description="PEP: MMACC SXT reception duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_mmacc_sxt_req_dur]
description="PEP: MMACC SXT request duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_mmacc_sxt_cmd_wait_b_dur]
description="PEP: MMACC SXT command wait for b duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_inst_cnt]
description="PEP: input instruction counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pep_ack_cnt]
description="PEP: instruction acknowledge counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pem_load_inst_cnt]
description="PEM: load input instruction counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pem_load_ack_cnt]
description="PEM: load instruction acknowledge counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pem_store_inst_cnt]
description="PEM: store input instruction counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pem_store_ack_cnt]
description="PEM: store instruction acknowledge counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pea_inst_cnt]
description="PEA: input instruction counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pea_ack_cnt]
description="PEA: instruction acknowledge counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.isc_inst_cnt]
description="ISC: input instruction counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.isc_ack_cnt]
description="ISC: instruction acknowledge counter (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.runtime_1in3.register.pem_load_info_0]
description="PEM: load first data)"
owner="Kernel"
read_access="Read"
write_access="None"
duplicate=["_pc0_0","_pc0_1","_pc0_2","_pc0_3","_pc1_0","_pc1_1","_pc1_2","_pc1_3"]
[section.runtime_1in3.register.pem_load_info_1]
description="PEM: load first address"
owner="Kernel"
read_access="Read"
write_access="None"
duplicate=["_pc0_lsb","_pc0_msb","_pc1_lsb","_pc1_msb"]
[section.runtime_1in3.register.pem_store_info_0]
description="PEM: store info 0)"
owner="Kernel"
read_access="Read"
write_access="None"
field.cmd_vld = { size_b=1, offset_b=0 , default={Cst=0}, description="PEM_ST cmd vld"}
field.cmd_rdy = { size_b=1, offset_b=1 , default={Cst=0}, description="PEM_ST cmd rdy"}
field.pem_regf_rd_req_vld = { size_b=1, offset_b=2 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_vld"}
field.pem_regf_rd_req_rdy = { size_b=1, offset_b=3 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_rdy"}
field.brsp_fifo_in_vld = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST brsp_fifo_in_vld"}
field.brsp_fifo_in_rdy = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST brsp_fifo_in_rdy"}
field.rcp_fifo_in_vld = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST rcp_fifo_in_vld"}
field.rcp_fifo_in_rdy = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST rcp_fifo_in_rdy"}
field.r2_axi_vld = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST r2_axi_vld"}
field.r2_axi_rdy = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST r2_axi_rdy"}
field.c0_enough_location = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST c0_enough_location"}
[section.runtime_1in3.register.pem_store_info_1]
description="PEM: store info 1"
owner="Kernel"
read_access="Read"
write_access="None"
field.s0_cmd_vld = { size_b=4, offset_b=0 , default={Cst=0}, description="PEM_ST s0_cmd_vld"}
field.s0_cmd_rdy = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST s0_cmd_rdy"}
field.m_axi_bvalid = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST m_axi_bvalid"}
field.m_axi_bready = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST m_axi_bready"}
field.m_axi_wvalid = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST m_axi_wvalid"}
field.m_axi_wready = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST m_axi_wready"}
field.m_axi_awvalid = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST m_axi_awvalid"}
field.m_axi_awready = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST m_axi_awready"}
[section.runtime_1in3.register.pem_store_info_2]
description="PEM: store info 2"
owner="Kernel"
read_access="Read"
write_access="None"
field.c0_free_loc_cnt = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST c0_free_loc_cnt"}
field.brsp_bresp_cnt = { size_b=16, offset_b=16 , default={Cst=0}, description="PEM_ST brsp_bresp_cnt"}
[section.runtime_1in3.register.pem_store_info_3]
description="PEM: store info 3"
owner="Kernel"
read_access="Read"
write_access="None"
field.brsp_ack_seen = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST brsp_ack_seen"}
field.c0_cmd_cnt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEM_ST c0_cmd_cnt"}

View File

@@ -1,100 +0,0 @@
module_name="hpu_regif_core_prc_3in3"
description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it."
word_size_b = 32
offset = 0x30000
range = 0x10000
ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
# =====================================================================================================================
[section.entry_prc_3in3]
description="entry_prc_3in3 section with known value used for debug."
offset= 0x0
[section.entry_prc_3in3.register.dummy_val0]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x04040404}
[section.entry_prc_3in3.register.dummy_val1]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x14141414}
[section.entry_prc_3in3.register.dummy_val2]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x24242424}
[section.entry_prc_3in3.register.dummy_val3]
description="RTL version"
owner="Parameter"
read_access="Read"
write_access="None"
default={Cst=0x34343434}
# =====================================================================================================================
[section.status_3in3]
description="HPU status of parts 2in3 and 3in3"
offset= 0x10
[section.status_3in3.register.error]
description="Error register (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 3in3"}
# =====================================================================================================================
[section.bsk_avail]
description="BSK availability configuration"
offset= 0x1000
[section.bsk_avail.register.avail]
description="BSK available bit"
owner="User"
read_access="Read"
write_access="Write"
field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"}
[section.bsk_avail.register.reset]
description="BSK reset sequence"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"}
field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"}
# =====================================================================================================================
[section.runtime_3in3]
description="Runtime information"
offset= 0x2000
[section.runtime_3in3.register.pep_load_bsk_rcp_dur]
description="PEP: load BSK slice reception max duration (Could be reset by user)"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"]
[section.runtime_3in3.register.pep_bskif_req_info_0]
description="PEP: BSK_IF: requester info 0"
owner="Kernel"
read_access="Read"
write_access="None"
field.req_br_loop_rp = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK read pointer"}
field.req_br_loop_wp = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK write pointer"}
[section.runtime_3in3.register.pep_bskif_req_info_1]
description="PEP: BSK_IF: requester info 0"
owner="Kernel"
read_access="Read"
write_access="None"
field.req_prf_br_loop = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK prefetch pointer"}
field.req_parity = { size_b=1, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK pointer parity"}
field.req_assigned = { size_b=1, offset_b=31 , default={Cst=0}, description="PEP BSK_IF requester assignment"}

View File

@@ -1,22 +0,0 @@
module_name="tb_hpu_regif_dummy"
description="Fake registers needed by the mockup"
word_size_b = 32
offset = 0x40000
range = 0x10000
ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"]
# ==============================================================================
[section.WorkAck]
description="Purpose of this section"
[section.WorkAck.register.workq]
description="Insert work in workq and read status"
owner="Kernel"
read_access="Read"
write_access="WriteNotify"
[section.WorkAck.register.ackq]
description="Pop ack from in ackq"
owner="Kernel"
read_access="ReadNotify"
write_access="None"

View File

@@ -1,6 +0,0 @@
# Fpga version
Built with the following command: (i.e. xrt/run_syn_hpu_msplit_3parts_64b.sh)
```
just zaxl-build hpu_msplit_3parts 3 "0:300" "-F TOP_MSPLIT TOP_MSPLIT_1 -F TOP_BATCH TOP_BATCH_TOPhpu_BPBS8_TPBS32 -F TOP_PCMAX TOP_PCMAX_pem2_glwe1_bsk8_ksk8 -F TOP_PC TOP_PC_pem2_glwe1_bsk4_ksk4 -F APPLICATION APPLI_msg2_carry2 -F NTT_MOD NTT_MOD_goldilocks -F NTT_CORE_ARCH NTT_CORE_ARCH_gf64 -F NTT_CORE_R_PSI NTT_CORE_R2_PSI16 -F NTT_CORE_RDX_CUT NTT_CORE_RDX_CUT_n5c5c1 -F NTT_CORE_DIV NTT_CORE_DIV_1 -F BSK_SLOT_CUT BSK_SLOT8_CUT4 -F KSK_SLOT_CUT KSK_SLOT8_CUT4 -F KSLB KSLB_x2y32z3 -F HPU_PART HPU_PART_gf64 -F AXI_DATA_W AXI_DATA_W_512" "1:${PROJECT_DIR}/hw/output/micro_code/ucore_fw.elf" 'D:MEMORY_FILE_PATH=\\\"${PROJECT_DIR}/hw/\\\"' | tee build_out.log
```

View File

@@ -1,15 +0,0 @@
# CUST_0
# Simple IOp to check the xfer between Hpu/Cpu
# Construct constant in dest slot -> 249 (0xf9)
SUB R0 R0 R0
ADDS R0 R0 1
ST TD[0].0 R0
SUB R1 R1 R1
ADDS R1 R1 2
ST TD[0].1 R1
SUB R2 R2 R2
ADDS R2 R2 3
ST TD[0].2 R2
SUB R3 R3 R3
ADDS R3 R3 3
ST TD[0].3 R3

View File

@@ -1,11 +0,0 @@
# CUST_1
# Simple IOp to check the xfer between Hpu/Cpu
# Dest <- Src_a
LD R0 TS[0].0
LD R1 TS[0].1
LD R2 TS[0].2
LD R3 TS[0].3
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,25 +0,0 @@
; CUST_8
; Simple IOp to check the ALU operation
; Dst[0].0 <- Src[0].0 + Src[1].0
LD R1 TS[0].0
LD R2 TS[1].0
ADD R0 R1 R2
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 + Src[1].1
LD R5 TS[0].1
LD R6 TS[1].1
ADD R4 R5 R6
ST TD[0].2 R4
; Dst[0].2 <- Src[0].2 + Src[1].2
LD R9 TS[0].2
LD R10 TS[1].2
ADD R8 R9 R10
ST TD[0].2 R8
; Dst[0].3 <- Src[0].3 + Src[1].3
LD R13 TS[0].3
LD R14 TS[1].3
ADD R12 R13 R14
ST TD[0].3 R0

View File

@@ -1,6 +0,0 @@
# CUST_16
# Simple IOp to check PBS behavior
# Dest <- PBSNone(Src_a.0)
LD R0 TS[0].0
PBS_F R0 R0 PbsNone
ST TD[0].0 R0

View File

@@ -1,15 +0,0 @@
# CUST_17
# Simple IOp to check PBS behavior
# Dest <- PBSNone(Src_a)
LD R0 TS[0].0
PBS R0 R0 PbsNone
ST TD[0].0 R0
LD R1 TS[0].1
PBS R1 R1 PbsNone
ST TD[0].1 R1
LD R2 TS[0].2
PBS R2 R2 PbsNone
ST TD[0].2 R2
LD R3 TS[0].3
PBS_F R3 R3 PbsNone
ST TD[0].3 R3

View File

@@ -1,23 +0,0 @@
; CUST_18
; Simple IOp to check extraction pattern
; Correct result:
; * Dst[0,1] <- Src[0][0,1]
; * Dst[2,3] <- Src[1][0,1]
; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1]
LD R0 TS[0].0
LD R1 TS[0].1
MAC R3 R1 R0 4
PBS R4 R3 PbsMsgOnly
PBS R5 R3 PbsCarryInMsg
ST TD[0].0 R4
ST TD[0].1 R5
; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3]
LD R10 TS[1].0
LD R11 TS[1].1
MAC R13 R11 R10 4
PBS R14 R13 PbsMsgOnly
PBS R15 R13 PbsCarryInMsg
ST TD[0].2 R14
ST TD[0].3 R15

View File

@@ -1,19 +0,0 @@
; CUST_19
; Simple IOp to check PbsMl2
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- 0
; * Dst[0][2] <- Src[0][0] +1
; * Dst[0][3] <- 0
; i.e Cust_19(0x2) => 0x32
; Construct a 0 for destination padding
SUB R16 R16 R16
; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded)
LD R0 TS[0].0
PBS_ML2_F R0 R0 PbsTestMany2
ST TD[0].0 R0
ST TD[0].1 R16
ST TD[0].2 R1
ST TD[0].3 R16

View File

@@ -1,11 +0,0 @@
# CUST_2
# Simple IOp to check the xfer between Hpu/Cpu
# Dest <- Src_b
LD R0 TS[1].0
LD R1 TS[1].1
LD R2 TS[1].2
LD R3 TS[1].3
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,22 +0,0 @@
; CUST_20
; Simple IOp to check PbsMl4
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- Src[0][0] +1
; * Dst[0][2] <- Src[0][0] +2
; * Dst[0][3] <- Src[0][0] +3
; i.e Cust_20(0x0) => 0xe4
SUB R16 R16 R16
ST TD[0].0 R0
ST TD[0].1 R0
ST TD[0].2 R0
ST TD[0].3 R0
; Apply PbsMl4 on Src[0] result goes in dest[0][0-3]
LD R0 TS[0].0
PBS_ML4_F R0 R0 PbsTestMany4
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3

View File

@@ -1,24 +0,0 @@
; CUST_21
; Simple IOp to check PbsMl8
; WARN: This operation required 16b ct width
; Correct result:
; * Dst[0][0] <- Src[0][0]
; * Dst[0][1] <- Src[0][0] +1
; * Dst[0][2] <- Src[0][0] +2
; * Dst[0][3] <- Src[0][0] +3
; * Dst[0][4] <- Src[0][0] +4
; * Dst[0][5] <- Src[0][0] +5
; * Dst[0][6] <- Src[0][0] +6
; * Dst[0][7] <- Src[0][0] +7
; Apply PbsMl8 on Src[0] result goes in dest[0][0-7]
LD R0 TS[0].0
PBS_ML8_F R0 R0 PbsTestMany8
ST TD[0].0 R0
ST TD[0].1 R1
ST TD[0].2 R2
ST TD[0].3 R3
ST TD[0].4 R4
ST TD[0].5 R5
ST TD[0].6 R6
ST TD[0].7 R7

View File

@@ -1,16 +0,0 @@
# CUST_3
# Simple IOp to check isc behavior
# Generate obvious deps and check that isc correctly issued the dop
# Correct result must bu Dest <- Src[0]
LD R0 TS[0].0
LD R1 TS[0].1
LD R2 TS[0].2
LD R3 TS[0].3
PBS R4 R0 PbsNone
ST TD[0].0 R4
PBS R4 R1 PbsNone
ST TD[0].1 R4
PBS R4 R2 PbsNone
ST TD[0].2 R4
PBS_F R4 R3 PbsNone
ST TD[0].3 R4

View File

@@ -1,19 +0,0 @@
; CUST_8
; Simple IOp to check the ALU operation
; Dst[0].0 <- Src[0].0 + Src[1].0
LD R1 TS[0].0
LD R2 TS[1].0
ADD R0 R1 R2
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 - Src[1].1
LD R5 TS[0].1
LD R6 TS[1].1
SUB R4 R5 R6
ST TD[0].1 R4
; Dst[0].2 <- Src[0].2 + (Src[1].2 *4)
LD R9 TS[0].2
LD R10 TS[1].2
MAC R8 R9 R10 4
ST TD[0].2 R8

View File

@@ -1,21 +0,0 @@
; CUST_9
; Simple IOp to check the ALU Scalar operation
; Dst[0].0 <- Src[0].0 + Imm[0].0
LD R1 TS[0].0
ADDS R0 R1 TI[0].0
ST TD[0].0 R0
; Dst[0].1 <- Src[0].1 - Imm[0].1
LD R5 TS[0].1
SUBS R4 R5 TI[0].1
ST TD[0].1 R4
; Dst[0].2 <- Imm[0].2 - Src[0].2
LD R9 TS[0].2
SSUB R8 R9 TI[0].2
ST TD[0].2 R8
; Dst[0].3 <- Src[0].3 * Imm[0].3
LD R13 TS[0].3
MULS R12 R13 TI[0].3
ST TD[0].3 R12

View File

@@ -1,98 +0,0 @@
[fpga]
regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core.toml"]
polling_us=10
[fpga.ffi.Xrt]
id= 0
kernel= "hpu_msplit_3parts_1in3"
xclbin="${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_msplit_3parts.xclbin"
[rtl]
bpip_use = true
bpip_use_opportunism = true
bpip_timeout = 100_000
[board]
ct_mem = 4096
ct_pc = [
{Hbm= {pc=10}},
{Hbm= {pc=11}},
]
heap_size = 3584
lut_mem = 256
lut_pc = {Hbm={pc=12}}
fw_size= 65536
fw_pc = {Hbm={pc=1}}
bsk_pc = [
{Hbm={pc=2}},
{Hbm={pc=3}},
{Hbm={pc=4}},
{Hbm={pc=5}},
{Hbm={pc=6}},
{Hbm={pc=7}},
{Hbm={pc=8}},
{Hbm={pc=9}}
]
ksk_pc = [
{Hbm={pc=24}},
{Hbm={pc=25}},
{Hbm={pc=26}},
{Hbm={pc=27}},
{Hbm={pc=28}},
{Hbm={pc=29}},
{Hbm={pc=30}},
{Hbm={pc=31}}
]
trace_pc = {Hbm={pc=0}}
trace_depth = 4 # In MB
[firmware]
implementation = "Llt"
integer_w=[4,6,8,10,12,14,16,32,64,128]
min_batch_size = 6
kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml"
custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm"
custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm"
custom_iop.'IOP[2]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm"
custom_iop.'IOP[3]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm"
custom_iop.'IOP[8]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm"
custom_iop.'IOP[9]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm"
custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm"
custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm"
custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm"
custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm"
custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm"
custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm"
[firmware.op_cfg.default]
fill_batch_fifo = true
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.MUL]
fill_batch_fifo = false
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.MULS]
fill_batch_fifo = false
min_batch_size = false
use_tiers = false
flush_behaviour = "Patient"
flush = true
[firmware.op_cfg.by_op.ERC_20]
fill_batch_fifo = false
min_batch_size = true
use_tiers = true
flush_behaviour = "Patient"
flush = true

View File

@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35ad67cf9760e37256a6c92cf29ea67334690b724fd3b7b859919ee9b0bde6d3
size 78194785

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More