diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 7a9086cb0..584f41649 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -6,6 +6,7 @@ self-hosted-runner: - large_windows_16_latest - large_ubuntu_16 - large_ubuntu_16-22.04 + - v80-desktop # Configuration variables in array of strings defined in your repository or # organization. `null` means disabling configuration variables check. # Empty array means no configuration variable is allowed. diff --git a/.github/workflows/benchmark_hpu_integer.yml b/.github/workflows/benchmark_hpu_integer.yml new file mode 100644 index 000000000..fb12c3ff4 --- /dev/null +++ b/.github/workflows/benchmark_hpu_integer.yml @@ -0,0 +1,92 @@ +# Run all integer benchmarks on a permanent HPU instance and return parsed results to Slab CI bot. +name: Hpu Integer Benchmarks + +on: + workflow_dispatch: + +env: + CARGO_TERM_COLOR: always + RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json + ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + RUST_BACKTRACE: "full" + RUST_MIN_STACK: "8388608" + +permissions: {} + +jobs: + integer-benchmarks-hpu: + name: Execute integer & erc20 benchmarks for HPU backend + runs-on: v80-desktop + concurrency: + group: ${{ github.workflow }}_${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + timeout-minutes: 1440 # 24 hours + steps: + # Needed as long as hw_regmap repository is private + - name: Configure SSH + uses: webfactory/ssh-agent@a6f90b1f127823b31d4d4a8d96047790581349bd # v0.9.1 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + + - name: Checkout tfhe-rs repo with tags + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + fetch-depth: 0 + persist-credentials: 'false' + token: ${{ secrets.REPO_CHECKOUT_TOKEN }} + + - name: Get benchmark details + run: | + { + echo "BENCH_DATE=$(date --iso-8601=seconds)"; + echo "COMMIT_DATE=$(git --no-pager show -s 
--format=%cd --date=iso8601-strict ${{ github.sha }})"; + echo "COMMIT_HASH=$(git describe --tags --dirty)"; + } >> "${GITHUB_ENV}" + + - name: Install rust + uses: dtolnay/rust-toolchain@a54c7afa936fefeb4456b2dd8068152669aa8203 + with: + toolchain: nightly + + - name: Checkout Slab repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + repository: zama-ai/slab + path: slab + persist-credentials: 'false' + token: ${{ secrets.REPO_CHECKOUT_TOKEN }} + + - name: Run benchmarks + run: | + make bench_integer_hpu + make bench_hlapi_erc20_hpu + + - name: Parse results + run: | + python3 ./ci/benchmark_parser.py target/criterion "${RESULTS_FILENAME}" \ + --database tfhe_rs \ + --hardware "hpu_x1" \ + --backend hpu \ + --project-version "${COMMIT_HASH}" \ + --branch "${REF_NAME}" \ + --commit-date "${COMMIT_DATE}" \ + --bench-date "${BENCH_DATE}" \ + --walk-subdirs + env: + REF_NAME: ${{ github.ref_name }} + + - name: Upload parsed results artifact + uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 + with: + name: ${{ github.sha }}_integer_benchmarks + path: ${{ env.RESULTS_FILENAME }} + + # Secrets are handed over through env so they are never expanded inside the script text + - name: Send data to Slab + shell: bash + run: | + python3 slab/scripts/data_sender.py "${RESULTS_FILENAME}" "${JOB_SECRET}" \ + --slab-url "${SLAB_URL}" + env: + JOB_SECRET: ${{ secrets.JOB_SECRET }} + SLAB_URL: ${{ secrets.SLAB_URL }} diff --git a/.github/workflows/cargo_build.yml b/.github/workflows/cargo_build.yml index 32edfe80b..d75c22bbf 100644 --- a/.github/workflows/cargo_build.yml +++ b/.github/workflows/cargo_build.yml @@ -94,5 +94,10 @@ jobs: run: | make build_tfhe_coverage + - name: Run Hpu pcc checks + if: ${{ contains(matrix.os, 'ubuntu') }} + run: | + make pcc_hpu + # The wasm build check is a bit annoying to set-up here and is done during the tests in # aws_tfhe_tests.yml diff --git a/.github/workflows/cargo_test_fft.yml b/.github/workflows/cargo_test_fft.yml index 8c25092dc..12edc0614 100644 --- a/.github/workflows/cargo_test_fft.yml +++ b/.github/workflows/cargo_test_fft.yml @@ 
-51,7 +51,7 @@ jobs: runs-on: ${{ matrix.runner_type }} strategy: matrix: - runner_type: [ubuntu-latest, macos-latest, windows-latest] + runner_type: [ ubuntu-latest, macos-latest, windows-latest ] fail-fast: false steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 @@ -82,7 +82,7 @@ jobs: runs-on: ${{ matrix.runner_type }} strategy: matrix: - runner_type: [ubuntu-latest, macos-latest, windows-latest] + runner_type: [ ubuntu-latest, macos-latest, windows-latest ] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: diff --git a/.github/workflows/cargo_test_ntt.yml b/.github/workflows/cargo_test_ntt.yml index f21f35f26..aceee9509 100644 --- a/.github/workflows/cargo_test_ntt.yml +++ b/.github/workflows/cargo_test_ntt.yml @@ -51,7 +51,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ ubuntu-latest, macos-latest, windows-latest ] fail-fast: false steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 @@ -77,7 +77,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, macos-latest, windows-latest] + os: [ ubuntu-latest, macos-latest, windows-latest ] steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 with: diff --git a/.github/workflows/hpu_hlapi_tests.yml b/.github/workflows/hpu_hlapi_tests.yml new file mode 100644 index 000000000..a4773f31f --- /dev/null +++ b/.github/workflows/hpu_hlapi_tests.yml @@ -0,0 +1,74 @@ +# Test the high-level API on the HPU backend, using the HPU mockup +name: Cargo Test HLAPI HPU + +on: + pull_request: + push: + branches: + - main + +env: + CARGO_TERM_COLOR: always + IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }} + CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }} + +# head_ref is empty on push events: fall back to the unique run_id so runs on main never cancel each other +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + + +permissions: { } + +jobs: + should-run: + runs-on: ubuntu-latest + permissions: + pull-requests: read + outputs: + 
hpu_test: ${{ env.IS_PULL_REQUEST == 'false' || steps.changed-files.outputs.hpu_any_changed }} + steps: + - name: Checkout tfhe-rs + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + fetch-depth: 0 + persist-credentials: 'false' + token: ${{ env.CHECKOUT_TOKEN }} + + - name: Check for file changes + id: changed-files + uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 + with: + files_yaml: | + hpu: + - tfhe/Cargo.toml + - Makefile + - backends/tfhe-hpu-backend/** + - mockups/tfhe-hpu-mockup/** + + cargo-tests-hpu: + needs: should-run + if: needs.should-run.outputs.hpu_test == 'true' + runs-on: large_ubuntu_16 + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + persist-credentials: 'false' + token: ${{ env.CHECKOUT_TOKEN }} + + - name: Install Rust + uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af + with: + toolchain: stable + override: true + + - name: Install Just + run: | + cargo install just + + - name: Test HLAPI HPU + run: | + source setup_hpu.sh + just -f mockups/tfhe-hpu-mockup/Justfile BUILD_PROFILE=release mockup & + make HPU_CONFIG=sim test_high_level_api_hpu + diff --git a/.github/workflows/make_release_hpu.yml b/.github/workflows/make_release_hpu.yml new file mode 100644 index 000000000..a51be72a1 --- /dev/null +++ b/.github/workflows/make_release_hpu.yml @@ -0,0 +1,105 @@ +name: Publish HPU release + +on: + workflow_dispatch: + inputs: + dry_run: + description: "Dry-run" + type: boolean + default: true + +env: + ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }} + SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png + SLACK_USERNAME: ${{ secrets.BOT_USERNAME }} + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + +permissions: {} + +jobs: + verify_tag: + uses: ./.github/workflows/verify_tagged_commit.yml + secrets: + 
RELEASE_TEAM: ${{ secrets.RELEASE_TEAM }} + READ_ORG_TOKEN: ${{ secrets.READ_ORG_TOKEN }} + + package: + runs-on: ubuntu-latest + needs: verify_tag + outputs: + hash: ${{ steps.hash.outputs.hash }} + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + persist-credentials: 'false' + token: ${{ secrets.REPO_CHECKOUT_TOKEN }} + - name: Prepare package + run: | + cargo package -p tfhe-hpu-backend + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: crate + path: target/package/*.crate + - name: generate hash + id: hash + run: cd target/package && echo "hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}" + + provenance: + if: ${{ !inputs.dry_run }} + needs: [package] + uses: slsa-framework/slsa-github-generator/.github/workflows/generator_generic_slsa3.yml@v2.1.0 + permissions: + # Needed to detect the GitHub Actions environment + actions: read + # Needed to create the provenance via GitHub OIDC + id-token: write + # Needed to upload assets/artifacts + contents: write + with: + # SHA-256 hashes of the Crate package. + base64-subjects: ${{ needs.package.outputs.hash }} + + publish_release: + name: Publish tfhe-hpu-backend Release + runs-on: ubuntu-latest + needs: [verify_tag, package] # for comparing hashes + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + persist-credentials: 'false' + token: ${{ secrets.REPO_CHECKOUT_TOKEN }} + + - name: Publish crate.io package + env: + CRATES_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + DRY_RUN: ${{ inputs.dry_run && '--dry-run' || '' }} + run: | + # DRY_RUN expansion cannot be double quoted when variable contains empty string otherwise cargo publish + # would fail. This is safe since DRY_RUN is handled in the env section above. 
+ # shellcheck disable=SC2086 + cargo publish -p tfhe-hpu-backend --token "${CRATES_TOKEN}" ${DRY_RUN} + + - name: Generate hash + id: published_hash + run: cd target/package && echo "pub_hash=$(sha256sum ./*.crate | base64 -w0)" >> "${GITHUB_OUTPUT}" + + - name: Slack notification (hashes comparison) + if: ${{ needs.package.outputs.hash != steps.published_hash.outputs.pub_hash }} + continue-on-error: true + uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3 + env: + SLACK_COLOR: failure + SLACK_MESSAGE: "SLSA tfhe-hpu-backend crate - hash comparison failure: (${{ env.ACTION_RUN_URL }})" + + - name: Slack Notification + if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }} + continue-on-error: true + uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 # v2.3.3 + env: + SLACK_COLOR: ${{ job.status }} + SLACK_MESSAGE: "tfhe-hpu-backend release failed: (${{ env.ACTION_RUN_URL }})" diff --git a/.lfsconfig b/.lfsconfig new file mode 100644 index 000000000..e494bcf0b --- /dev/null +++ b/.lfsconfig @@ -0,0 +1,2 @@ +[lfs] + fetchexclude = * diff --git a/Cargo.toml b/Cargo.toml index 421ee22c2..dcf978b49 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,10 +9,12 @@ members = [ "tasks", "tfhe-csprng", "backends/tfhe-cuda-backend", + "backends/tfhe-hpu-backend", "utils/tfhe-versionable", "utils/tfhe-versionable-derive", "utils/param_dedup", "tests", + "mockups/tfhe-hpu-mockup", ] exclude = [ diff --git a/Makefile b/Makefile index 34b29a86e..29d482d94 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ SHELL:=$(shell /usr/bin/env which bash) OS:=$(shell uname) RS_CHECK_TOOLCHAIN:=$(shell cat toolchain.txt | tr -d '\n') CARGO_RS_CHECK_TOOLCHAIN:=+$(RS_CHECK_TOOLCHAIN) +CARGO_BUILD_JOBS=default CPU_COUNT=$(shell ./scripts/cpu_count.sh) RS_BUILD_TOOLCHAIN:=stable CARGO_RS_BUILD_TOOLCHAIN:=+$(RS_BUILD_TOOLCHAIN) @@ -55,6 +56,9 @@ REGEX_PATTERN?='' TFHECUDA_SRC=backends/tfhe-cuda-backend/cuda 
TFHECUDA_BUILD=$(TFHECUDA_SRC)/build +# tfhe-hpu-backend +HPU_CONFIG=v80 + # Exclude these files from coverage reports define COVERAGE_EXCLUDED_FILES --exclude-files apps/trivium/src/trivium/* \ @@ -301,6 +305,13 @@ check_gpu: install_rs_check_toolchain --all-targets \ -p $(TFHE_SPEC) +.PHONY: clippy_hpu # Run clippy lints on tfhe with "hpu" enabled +clippy_hpu: install_rs_check_toolchain + RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy \ + --features=boolean,shortint,integer,internal-keycache,hpu,pbs-stats,extended-types \ + --all-targets \ + -p $(TFHE_SPEC) -- --no-deps -D warnings + .PHONY: fix_newline # Fix newline at end of file issues to be UNIX compliant fix_newline: check_linelint_installed linelint -a . @@ -473,6 +484,11 @@ clippy_cuda_backend: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ -p tfhe-cuda-backend -- --no-deps -D warnings +.PHONY: clippy_hpu_backend # Run clippy lints on the tfhe-hpu-backend +clippy_hpu_backend: install_rs_check_toolchain + RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ + -p tfhe-hpu-backend -- --no-deps -D warnings + .PHONY: check_rust_bindings_did_not_change # Check rust bindings are up to date for tfhe-cuda-backend check_rust_bindings_did_not_change: cargo build -p tfhe-cuda-backend && "$(MAKE)" fmt_gpu && \ @@ -702,6 +718,28 @@ test_signed_integer_multi_bit_gpu_ci: install_rs_check_toolchain install_cargo_n --cargo-profile "$(CARGO_PROFILE)" --multi-bit --backend "gpu" \ --signed-only --tfhe-package "$(TFHE_SPEC)" +.PHONY: test_integer_hpu_ci # Run the tests for integer ci on hpu backend +test_integer_hpu_ci: install_rs_check_toolchain install_cargo_nextest + cargo test --release -p $(TFHE_SPEC) --features hpu-v80 --test hpu + +.PHONY: test_integer_hpu_mockup_ci # Run the tests for integer ci on hpu backend and mockup +test_integer_hpu_mockup_ci: install_rs_check_toolchain install_cargo_nextest + 
source ./setup_hpu.sh --config sim ; \ + cargo build --release --bin hpu_mockup; \ + coproc target/release/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml > mockup.log; \ + HPU_TEST_ITER=1 \ + cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32; \ + status=$$?; kill %1; exit $$status + +.PHONY: test_integer_hpu_mockup_ci_fast # Run the quick tests for integer ci on hpu backend and mockup. +test_integer_hpu_mockup_ci_fast: install_rs_check_toolchain install_cargo_nextest + source ./setup_hpu.sh --config sim ; \ + cargo build --profile devo --bin hpu_mockup; \ + coproc target/devo/hpu_mockup --params mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml > mockup.log; \ + HPU_TEST_ITER=1 \ + cargo test --profile devo -p $(TFHE_SPEC) --features hpu --test hpu -- u32; \ + status=$$?; kill %1; exit $$status + .PHONY: test_boolean # Run the tests of the boolean module test_boolean: install_rs_build_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \ @@ -857,6 +895,23 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest --features=integer,internal-keycache,gpu -p $(TFHE_SPEC) \ -E "test(/high_level_api::.*gpu.*/)" +.PHONY: test_high_level_api_hpu # Run the tests of the high-level API on the HPU backend (HPU_CONFIG=v80 adds the hpu-v80 feature) +test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest +ifeq ($(HPU_CONFIG), v80) + RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \ + --build-jobs=$(CARGO_BUILD_JOBS) \ + --test-threads=1 \ + --features=integer,internal-keycache,hpu,hpu-v80 -p $(TFHE_SPEC) \ + -E "test(/high_level_api::.*hpu.*/)" +else + RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \ + --build-jobs=$(CARGO_BUILD_JOBS) \ + --test-threads=1 \ + --features=integer,internal-keycache,hpu -p $(TFHE_SPEC) \ + -E "test(/high_level_api::.*hpu.*/)" +endif + + .PHONY: test_strings # Run the tests for strings ci test_strings: install_rs_build_toolchain RUSTFLAGS="$(RUSTFLAGS)" cargo 
$(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \ @@ -1100,6 +1154,12 @@ clippy_bench_gpu: install_rs_check_toolchain --features=gpu,shortint,integer,internal-keycache,nightly-avx512,pbs-stats,zk-pok \ -p tfhe-benchmark -- --no-deps -D warnings +.PHONY: clippy_bench_hpu # Run clippy lints on tfhe-benchmark with "hpu" enabled +clippy_bench_hpu: install_rs_check_toolchain + RUSTFLAGS="$(RUSTFLAGS)" cargo "$(CARGO_RS_CHECK_TOOLCHAIN)" clippy --all-targets \ + --features=hpu,shortint,integer,internal-keycache,pbs-stats\ + -p tfhe-benchmark -- --no-deps -D warnings + .PHONY: print_doc_bench_parameters # Print parameters used in doc benchmarks print_doc_bench_parameters: RUSTFLAGS="" cargo run --example print_doc_bench_parameters \ @@ -1133,6 +1193,14 @@ bench_signed_integer_gpu: install_rs_check_toolchain --bench integer-signed-bench \ --features=integer,gpu,internal-keycache,nightly-avx512,pbs-stats -p tfhe-benchmark -- +.PHONY: bench_integer_hpu # Run benchmarks for integer on HPU backend +bench_integer_hpu: install_rs_check_toolchain + source ./setup_hpu.sh --config $(HPU_CONFIG) ; \ + RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_OP_FLAVOR=$(BENCH_OP_FLAVOR) __TFHE_RS_FAST_BENCH=$(FAST_BENCH) __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ + cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \ + --bench integer-bench \ + --features=integer,internal-keycache,pbs-stats,hpu,hpu-v80 -p tfhe-benchmark -- --quick + .PHONY: bench_integer_compression # Run benchmarks for unsigned integer compression bench_integer_compression: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" __TFHE_RS_BENCH_TYPE=$(BENCH_TYPE) \ @@ -1324,6 +1392,14 @@ bench_hlapi_dex_gpu: install_rs_check_toolchain --bench hlapi-dex \ --features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p tfhe-benchmark -- +.PHONY: bench_hlapi_erc20_hpu # Run benchmarks for ERC20 operations on HPU +bench_hlapi_erc20_hpu: install_rs_check_toolchain + source ./setup_hpu.sh --config $(HPU_CONFIG) ; \ + RUSTFLAGS="$(RUSTFLAGS)" \ + cargo 
$(CARGO_RS_CHECK_TOOLCHAIN) bench \ + --bench hlapi-erc20 \ + --features=integer,internal-keycache,hpu,hpu-v80 -p tfhe-benchmark -- --quick + .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate bench_tfhe_zk_pok: install_rs_check_toolchain RUSTFLAGS="$(RUSTFLAGS)" \ @@ -1423,6 +1499,9 @@ tfhe_lints pcc_gpu: check_rust_bindings_did_not_change clippy_rustdoc_gpu \ clippy_gpu clippy_cuda_backend clippy_bench_gpu check_compile_tests_benches_gpu +.PHONY: pcc_hpu # pcc stands for pre commit checks for HPU compilation +pcc_hpu: clippy_hpu clippy_hpu_backend test_integer_hpu_mockup_ci_fast + .PHONY: fpcc # pcc stands for pre commit checks, the f stands for fast fpcc: no_tfhe_typo no_dbg_log check_parameter_export_ok check_fmt check_typos lint_doc \ check_md_docs_are_tested clippy_fast check_compile_tests diff --git a/_typos.toml b/_typos.toml index 69e10146c..4946598a7 100644 --- a/_typos.toml +++ b/_typos.toml @@ -11,11 +11,13 @@ extend-ignore-identifiers-re = [ # Example with string replacing "hello" with "herlo" "herlo", # Example in trivium - "C9217BA0D762ACA1" + "C9217BA0D762ACA1", + "0x[0-9a-fA-F]+" ] [files] extend-exclude = [ "backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu", "backends/tfhe-cuda-backend/cuda/src/fft/twiddles.cu", + "backends/tfhe-hpu-backend/config_store/**/*.link_summary", ] diff --git a/backends/tfhe-hpu-backend/.gitattributes b/backends/tfhe-hpu-backend/.gitattributes new file mode 100644 index 000000000..afe123f81 --- /dev/null +++ b/backends/tfhe-hpu-backend/.gitattributes @@ -0,0 +1,3 @@ +*.xclbin filter=lfs diff=lfs merge=lfs -text +*.pdi filter=lfs diff=lfs merge=lfs -text +python/lib/example.json filter=lfs diff=lfs merge=lfs -text diff --git a/backends/tfhe-hpu-backend/.gitignore b/backends/tfhe-hpu-backend/.gitignore new file mode 100644 index 000000000..b175ca32e --- /dev/null +++ b/backends/tfhe-hpu-backend/.gitignore @@ -0,0 +1,3 @@ +ngt_* +config +kogge_cfg.toml diff --git 
a/backends/tfhe-hpu-backend/Cargo.toml b/backends/tfhe-hpu-backend/Cargo.toml new file mode 100644 index 000000000..9022cd5c6 --- /dev/null +++ b/backends/tfhe-hpu-backend/Cargo.toml @@ -0,0 +1,89 @@ +[package] +name = "tfhe-hpu-backend" +version = "0.1.0" +edition = "2021" +license = "BSD-3-Clause-Clear" +description = "HPU implementation on FPGA of TFHE-rs primitives." +homepage = "https://www.zama.ai/" +documentation = "https://docs.zama.ai/tfhe-rs" +repository = "https://github.com/zama-ai/tfhe-rs" +readme = "Readme.md" +keywords = ["fully", "homomorphic", "encryption", "fhe", "cryptography", "hardware", "fpga"] + +[features] +hw-xrt = [] +hw-v80 = [] +io-dump = ["num-traits"] +rtl_graph = ["dot2"] +utils = ["clap", "clap-num", "bitvec", "serde_json"] + +[build-dependencies] +cxx-build = "1.0" + +[dependencies] +cxx = "1.0" +hw_regmap = "0.1.0" + +strum = { version = "0.26.2", features = ["derive"] } +strum_macros = "0.26.2" +enum_dispatch = "0.3.13" +tracing = "0.1.40" +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +serde = { version = "1", features = ["derive"] } +toml = { version = "0.8.*", features = [] } +paste = "1.0.15" +thiserror = "1.0.61" +bytemuck = "1.16.0" +anyhow = "1.0.82" +lazy_static = "1.4.0" +rand = "0.8.5" +regex = "1.10.4" +bitflags = { version = "2.5.0", features = ["serde"] } +itertools = "0.11.0" +lru = "0.12.3" +bitfield-struct = "0.10.0" +crossbeam = { version = "0.8.4", features = ["crossbeam-queue"] } +rayon = { workspace = true } + +# Dependencies used for Sim feature +ipc-channel = "0.18.3" + +# NB: crates.io rejects bare wildcard ("*") requirements, so optional dependencies are pinned to a major version +# Dependencies used for debug feature +num-traits = { version = "0.2", optional = true } +clap = { version = "4.4.4", features = ["derive"], optional = true } +clap-num = { version = "1.1.1", optional = true } +nix = { version = "0.29.0", features = ["ioctl", "uio"] } + +# Dependencies used for rtl_graph features +dot2 = { version = "1.0", optional = true } + +bitvec = { version = "1.0", optional = true } +serde_json = { 
version = "1.0", optional = true } + +# Binary for manual debugging +# Enable to access Hpu register and drive some custom sequence by hand +[[bin]] +name = "hputil" +path = "src/utils/hputil.rs" +required-features = ["utils"] + +# Binary for asm manipulation +# Enable to convert back and forth between asm/hex format +[[bin]] +name = "dop_fmt" +path = "src/utils/dop_fmt.rs" +required-features = ["utils"] + +# Enable to convert back and forth between asm/hex format +[[bin]] +name = "iop_fmt" +path = "src/utils/iop_fmt.rs" +required-features = ["utils"] + +# Firmware generation +# Enable to expand IOp in list of Dop for inspection +[[bin]] +name = "fw" +path = "src/utils/fw.rs" +required-features = ["utils"] diff --git a/backends/tfhe-hpu-backend/LICENSE b/backends/tfhe-hpu-backend/LICENSE new file mode 100644 index 000000000..48312e88a --- /dev/null +++ b/backends/tfhe-hpu-backend/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause Clear License + +Copyright © 2025 ZAMA. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or other +materials provided with the distribution. + +3. Neither the name of ZAMA nor the names of its contributors may be used to endorse +or promote products derived from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. +THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL +ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/backends/tfhe-hpu-backend/Readme.md b/backends/tfhe-hpu-backend/Readme.md new file mode 100644 index 000000000..f80e2c633 --- /dev/null +++ b/backends/tfhe-hpu-backend/Readme.md @@ -0,0 +1,261 @@ +# TFHE-hpu-backend + +## Brief +The `tfhe-hpu-backend` holds the code to interface with the HPU accelerator of TFHE. +It contains a `HpuDevice` abstraction that enables easy configuration and dispatching of TFHE operations on the HPU accelerator. + +The user API exposes the following functions for hardware setup: +- `HpuDevice::new`, `HpuDevice::from_config`: Instantiates the device abstraction from a configuration file. +- `HpuDevice::init`: Configures and uploads the required public material. +- `new_var_from`: Creates an HPU ciphertext from a `tfhe-rs` ciphertext. + +The HPU device can also be used from `integer` with the help of the following functions: +- `tfhe::integer::hpu::init_device`: Initializes the given HPU device with the server key. +- `tfhe::integer::hpu::ciphertext::HpuRadixCiphertext::from_radix_ciphertext`: Converts a CpuRadixCiphertext into its HPU counterpart. + +The HPU device can also be used seamlessly from `hl-api` by setting up a thread-local HPU server key: +- `tfhe::Config::from_hpu_device`: Extracts the hl-api configuration from a HpuDevice. +- `tfhe::set_server_key`: Registers the Hpu server key in the current thread. + +HPU variables can also be created from a `high-level-api` object, with the help of the `hw-xfer` feature. 
+This implements a trait that enables `clone_on` and `mv_on` of `FheUint` objects onto the HPU accelerator, and casting back `from` them. + +These objects implement the `std::ops` traits and can be used to dispatch operations on HPU hardware. + +### Backend structure +`tfhe-hpu-backend` is split into various modules: +- `entities`: Defines the structures handled by the HPU accelerator. Conversion traits from/into those objects are implemented in `tfhe-rs`. +- `asm`: Describes an assembly-like language for the HPU. It makes it possible to abstract the HPU behavior and to easily update it through micro-code. +- `fw`: Abstraction to help the micro-code designer. Uses a simple rust program for describing new HPU operations. Helps with register/heap management. +- `interface`: + + `device`: High-level structure that exposes the User API. + + `backend`: Inner private structure that contains HPU modules + + `variable`: Wraps HPU ciphertexts. It hooks the lifetime of a hardware object into the `rust` borrow-checker. + + `memory`: Handles on-board memory allocation and synchronization + + `config`: Helps to configure the HPU accelerator through a TOML configuration file + + `cmd`: Translates operations over `variable` into concrete HPU commands + + `regmap`: Communicates with the HPU internal registers with ease. + + `rtl`: Defines concrete `rust` structures populated from the HPU's status/configuration registers + + +Below is an overview of the internal structure of the Backend. +![HPU backend structure](./figures/tfhe-hpu-backend.excalidraw.png) + +This picture depicts the internal modules of `tfhe-hpu-backend`. Device is the main entry point for the user. Its lifecycle is as follows: + +1. Create HpuDevice, open link with the associated FPGA. Configure associated drivers and upload the bitstream. Read FPGA registers to extract supported configuration and features. Build Firmware conversion table (IOp -> DOps stream). + +2. Allocate required memory chunks in the on-board memory. 
Upload public material required by TFHE computation. + +3. Create HPU variables that handle TFHE Ciphertexts. It wraps TFHE Ciphertext with required internal resources and enforces the correct lifetime management. This abstraction enforces that during the variable lifecycle all required resources are valid. + +4. Users could trigger HPU operation from the HPU variable. + Variable abstraction enforces that required objects are correctly synced on the hardware and converts each operation in a concrete HPU command. + When HPU operation is acknowledged by the hardware, the internal state of the associated variable is updated. + This mechanism enables asynchronous operation and minimal amount of Host to/from HW memory transfer. + This mechanism also enables offloading a computation graph to the HPU and requires a synchronization only on the final results. + +## Example +### Configuration file +HPU configuration knobs are gathered in a TOML configuration file. This file describes the targeted FPGA with its associated configuration: +```toml +[fpga] # FPGA target + # Register layout in the FPGA + regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"] + polling_us=10 +[fpga.ffi.V80] # Hardware properties + ami_dev="/dev/ami1" # Name of ami device + qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-0" # QDma host to card device + qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-1" # QDma card to host device + +[rtl] # RTL option + bpip_used = true # BPIP/IPIP mode + bpip_use_opportunism = false # Use strict flush paradigm + bpip_timeout = 100_000 # BPIP timeout in clock `cycles` + +[board] # Board configuration + ct_mem = 32768 # Number of allocated ciphertext + ct_pc = [ # Memory used for ciphertext + {Hbm= {pc=32}}, + 
{Hbm= {pc=33}}, + ] + heap_size = 16384 # Number of slots reserved for heap + + lut_mem = 256 # Number of allocated LUT table + lut_pc = {Hbm={pc=34}} # Memory used for LUT + + fw_size= 16777216 # Size in byte of the Firmware translation table + fw_pc = {Ddr= {offset= 0x3900_0000}} # Memory used for firmware translation table + + bsk_pc = [ # Memory used for Bootstrapping key + {Hbm={pc=8}}, + {Hbm={pc=12}}, + {Hbm={pc=24}}, + {Hbm={pc=28}}, + {Hbm={pc=40}}, + {Hbm={pc=44}}, + {Hbm={pc=56}}, + {Hbm={pc=60}} + ] + + ksk_pc = [ # Memory used for Keyswitching key + {Hbm={pc=0}}, + {Hbm={pc=1}}, + {Hbm={pc=2}}, + {Hbm={pc=3}}, + {Hbm={pc=4}}, + {Hbm={pc=5}}, + {Hbm={pc=6}}, + {Hbm={pc=7}}, + {Hbm={pc=16}}, + {Hbm={pc=17}}, + {Hbm={pc=18}}, + {Hbm={pc=19}}, + {Hbm={pc=20}}, + {Hbm={pc=21}}, + {Hbm={pc=22}}, + {Hbm={pc=23}} + ] + + trace_pc = {Hbm={pc=35}} # Memory used for trace log + trace_depth = 32 # Size of Memory in MiB allocated for trace log + +[firmware] # Firmware properties + implementation = "Llt" # Firmware flavor to use + integer_w=[4,6,8,10,12,14,16,32,64,128] # List of supported IOp width + min_batch_size = 11 # Minimum batch size for maximum throughput + kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml" + custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm" + +# Default firmware configuration. 
Could be edited on per-IOp basis +[firmware.op_cfg.default] + fill_batch_fifo = true + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + ``` + +### Device setup +Following code snippet shows how to instantiate and configure a `HpuDevice`: +```rust + // Following code snippets used the HighLevelApi abstraction + // Instantiate HpuDevice -------------------------------------------------- + let hpu_device = HpuDevice::from_config(&args.config.expand()); + + // Generate keys ---------------------------------------------------------- + let config = Config::from_hpu_device(&hpu_device); + + let cks = ClientKey::generate(config); + let csks = CompressedServerKey::new(&cks); + + // Register HpuDevice and key as thread-local engine + set_server_key((hpu_device, csks)); +``` + +### Clone CPU ciphertext on HPU +Following code snippet shows how to convert CPU ciphertext in HPU one: +``` rust + // Draw random value as input + let a = rand::thread_rng().gen_range(0..u8::MAX); + + // Encrypt them on Cpu side + let a_fhe = FheUint8::encrypt(a, &cks); + + // Clone a ciphertext and move them in HpuWorld + // NB: Data doesn't move over Pcie at this stage + // Data are only arranged in Hpu ordered an copy in the host internal buffer + let a_hpu = a_fhe.clone_on(&hpu_device); +``` + +### Dispatch operation on HPU +Once registered as thread-local engine, HighLevel FheUint are converted in Hpu format. 
+The following code snippets show how to start an operation on HPU: + +``` rust + // Sum ------------------------------------------------------------- + // Generate random input values and compute expected result + let in_a = rng.gen_range(0..u64::max_value()); + let in_b = rng.gen_range(0..u64::max_value()); + let clear_sum_ab = in_a.wrapping_add(in_b); + + // Encrypt input values + let fhe_a = FheUint64::encrypt(in_a, cks); + let fhe_b = FheUint64::encrypt(in_b, cks); + + // Trigger the operation on HPU through hl_api + let fhe_sum_ab = fhe_a+fhe_b; + + // Decrypt values + let dec_sum_ab: u64 = fhe_sum_ab.decrypt(cks); +``` + +## Pre-made Examples +There are some example applications already available in `tfhe/examples/hpu`: + * hpu_hlapi: Depicts the use of the HPU device through HighLevelApi. + * hpu_bench: Depicts the use of the HPU device through Integer abstraction level. + +In order to run those applications on hardware, the user must build from the project root (i.e `tfhe-rs-internal`) with the `hpu-v80` feature: + +> NB: Running the examples requires the `.pdi` files to have been correctly pulled. Those files, due to their size, are backed by git-lfs and disabled by default. +> In order to retrieve them, use the following command: +> ```bash +> git lfs pull --include="*" --exclude="" +> ``` + +``` bash +cargo build --release --features="hpu-v80" --example hpu_hlapi --example hpu_bench +# Correctly setup environment with setup_hpu.sh script +source setup_hpu.sh --config v80 --init-qdma +./target/release/examples/hpu_bench --integer-w 64 --integer-w 32 --iop MUL --iter 10 +./target/release/examples/hpu_hlapi +``` + +## Test framework +There is also a set of tests baked into tfhe-rs. Tests are gathered in test bundles over various integer widths. 
+Those tests have 6 sub-kinds: +* `alu`: Run and check all ct x ct IOp +* `alus`: Run and check all ct x scalar IOp +* `bitwise`: Run and check all bitwise IOp +* `cmp`: Run and check all comparison IOp +* `ternary`: Run and check ternary operation +* `algo`: Run and check IOp dedicated to offload small algorithms + + +The snippets below give some examples of commands that could be used for testing: +``` bash +# Correctly setup environment with setup_hpu.sh script +source setup_hpu.sh --config v80 --init-qdma + +# Run all sub-kind for 64b integer width +cargo test --release --features="hpu-v80" --test hpu -- u64 + +# Run only `bitwise` sub-kind for all integer width IOp +cargo test --release --features="hpu-v80" --test hpu -- bitwise +``` + +## Benches framework +HPU is completely integrated in the tfhe benchmark system. Performance results could be extracted from HighLevelApi or Integer Api. +Three benchmarks could be started, through the following Makefile targets for simplicity: +``` bash +# Do not forget to correctly set environment before hand +source setup_hpu.sh --config v80 --init-qdma + +# Run hlapi benches +make test_high_level_api_hpu + +# Run hlapi erc20 benches +make bench_hlapi_erc20_hpu + +# Run integer level benches +make bench_integer_hpu +``` + +## Eager to start without real Hardware ? +You are still waiting for your FPGA board and are frustrated by lead time ? +Don't worry, we have you covered. A dedicated simulation infrastructure with accurate performance estimation is available in tfhe-rs. +You can use it on any linux/MacOs to test HPU integration within tfhe-rs and optimize your application for HPU target. +Simply take a look at [Hpu mockup](../../mockups/tfhe-hpu-mockup/Readme.md), and follow the instructions. 
diff --git a/backends/tfhe-hpu-backend/build.rs b/backends/tfhe-hpu-backend/build.rs new file mode 100644 index 000000000..291897082 --- /dev/null +++ b/backends/tfhe-hpu-backend/build.rs @@ -0,0 +1,26 @@ +fn main() { + if cfg!(feature = "hw-xrt") { + println!("cargo:rustc-link-search=/opt/xilinx/xrt/lib"); + println!("cargo:rustc-link-lib=dylib=stdc++"); + println!("cargo:rustc-link-lib=dl"); + println!("cargo:rustc-link-lib=rt"); + println!("cargo:rustc-link-lib=uuid"); + println!("cargo:rustc-link-lib=dylib=xrt_coreutil"); + + cxx_build::bridge("src/ffi/xrt/mod.rs") + .file("src/ffi/xrt/cxx/hpu_hw.cc") + .file("src/ffi/xrt/cxx/mem_zone.cc") + .flag_if_supported("-std=c++23") + .include("/opt/xilinx/xrt/include") // Enhance: support parsing bash env instead of hard path + .flag("-fmessage-length=0") + .compile("hpu-hw-ffi"); + + println!("cargo:rerun-if-changed=src/ffi/xrt/mod.rs"); + println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.cc"); + println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/hpu_hw.h"); + println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.cc"); + println!("cargo:rerun-if-changed=src/ffi/xrt/cxx/mem_zone.h"); + } else { + // Simulation ffi -> nothing to do + } +} diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_0.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_0.asm new file mode 100644 index 000000000..838beed9e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_0.asm @@ -0,0 +1,15 @@ +# CUST_0 +# Simple IOp to check the xfer between Hpu/Cpu +# Construct constant in dest slot -> 249 (0xf9) +SUB R0 R0 R0 +ADDS R0 R0 1 +ST TD[0].0 R0 +SUB R1 R1 R1 +ADDS R1 R1 2 +ST TD[0].1 R1 +SUB R2 R2 R2 +ADDS R2 R2 3 +ST TD[0].2 R2 +SUB R3 R3 R3 +ADDS R3 R3 3 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_1.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_1.asm new file mode 100644 index 000000000..3679e2c5f --- /dev/null +++ 
b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_1.asm @@ -0,0 +1,11 @@ +# CUST_1 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_a +LD R0 TS[0].0 +LD R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_10.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_10.asm new file mode 100644 index 000000000..f591d66b3 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_10.asm @@ -0,0 +1,25 @@ +; CUST_10 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 + Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +ADD R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- Src[0].2 + Src[1].2 +LD R9 TS[0].2 +LD R10 TS[1].2 +ADD R8 R9 R10 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 + Src[1].3 +LD R13 TS[0].3 +LD R14 TS[1].3 +ADD R12 R13 R14 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_16.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_16.asm new file mode 100644 index 000000000..0b4cfe80f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_16.asm @@ -0,0 +1,6 @@ +# CUST_16 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a.0) +LD R0 TS[0].0 +PBS_F R0 R0 PbsNone +ST TD[0].0 R0 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_17.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_17.asm new file mode 100644 index 000000000..bdb6711a7 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_17.asm @@ -0,0 +1,15 @@ +# CUST_17 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a) +LD R0 TS[0].0 +PBS R0 R0 PbsNone +ST TD[0].0 R0 +LD R1 TS[0].1 +PBS R1 R1 PbsNone +ST TD[0].1 R1 +LD R2 TS[0].2 +PBS R2 R2 PbsNone +ST TD[0].2 R2 +LD R3 TS[0].3 +PBS_F R3 R3 PbsNone +ST 
TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_18.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_18.asm new file mode 100644 index 000000000..c4b9a46a0 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_18.asm @@ -0,0 +1,23 @@ +; CUST_18 +; Simple IOp to check extraction pattern +; Correct result: +; * Dst[0,1] <- Src[0][0,1] +; * Dst[2,3] <- Src[1][0,1] + +; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1] +LD R0 TS[0].0 +LD R1 TS[0].1 +MAC R3 R1 R0 4 +PBS R4 R3 PbsMsgOnly +PBS R5 R3 PbsCarryInMsg +ST TD[0].0 R4 +ST TD[0].1 R5 + +; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3] +LD R10 TS[1].0 +LD R11 TS[1].1 +MAC R13 R11 R10 4 +PBS R14 R13 PbsMsgOnly +PBS R15 R13 PbsCarryInMsg +ST TD[0].2 R14 +ST TD[0].3 R15 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_19.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_19.asm new file mode 100644 index 000000000..0974347fa --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_19.asm @@ -0,0 +1,19 @@ +; CUST_19 +; Simple IOp to check PbsMl2 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- 0 +; * Dst[0][2] <- Src[0][0] +1 +; * Dst[0][3] <- 0 +; i.e Cust_19(0x2) => 0x32 + +; Construct a 0 for destination padding +SUB R16 R16 R16 + +; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded) +LD R0 TS[0].0 +PBS_ML2_F R0 R0 PbsTestMany2 +ST TD[0].0 R0 +ST TD[0].1 R16 +ST TD[0].2 R1 +ST TD[0].3 R16 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_2.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_2.asm new file mode 100644 index 000000000..bc8e0175e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_2.asm @@ -0,0 +1,11 @@ +# CUST_2 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_b +LD R0 TS[1].0 +LD R1 TS[1].1 +LD R2 TS[1].2 +LD R3 TS[1].3 +ST TD[0].0 R0 
+ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_20.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_20.asm new file mode 100644 index 000000000..5f29f8ee5 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_20.asm @@ -0,0 +1,22 @@ +; CUST_20 +; Simple IOp to check PbsMl4 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; i.e Cust_20(0x0) => 0xe4 + +SUB R16 R16 R16 +ST TD[0].0 R0 +ST TD[0].1 R0 +ST TD[0].2 R0 +ST TD[0].3 R0 + +; Apply PbsMl4 on Src[0] result goes in dest[0][0-3] +LD R0 TS[0].0 +PBS_ML4_F R0 R0 PbsTestMany4 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_21.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_21.asm new file mode 100644 index 000000000..5a601bbe6 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_21.asm @@ -0,0 +1,24 @@ +; CUST_21 +; Simple IOp to check PbsMl8 +; WARN: This operation required 16b ct width +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; * Dst[0][4] <- Src[0][0] +4 +; * Dst[0][5] <- Src[0][0] +5 +; * Dst[0][6] <- Src[0][0] +6 +; * Dst[0][7] <- Src[0][0] +7 + +; Apply PbsMl8 on Src[0] result goes in dest[0][0-7] +LD R0 TS[0].0 +PBS_ML8_F R0 R0 PbsTestMany8 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 +ST TD[0].4 R4 +ST TD[0].5 R5 +ST TD[0].6 R6 +ST TD[0].7 R7 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_3.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_3.asm new file mode 100644 index 000000000..d13ca243c --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_3.asm @@ -0,0 +1,16 @@ +# CUST_3 +# Simple IOp to check isc 
behavior +# Generate obvious deps and check that isc correctly issued the dop +# Correct result must be Dest <- Src[0] +LD R0 TS[0].0 +LD R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +PBS R4 R0 PbsNone +ST TD[0].0 R4 +PBS R4 R1 PbsNone +ST TD[0].1 R4 +PBS R4 R2 PbsNone +ST TD[0].2 R4 +PBS_F R4 R3 PbsNone +ST TD[0].3 R4 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_8.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_8.asm new file mode 100644 index 000000000..c02eee9cd --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_8.asm @@ -0,0 +1,19 @@ +; CUST_8 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +SUB R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- (Src[0].2 *4) + Src[1].2 +LD R9 TS[0].2 +LD R10 TS[1].2 +MAC R8 R9 R10 4 +ST TD[0].2 R8 diff --git a/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_9.asm b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_9.asm new file mode 100644 index 000000000..5e5cc4129 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/custom_iop/cust_9.asm @@ -0,0 +1,21 @@ +; CUST_9 +; Simple IOp to check the ALU Scalar operation +; Dst[0].0 <- Src[0].0 + Imm[0].0 +LD R1 TS[0].0 +ADDS R0 R1 TI[0].0 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Imm[0].1 +LD R5 TS[0].1 +SUBS R4 R5 TI[0].1 +ST TD[0].1 R4 + +; Dst[0].2 <- Imm[0].2 - Src[0].2 +LD R9 TS[0].2 +SSUB R8 R9 TI[0].2 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 * Imm[0].3 +LD R13 TS[0].3 +MULS R12 R13 TI[0].3 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml new file mode 100644 index 000000000..80e7a4827 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_config.toml @@ -0,0 +1,108 @@ + +[fpga] + 
regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/tb_hpu_regif_dummy.toml"] + polling_us=100000 +[fpga.ffi.Sim] + ipc_name="/tmp/${USER}/hpu_mockup_ipc" + +[rtl] + bpip_use = true + bpip_use_opportunism = true + bpip_timeout = 100_000 + +[board] + ct_mem = 32768 + ct_pc = [ + {Hbm= {pc=32}}, + {Hbm= {pc=33}}, + ] + heap_size = 16384 + + lut_mem = 256 + lut_pc = {Hbm={pc=34}} + + fw_size= 16777216 # i.e. 16 MiB + fw_pc = {Ddr= {offset= 0x3900_0000}} # NB: Allocation must take place in the Discrete DDR + + bsk_pc = [ + {Hbm={pc=8}}, + {Hbm={pc=12}}, + {Hbm={pc=24}}, + {Hbm={pc=28}}, + {Hbm={pc=40}}, + {Hbm={pc=44}}, + {Hbm={pc=56}}, + {Hbm={pc=60}} + ] + + ksk_pc = [ + {Hbm={pc=0}}, + {Hbm={pc=1}}, + {Hbm={pc=2}}, + {Hbm={pc=3}}, + {Hbm={pc=4}}, + {Hbm={pc=5}}, + {Hbm={pc=6}}, + {Hbm={pc=7}}, + {Hbm={pc=16}}, + {Hbm={pc=17}}, + {Hbm={pc=18}}, + {Hbm={pc=19}}, + {Hbm={pc=20}}, + {Hbm={pc=21}}, + {Hbm={pc=22}}, + {Hbm={pc=23}} + ] + + trace_pc = {Hbm={pc=35}} + trace_depth = 32 # In MiB + +[firmware] + implementation = "Llt" + integer_w=[2,4,6,8,10,12,14,16,32,64,128] + min_batch_size = 11 + kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml" + custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm" + custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm" + custom_iop.'IOP[2]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm" + custom_iop.'IOP[3]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm" + custom_iop.'IOP[8]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm" + custom_iop.'IOP[9]' = 
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm" + custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm" + custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm" + custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm" + custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm" + custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm" + custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm" + +[firmware.op_cfg.default] + fill_batch_fifo = true + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MUL] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MULS] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.ERC_20] + fill_batch_fifo = true + min_batch_size = false + use_tiers = true + flush_behaviour = "Patient" + flush = true diff --git a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_1in3.toml b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_1in3.toml new file mode 100644 index 000000000..bfdb80263 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_1in3.toml @@ -0,0 +1,256 @@ +module_name="hpu_regif_core_cfg_1in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." 
+word_size_b = 32 +offset = 0x00 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_cfg_1in3] +description="entry_cfg_1in3 section with known value used for debug." +offset= 0x0 + +[section.entry_cfg_1in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x01010101} + +[section.entry_cfg_1in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x11111111} + +[section.entry_cfg_1in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x21212121} + + +[section.entry_cfg_1in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x31313131} + +# ===================================================================================================================== +[section.info] +description="RTL architecture parameters" +offset= 0x10 + +[section.info.register.version] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="VERSION"} + +[section.info.register.ntt_architecture] + description="NTT architecture" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="NTT_CORE_ARCH"} + +[section.info.register.ntt_structure] + description="NTT structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix = { size_b=8, offset_b=0 , default={Param="R"}, description="NTT radix"} + field.psi = { size_b=8, offset_b=8 , default={Param="PSI"}, description="NTT psi"} + field.div = { size_b=8, offset_b=16, default={Param="BWD_PSI_DIV"}, description="NTT backward div"} + field.delta = { size_b=8, offset_b=24, 
default={Param="DELTA"}, description="NTT network delta (for wmm arch)"} + +[section.info.register.ntt_rdx_cut] + description="NTT radix cuts, in log2 unit (for gf64 arch)" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix_cut0 = { size_b=4, offset_b=0 , default={Param="NTT_RDX_CUT_S_0"}, description="NTT radix cut #0"} + field.radix_cut1 = { size_b=4, offset_b=4 , default={Param="NTT_RDX_CUT_S_1"}, description="NTT radix cut #1"} + field.radix_cut2 = { size_b=4, offset_b=8 , default={Param="NTT_RDX_CUT_S_2"}, description="NTT radix cut #2"} + field.radix_cut3 = { size_b=4, offset_b=12, default={Param="NTT_RDX_CUT_S_3"}, description="NTT radix cut #3"} + field.radix_cut4 = { size_b=4, offset_b=16, default={Param="NTT_RDX_CUT_S_4"}, description="NTT radix cut #4"} + field.radix_cut5 = { size_b=4, offset_b=20, default={Param="NTT_RDX_CUT_S_5"}, description="NTT radix cut #5"} + field.radix_cut6 = { size_b=4, offset_b=24, default={Param="NTT_RDX_CUT_S_6"}, description="NTT radix cut #6"} + field.radix_cut7 = { size_b=4, offset_b=28, default={Param="NTT_RDX_CUT_S_7"}, description="NTT radix cut #7"} + +[section.info.register.ntt_pbs] + description="Maximum number of PBS in the NTT pipeline" + owner="Parameter" + read_access="Read" + write_access="None" + field.batch_pbs_nb = { size_b=8, offset_b=0 , default={Param="BATCH_PBS_NB"}, description="Maximum number of PBS in the NTT pipe"} + field.total_pbs_nb = { size_b=8, offset_b=8 , default={Param="TOTAL_PBS_NB"}, description="Maximum number of PBS stored in PEP buffer"} + +[section.info.register.ntt_modulo] + description="Code associated to the NTT prime" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="MOD_NTT_NAME"} + +[section.info.register.application] + description="Code associated with the application" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="APPLICATION_NAME"} + +[section.info.register.ks_structure] + 
description="Key-switch structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.x = { size_b=8, offset_b=0 , default={Param="LBX"}, description="Number of coefficients on X dimension"} + field.y = { size_b=8, offset_b=8 , default={Param="LBY"}, description="Number of coefficients on Y dimension"} + field.z = { size_b=8, offset_b=16, default={Param="LBZ"}, description="Number of coefficients on Z dimension"} + +[section.info.register.ks_crypto_param] + description="Key-switch crypto parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.mod_ksk_w = { size_b=8, offset_b=0 , default={Param="MOD_KSK_W"}, description="Width of KSK modulo"} + field.ks_l = { size_b=8, offset_b=8 , default={Param="KS_L"}, description="Number of KS decomposition level"} + field.ks_b = { size_b=8, offset_b=16, default={Param="KS_B_W"}, description="Width of KS decomposition base"} + +[section.info.register.regf_structure] + description="Register file structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.reg_nb = { size_b=8, offset_b=0 , default={Param="REGF_REG_NB"}, description="Number of registers in regfile"} + field.coef_nb = { size_b=8, offset_b=8 , default={Param="REGF_COEF_NB"}, description="Number of coefficients at regfile interface"} + +[section.info.register.isc_structure] + description="Instruction scheduler structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.depth = { size_b=8, offset_b=0 , default={Param="ISC_DEPTH"}, description="Number of slots in ISC lookahead buffer."} + field.min_iop_size = { size_b=8, offset_b=8 , default={Param="MIN_IOP_SIZE"}, description="Minimum number of DOp per IOp to prevent sync_id overflow."} + +[section.info.register.pe_properties] + description="Processing elements parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.alu_nb = { size_b=8, offset_b=24 , 
default={Param="PEA_ALU_NB"}, description="Number of coefficients processed in parallel in pe_alu"} + field.pep_regf_period = { size_b=8, offset_b=16 , default={Param="PEP_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEP and regfile"} + field.pem_regf_period = { size_b=8, offset_b=8 , default={Param="PEM_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEM and regfile"} + field.pea_regf_period = { size_b=8, offset_b=0 , default={Param="PEA_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEA and regfile"} + +[section.info.register.bsk_structure] + description="BSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_cut_nb = { size_b=8, offset_b=8 , default={Param="BSK_CUT_NB"}, description="BSK cut nb"} + +[section.info.register.ksk_structure] + description="KSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.ksk_cut_nb = { size_b=8, offset_b=8 , default={Param="KSK_CUT_NB"}, description="KSK cut nb"} + +[section.info.register.hbm_axi4_nb] + description="Number of AXI4 connections to HBM" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_pc = { size_b=8, offset_b=0 , default={Param="BSK_PC"}, description="Number of HBM connections for BSK"} + field.ksk_pc = { size_b=8, offset_b=8, default={Param="KSK_PC"}, description="Number of HBM connections for KSK"} + field.pem_pc = { size_b=8, offset_b=16, default={Param="PEM_PC"}, description="Number of HBM connections for ciphertexts (PEM)"} + field.glwe_pc = { size_b=8, offset_b=24, default={Param="GLWE_PC"}, description="Number of HBM connections for GLWE"} + +[section.info.register.hbm_axi4_dataw_pem] + description="Ciphertext HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_PEM_DATA_W"} + 
+[section.info.register.hbm_axi4_dataw_glwe] + description="GLWE HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_GLWE_DATA_W"} + +[section.info.register.hbm_axi4_dataw_bsk] + description="BSK HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_BSK_DATA_W"} + +[section.info.register.hbm_axi4_dataw_ksk] + description="KSK HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_KSK_DATA_W"} + + +# ===================================================================================================================== +[section.hbm_axi4_addr_1in3] +offset= 0x1000 +description="HBM AXI4 connection address offset" + +[section.hbm_axi4_addr_1in3.register.ct] + description="Address offset for each ciphertext HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb","_pc1_lsb", "_pc1_msb"] + +[section.hbm_axi4_addr_1in3.register.glwe] + description="Address offset for each GLWE HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + + +[section.hbm_axi4_addr_1in3.register.ksk] + description="Address offset for each KSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"] + + [section.hbm_axi4_addr_1in3.register.trc] + description="Address offset for each trace HBM AXI4 connection" + owner="User" + read_access="Read" + 
write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + +# ===================================================================================================================== +[section.bpip] +offset= 0x2000 +description="BPIP configuration" + +[section.bpip.register.use] + description="(1) Use BPIP mode, (0) use IPIP mode (default)" + owner="User" + read_access="Read" + write_access="Write" + field.use_bpip = { size_b=1, offset_b=0 , default={Cst=1}, description="use"} + field.use_opportunism = { size_b=1, offset_b=1 , default={Cst=0}, description="use opportunistic PBS flush"} + +[section.bpip.register.timeout] + description="Timeout for BPIP mode" + owner="User" + read_access="Read" + write_access="Write" + default={Cst=0xffffffff} diff --git a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_3in3.toml b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_3in3.toml new file mode 100644 index 000000000..4afc095ab --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_cfg_3in3.toml @@ -0,0 +1,51 @@ +module_name="hpu_regif_core_cfg_3in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." +word_size_b = 32 +offset = 0x20000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_cfg_3in3] +description="entry_cfg_3in3 section with known value used for debug." 
+offset= 0x0 + +[section.entry_cfg_3in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x03030303} + +[section.entry_cfg_3in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x13131313} + +[section.entry_cfg_3in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x23232323} + +[section.entry_cfg_3in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x33333333} + +# ===================================================================================================================== +[section.hbm_axi4_addr_3in3] +description="HBM AXI4 connection address offset" +offset= 0x10 + +[section.hbm_axi4_addr_3in3.register.bsk] + description="Address offset for each BSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"] diff --git a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_1in3.toml b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_1in3.toml new file mode 100644 index 000000000..ef20175f8 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_1in3.toml @@ -0,0 +1,336 @@ +module_name="hpu_regif_core_prc_1in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." 
+word_size_b = 32 +offset = 0x10000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_prc_1in3] +description="entry_prc_1in3 section with known value used for debug." +offset= 0x0 + +[section.entry_prc_1in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x02020202} + +[section.entry_prc_1in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x12121212} + +[section.entry_prc_1in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x22222222} + +[section.entry_prc_1in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x32323232} + +# ===================================================================================================================== +[section.status_1in3] +description="HPU status of part 1in3" +offset= 0x10 + +[section.status_1in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 1in3"} + +# ===================================================================================================================== +[section.ksk_avail] +description="KSK availability configuration" +offset= 0x1000 + +[section.ksk_avail.register.avail] + description="KSK available bit" + owner="User" + read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.ksk_avail.register.reset] + description="KSK reset sequence" + owner="Kernel" + read_access="Read" + 
write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.runtime_1in3] +description="Runtime information" +offset= 0x2000 + +[section.runtime_1in3.register.pep_cmux_loop] + description="PEP: CMUX iteration loop number" + owner="Kernel" + read_access="Read" + write_access="None" + field.br_loop = { size_b=15, offset_b=0 , default={Cst=0}, description="PBS current BR-loop"} + field.br_loop_c = { size_b=1, offset_b=15 , default={Cst=0}, description="PBS current BR-loop parity"} + field.ks_loop = { size_b=15, offset_b=16 , default={Cst=0}, description="KS current KS-loop"} + field.ks_loop_c = { size_b=1, offset_b=31 , default={Cst=0}, description="KS current KS-loop parity"} + +[section.runtime_1in3.register.pep_pointer_0] + description="PEP: pointers (part 1)" + owner="Kernel" + read_access="Read" + write_access="None" + field.pool_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pool_rp"} + field.pool_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pool_wp"} + field.ldg_pt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ldg_pt"} + field.ldb_pt = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ldb_pt"} + +[section.runtime_1in3.register.pep_pointer_1] + description="PEP: pointers (part 2)" + owner="Kernel" + read_access="Read" + write_access="None" + field.ks_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP ks_in_rp"} + field.ks_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP ks_in_wp"} + field.ks_out_rp = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ks_out_rp"} + field.ks_out_wp = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ks_out_wp"} + +[section.runtime_1in3.register.pep_pointer_2] + 
description="PEP: pointers (part 3)" + owner="Kernel" + read_access="Read" + write_access="None" + field.pbs_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pbs_in_rp"} + field.pbs_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pbs_in_wp"} + field.ipip_flush_last_pbs_in_loop = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP IPIP flush last pbs_in_loop"} + +[section.runtime_1in3.register.isc_latest_instruction] + description="ISC: 4 latest instructions received ([0] is the most recent)" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_0","_1","_2","_3"] + +[section.runtime_1in3.register.pep_seq_bpip_batch_cnt] + description="PEP: BPIP batch counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_flush_cnt] + description="PEP: BPIP batch triggered by a flush counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_timeout_cnt] + description="PEP: BPIP batch triggered by a timeout counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_waiting_batch_cnt] + description="PEP: BPIP batch that waits the trigger counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_filling_cnt] + description="PEP: Count batches filled with a given number of CT (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_1","_2","_3","_4","_5","_6","_7","_8","_9","_10","_11","_12","_13","_14","_15","_16"] + +[section.runtime_1in3.register.pep_seq_ld_ack_cnt] + description="PEP: load BLWE ack counter (Could be reset by user)" + owner="Kernel" + read_access="Read" +
write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_cmux_not_full_batch_cnt] + description="PEP: not full batch CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_ipip_flush_cnt] + description="PEP: IPIP flush CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldb_rcp_dur] + description="PEP: load BLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_req_dur] + description="PEP: load GLWE request max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_rcp_dur] + description="PEP: load GLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_load_ksk_rcp_dur] + description="PEP: load KSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + + +[section.runtime_1in3.register.pep_mmacc_sxt_rcp_dur] + description="PEP: MMACC SXT reception duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_req_dur] + description="PEP: MMACC SXT request duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_cmd_wait_b_dur] + description="PEP: MMACC SXT command wait for b duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + 
+[section.runtime_1in3.register.pep_inst_cnt] + description="PEP: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ack_cnt] + description="PEP: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_inst_cnt] + description="PEM: load input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_ack_cnt] + description="PEM: load instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_inst_cnt] + description="PEM: store input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_ack_cnt] + description="PEM: store instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_inst_cnt] + description="PEA: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_ack_cnt] + description="PEA: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_inst_cnt] + description="ISC: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_ack_cnt] + description="ISC: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_info_0] + 
description="PEM: load first data" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_0","_pc0_1","_pc0_2","_pc0_3","_pc1_0","_pc1_1","_pc1_2","_pc1_3"] + +[section.runtime_1in3.register.pem_load_info_1] + description="PEM: load first address" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_lsb","_pc0_msb","_pc1_lsb","_pc1_msb"] + +[section.runtime_1in3.register.pem_store_info_0] + description="PEM: store info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.cmd_vld = { size_b=1, offset_b=0 , default={Cst=0}, description="PEM_ST cmd vld"} + field.cmd_rdy = { size_b=1, offset_b=1 , default={Cst=0}, description="PEM_ST cmd rdy"} + field.pem_regf_rd_req_vld = { size_b=1, offset_b=2 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_vld"} + field.pem_regf_rd_req_rdy = { size_b=1, offset_b=3 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_rdy"} + field.brsp_fifo_in_vld = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST brsp_fifo_in_vld"} + field.brsp_fifo_in_rdy = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST brsp_fifo_in_rdy"} + field.rcp_fifo_in_vld = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST rcp_fifo_in_vld"} + field.rcp_fifo_in_rdy = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST rcp_fifo_in_rdy"} + field.r2_axi_vld = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST r2_axi_vld"} + field.r2_axi_rdy = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST r2_axi_rdy"} + field.c0_enough_location = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST c0_enough_location"} + +[section.runtime_1in3.register.pem_store_info_1] + description="PEM: store info 1" + owner="Kernel" + read_access="Read" + write_access="None" + field.s0_cmd_vld = { size_b=4, offset_b=0 , default={Cst=0}, description="PEM_ST s0_cmd_vld"} + field.s0_cmd_rdy = { size_b=4, offset_b=4 , default={Cst=0},
description="PEM_ST s0_cmd_rdy"} + field.m_axi_bvalid = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST m_axi_bvalid"} + field.m_axi_bready = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST m_axi_bready"} + field.m_axi_wvalid = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST m_axi_wvalid"} + field.m_axi_wready = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST m_axi_wready"} + field.m_axi_awvalid = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST m_axi_awvalid"} + field.m_axi_awready = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST m_axi_awready"} + +[section.runtime_1in3.register.pem_store_info_2] + description="PEM: store info 2" + owner="Kernel" + read_access="Read" + write_access="None" + field.c0_free_loc_cnt = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST c0_free_loc_cnt"} + field.brsp_bresp_cnt = { size_b=16, offset_b=16 , default={Cst=0}, description="PEM_ST brsp_bresp_cnt"} + +[section.runtime_1in3.register.pem_store_info_3] + description="PEM: store info 3" + owner="Kernel" + read_access="Read" + write_access="None" + field.brsp_ack_seen = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST brsp_ack_seen"} + field.c0_cmd_cnt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEM_ST c0_cmd_cnt"} diff --git a/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_3in3.toml b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_3in3.toml new file mode 100644 index 000000000..627f140c1 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/hpu_regif_core_prc_3in3.toml @@ -0,0 +1,100 @@ +module_name="hpu_regif_core_prc_3in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." 
+word_size_b = 32 +offset = 0x30000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_prc_3in3] +description="entry_prc_3in3 section with known value used for debug." +offset= 0x0 + +[section.entry_prc_3in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x04040404} + +[section.entry_prc_3in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x14141414} + +[section.entry_prc_3in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x24242424} + +[section.entry_prc_3in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x34343434} + +# ===================================================================================================================== +[section.status_3in3] +description="HPU status of parts 2in3 and 3in3" +offset= 0x10 + +[section.status_3in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 3in3"} + +# ===================================================================================================================== +[section.bsk_avail] +description="BSK availability configuration" +offset= 0x1000 + +[section.bsk_avail.register.avail] + description="BSK available bit" + owner="User" + read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.bsk_avail.register.reset] + description="BSK reset sequence" + owner="Kernel" + 
read_access="Read" + write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.runtime_3in3] +description="Runtime information" +offset= 0x2000 + +[section.runtime_3in3.register.pep_load_bsk_rcp_dur] + description="PEP: load BSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + +[section.runtime_3in3.register.pep_bskif_req_info_0] + description="PEP: BSK_IF: requester info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_br_loop_rp = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK read pointer"} + field.req_br_loop_wp = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK write pointer"} + +[section.runtime_3in3.register.pep_bskif_req_info_1] + description="PEP: BSK_IF: requester info 1" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_prf_br_loop = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK prefetch pointer"} + field.req_parity = { size_b=1, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK pointer parity"} + field.req_assigned = { size_b=1, offset_b=31 , default={Cst=0}, description="PEP BSK_IF requester assignment"} diff --git a/backends/tfhe-hpu-backend/config_store/sim/tb_hpu_regif_dummy.toml b/backends/tfhe-hpu-backend/config_store/sim/tb_hpu_regif_dummy.toml new file mode 100644 index 000000000..2777aaf78 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/sim/tb_hpu_regif_dummy.toml @@ -0,0 +1,22 @@
+module_name="tb_hpu_regif_dummy" +description="Fake registers needed by the mockup" +word_size_b = 32 +offset = 0x40000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ============================================================================== +[section.WorkAck] +description="Purpose of this section" + +[section.WorkAck.register.workq] + description="Insert work in workq and read status" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.WorkAck.register.ackq] + description="Pop ack from in ackq" + owner="Kernel" + read_access="ReadNotify" + write_access="None" diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/Readme.md b/backends/tfhe-hpu-backend/config_store/u55c_gf64/Readme.md new file mode 100644 index 000000000..2fbe5df9f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/Readme.md @@ -0,0 +1,6 @@ +# Fpga version + +Built with the following command: (i.e. xrt/run_syn_hpu_msplit_3parts_64b.sh) +``` +just zaxl-build hpu_msplit_3parts 3 "0:300" "-F TOP_MSPLIT TOP_MSPLIT_1 -F TOP_BATCH TOP_BATCH_TOPhpu_BPBS8_TPBS32 -F TOP_PCMAX TOP_PCMAX_pem2_glwe1_bsk8_ksk8 -F TOP_PC TOP_PC_pem2_glwe1_bsk4_ksk4 -F APPLICATION APPLI_msg2_carry2 -F NTT_MOD NTT_MOD_goldilocks -F NTT_CORE_ARCH NTT_CORE_ARCH_gf64 -F NTT_CORE_R_PSI NTT_CORE_R2_PSI16 -F NTT_CORE_RDX_CUT NTT_CORE_RDX_CUT_n5c5c1 -F NTT_CORE_DIV NTT_CORE_DIV_1 -F BSK_SLOT_CUT BSK_SLOT8_CUT4 -F KSK_SLOT_CUT KSK_SLOT8_CUT4 -F KSLB KSLB_x2y32z3 -F HPU_PART HPU_PART_gf64 -F AXI_DATA_W AXI_DATA_W_512" "1:${PROJECT_DIR}/hw/output/micro_code/ucore_fw.elf" 'D:MEMORY_FILE_PATH=\\\"${PROJECT_DIR}/hw/\\\"' | tee build_out.log +``` diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_0.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_0.asm new file mode 100644 index 000000000..838beed9e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_0.asm @@ -0,0 +1,15 
@@ +# CUST_0 +# Simple IOp to check the xfer between Hpu/Cpu +# Construct constant in dest slot -> 249 (0xf9) +SUB R0 R0 R0 +ADDS R0 R0 1 +ST TD[0].0 R0 +SUB R1 R1 R1 +ADDS R1 R1 2 +ST TD[0].1 R1 +SUB R2 R2 R2 +ADDS R2 R2 3 +ST TD[0].2 R2 +SUB R3 R3 R3 +ADDS R3 R3 3 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_1.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_1.asm new file mode 100644 index 000000000..3679e2c5f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_1.asm @@ -0,0 +1,11 @@ +# CUST_1 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_a +LD R0 TS[0].0 +LD R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_10.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_10.asm new file mode 100644 index 000000000..f591d66b3 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_10.asm @@ -0,0 +1,25 @@ +; CUST_10 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 + Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +ADD R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- Src[0].2 + Src[1].2 +LD R9 TS[0].2 +LD R10 TS[1].2 +ADD R8 R9 R10 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 + Src[1].3 +LD R13 TS[0].3 +LD R14 TS[1].3 +ADD R12 R13 R14 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_16.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_16.asm new file mode 100644 index 000000000..0b4cfe80f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_16.asm @@ -0,0 +1,6 @@ +# CUST_16 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a.0) +LD R0 TS[0].0 +PBS_F R0 R0 PbsNone +ST TD[0].0 R0 diff --git
a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_17.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_17.asm new file mode 100644 index 000000000..bdb6711a7 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_17.asm @@ -0,0 +1,15 @@ +# CUST_17 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a) +LD R0 TS[0].0 +PBS R0 R0 PbsNone +ST TD[0].0 R0 +LD R1 TS[0].1 +PBS R1 R1 PbsNone +ST TD[0].1 R1 +LD R2 TS[0].2 +PBS R2 R2 PbsNone +ST TD[0].2 R2 +LD R3 TS[0].3 +PBS_F R3 R3 PbsNone +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_18.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_18.asm new file mode 100644 index 000000000..c4b9a46a0 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_18.asm @@ -0,0 +1,23 @@ +; CUST_18 +; Simple IOp to check extraction pattern +; Correct result: +; * Dst[0,1] <- Src[0][0,1] +; * Dst[2,3] <- Src[1][0,1] + +; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1] +LD R0 TS[0].0 +LD R1 TS[0].1 +MAC R3 R1 R0 4 +PBS R4 R3 PbsMsgOnly +PBS R5 R3 PbsCarryInMsg +ST TD[0].0 R4 +ST TD[0].1 R5 + +; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3] +LD R10 TS[1].0 +LD R11 TS[1].1 +MAC R13 R11 R10 4 +PBS R14 R13 PbsMsgOnly +PBS R15 R13 PbsCarryInMsg +ST TD[0].2 R14 +ST TD[0].3 R15 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_19.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_19.asm new file mode 100644 index 000000000..0974347fa --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_19.asm @@ -0,0 +1,19 @@ +; CUST_19 +; Simple IOp to check PbsMl2 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- 0 +; * Dst[0][2] <- Src[0][0] +1 +; * Dst[0][3] <- 0 +; i.e Cust_19(0x2) => 0x32 + +; Construct a 0 for destination padding +SUB R16 R16 R16 + +; Apply PbsMl2 
on Src[0] result goes in dest[0][0-3] (0-padded) +LD R0 TS[0].0 +PBS_ML2_F R0 R0 PbsTestMany2 +ST TD[0].0 R0 +ST TD[0].1 R16 +ST TD[0].2 R1 +ST TD[0].3 R16 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_2.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_2.asm new file mode 100644 index 000000000..bc8e0175e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_2.asm @@ -0,0 +1,11 @@ +# CUST_2 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_b +LD R0 TS[1].0 +LD R1 TS[1].1 +LD R2 TS[1].2 +LD R3 TS[1].3 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_20.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_20.asm new file mode 100644 index 000000000..5f29f8ee5 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_20.asm @@ -0,0 +1,22 @@ +; CUST_20 +; Simple IOp to check PbsMl4 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; i.e Cust_20(0x0) => 0xe4 + +SUB R16 R16 R16 +ST TD[0].0 R0 +ST TD[0].1 R0 +ST TD[0].2 R0 +ST TD[0].3 R0 + +; Apply PbsMl4 on Src[0] result goes in dest[0][0-3] +LD R0 TS[0].0 +PBS_ML4_F R0 R0 PbsTestMany4 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_21.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_21.asm new file mode 100644 index 000000000..5a601bbe6 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_21.asm @@ -0,0 +1,24 @@ +; CUST_21 +; Simple IOp to check PbsMl8 +; WARN: This operation required 16b ct width +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; * Dst[0][4] <- Src[0][0] 
+4 +; * Dst[0][5] <- Src[0][0] +5 +; * Dst[0][6] <- Src[0][0] +6 +; * Dst[0][7] <- Src[0][0] +7 + +; Apply PbsMl8 on Src[0] result goes in dest[0][0-7] +LD R0 TS[0].0 +PBS_ML8_F R0 R0 PbsTestMany8 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 +ST TD[0].4 R4 +ST TD[0].5 R5 +ST TD[0].6 R6 +ST TD[0].7 R7 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_3.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_3.asm new file mode 100644 index 000000000..d13ca243c --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_3.asm @@ -0,0 +1,16 @@ +# CUST_3 +# Simple IOp to check isc behavior +# Generate obvious deps and check that isc correctly issued the dop +# Correct result must be Dest <- Src[0] +LD R0 TS[0].0 +LD R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +PBS R4 R0 PbsNone +ST TD[0].0 R4 +PBS R4 R1 PbsNone +ST TD[0].1 R4 +PBS R4 R2 PbsNone +ST TD[0].2 R4 +PBS_F R4 R3 PbsNone +ST TD[0].3 R4 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_8.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_8.asm new file mode 100644 index 000000000..c02eee9cd --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_8.asm @@ -0,0 +1,19 @@ +; CUST_8 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +SUB R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- Src[0].2 + (Src[1].2 *4) +LD R9 TS[0].2 +LD R10 TS[1].2 +MAC R8 R9 R10 4 +ST TD[0].2 R8 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_9.asm b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_9.asm new file mode 100644 index 000000000..5e5cc4129 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/custom_iop/cust_9.asm @@ -0,0 +1,21 @@ +; CUST_9 +; Simple IOp to check the ALU
Scalar operation +; Dst[0].0 <- Src[0].0 + Imm[0].0 +LD R1 TS[0].0 +ADDS R0 R1 TI[0].0 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Imm[0].1 +LD R5 TS[0].1 +SUBS R4 R5 TI[0].1 +ST TD[0].1 R4 + +; Dst[0].2 <- Imm[0].2 - Src[0].2 +LD R9 TS[0].2 +SSUB R8 R9 TI[0].2 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 * Imm[0].3 +LD R13 TS[0].3 +MULS R12 R13 TI[0].3 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_config.toml b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_config.toml new file mode 100644 index 000000000..b5d460a1a --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_config.toml @@ -0,0 +1,98 @@ + +[fpga] + regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core.toml"] + polling_us=10 +[fpga.ffi.Xrt] + id= 0 + kernel= "hpu_msplit_3parts_1in3" + xclbin="${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_msplit_3parts.xclbin" + +[rtl] + bpip_use = true + bpip_use_opportunism = true + bpip_timeout = 100_000 + +[board] + ct_mem = 4096 + ct_pc = [ + {Hbm= {pc=10}}, + {Hbm= {pc=11}}, + ] + heap_size = 3584 + + lut_mem = 256 + lut_pc = {Hbm={pc=12}} + + fw_size= 65536 + fw_pc = {Hbm={pc=1}} + + bsk_pc = [ + {Hbm={pc=2}}, + {Hbm={pc=3}}, + {Hbm={pc=4}}, + {Hbm={pc=5}}, + {Hbm={pc=6}}, + {Hbm={pc=7}}, + {Hbm={pc=8}}, + {Hbm={pc=9}} + ] + + ksk_pc = [ + {Hbm={pc=24}}, + {Hbm={pc=25}}, + {Hbm={pc=26}}, + {Hbm={pc=27}}, + {Hbm={pc=28}}, + {Hbm={pc=29}}, + {Hbm={pc=30}}, + {Hbm={pc=31}} + ] + + trace_pc = {Hbm={pc=0}} + trace_depth = 4 # In MB + +[firmware] + implementation = "Llt" + integer_w=[4,6,8,10,12,14,16,32,64,128] + min_batch_size = 6 + kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml" + custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm" + custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm" + custom_iop.'IOP[2]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm" + 
custom_iop.'IOP[3]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm" + custom_iop.'IOP[8]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm" + custom_iop.'IOP[9]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm" + custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm" + custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm" + custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm" + custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm" + custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm" + custom_iop.'IOP[21]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm" + +[firmware.op_cfg.default] + fill_batch_fifo = true + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MUL] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MULS] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.ERC_20] + fill_batch_fifo = false + min_batch_size = true + use_tiers = true + flush_behaviour = "Patient" + flush = true diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin new file mode 100644 index 000000000..604d76461 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ad67cf9760e37256a6c92cf29ea67334690b724fd3b7b859919ee9b0bde6d3 +size 78194785 diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.info 
b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.info new file mode 100644 index 000000000..cbaa8cba5 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.info @@ -0,0 +1,1550 @@ + +============================================================================== +XRT Build Version: 2.16.0 (Vitis) + Build Date: 2023-07-13 16:00:55 + Hash ID: 157faa07876c55bb8aa8ec51b28608a6a0f6638e +============================================================================== +xclbin Information +------------------ + Generated by: v++ (2024.1) on 2024-05-20-23:21:20 + Version: 2.16.0 + Kernels: hpu_msplit_3parts_3in3, hpu_msplit_3parts_2in3, hpu_msplit_3parts_1in3 + Signature: + Content: Bitstream + UUID (xclbin): d0bcc1ed-d380-c9cf-175f-c059794490d4 + UUID (IINTF): b7ac1abe1e3e1cb686d5a81232452676 + Sections: BITSTREAM, MEM_TOPOLOGY, IP_LAYOUT, CONNECTIVITY, + CLOCK_FREQ_TOPOLOGY, BUILD_METADATA, + EMBEDDED_METADATA, SYSTEM_METADATA, + PARTITION_METADATA, GROUP_CONNECTIVITY, GROUP_TOPOLOGY +============================================================================== +Hardware Platform (Shell) Information +------------------------------------- + Vendor: xilinx + Board: u55c + Name: gen3x16_xdma_3 + Version: 202210.1 + Generated Version: Vivado 2022.1 (SW Build: 3513633) + Created: + Fri Apr 1 11:16:28 2022 FPGA Device: xcu55c + Board Vendor: xilinx.com + Board Name: xilinx.com:au55c:1.0 + Board Part: xilinx.com:au55c:part0:1.0 + Platform VBNV: xilinx_u55c_gen3x16_xdma_3_202210_1 + Static UUID: b7ac1abe-1e3e-1cb6-86d5-a81232452676 + Feature ROM TimeStamp: 0 + +Scalable Clocks +--------------- + Name: hbm_aclk + Index: 0 + Type: SYSTEM + Frequency: 450 MHz + + Name: KERNEL_CLK + Index: 1 + Type: KERNEL + Frequency: 500 MHz + + Name: DATA_CLK + Index: 2 + Type: DATA + Frequency: 300 MHz + +System Clocks +------ + Name: ulp_ucs_aclk_kernel_00 + Type: SCALABLE + Default Freq: 300 MHz + Requested Freq: 300 MHz + Achieved 
Freq: 300 MHz + + Name: ulp_ucs_aclk_kernel_01 + Type: SCALABLE + Default Freq: 500 MHz + Requested Freq: 500 MHz + Achieved Freq: 500 MHz + + Name: _bd_top_blp_s_aclk_freerun_ref_00 + Type: FIXED + Default Freq: 100 MHz + +Memory Configuration +-------------------- + Name: HBM[0] + Index: 0 + Type: MEM_HBM + Base Address: 0x0 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[1] + Index: 1 + Type: MEM_DRAM + Base Address: 0x20000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[2] + Index: 2 + Type: MEM_DRAM + Base Address: 0x40000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[3] + Index: 3 + Type: MEM_DRAM + Base Address: 0x60000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[4] + Index: 4 + Type: MEM_DRAM + Base Address: 0x80000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[5] + Index: 5 + Type: MEM_DRAM + Base Address: 0xa0000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[6] + Index: 6 + Type: MEM_DRAM + Base Address: 0xc0000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[7] + Index: 7 + Type: MEM_DRAM + Base Address: 0xe0000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[8] + Index: 8 + Type: MEM_DRAM + Base Address: 0x100000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[9] + Index: 9 + Type: MEM_DRAM + Base Address: 0x120000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[10] + Index: 10 + Type: MEM_DRAM + Base Address: 0x140000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[11] + Index: 11 + Type: MEM_DRAM + Base Address: 0x160000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[12] + Index: 12 + Type: MEM_DRAM + Base Address: 0x180000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[13] + Index: 13 + Type: MEM_DRAM + Base Address: 0x1a0000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[14] + Index: 14 + Type: MEM_DRAM + Base Address: 0x1c0000000 + Address Size: 0x20000000 + 
Bank Used: No + + Name: HBM[15] + Index: 15 + Type: MEM_DRAM + Base Address: 0x1e0000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[16] + Index: 16 + Type: MEM_DRAM + Base Address: 0x200000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[17] + Index: 17 + Type: MEM_DRAM + Base Address: 0x220000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[18] + Index: 18 + Type: MEM_DRAM + Base Address: 0x240000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[19] + Index: 19 + Type: MEM_DRAM + Base Address: 0x260000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[20] + Index: 20 + Type: MEM_DRAM + Base Address: 0x280000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[21] + Index: 21 + Type: MEM_DRAM + Base Address: 0x2a0000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[22] + Index: 22 + Type: MEM_DRAM + Base Address: 0x2c0000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[23] + Index: 23 + Type: MEM_DRAM + Base Address: 0x2e0000000 + Address Size: 0x20000000 + Bank Used: No + + Name: HBM[24] + Index: 24 + Type: MEM_DRAM + Base Address: 0x300000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[25] + Index: 25 + Type: MEM_DRAM + Base Address: 0x320000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[26] + Index: 26 + Type: MEM_DRAM + Base Address: 0x340000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[27] + Index: 27 + Type: MEM_DRAM + Base Address: 0x360000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[28] + Index: 28 + Type: MEM_DRAM + Base Address: 0x380000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[29] + Index: 29 + Type: MEM_DRAM + Base Address: 0x3a0000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[30] + Index: 30 + Type: MEM_DRAM + Base Address: 0x3c0000000 + Address Size: 0x20000000 + Bank Used: Yes + + Name: HBM[31] + Index: 31 + Type: MEM_DRAM + Base Address: 0x3e0000000 + Address Size: 
0x20000000 + Bank Used: Yes + + Name: PLRAM[0] + Index: 32 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: PLRAM[1] + Index: 33 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: PLRAM[2] + Index: 34 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: PLRAM[3] + Index: 35 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: PLRAM[4] + Index: 36 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: PLRAM[5] + Index: 37 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: HOST[0] + Index: 38 + Type: MEM_DRAM + Base Address: 0x0 + Address Size: 0x0 + Bank Used: No + + Name: dc_0 + Index: 39 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_1 + Index: 40 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_2 + Index: 41 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_3 + Index: 42 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_4 + Index: 43 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_5 + Index: 44 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_6 + Index: 45 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_7 + Index: 46 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_8 + Index: 47 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_9 + Index: 48 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_10 + Index: 49 + Type: MEM_STREAMING_CONNECTION + Base 
Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_11 + Index: 50 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_12 + Index: 51 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_13 + Index: 52 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_14 + Index: 53 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_15 + Index: 54 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_16 + Index: 55 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_17 + Index: 56 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_18 + Index: 57 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_19 + Index: 58 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_20 + Index: 59 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_21 + Index: 60 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_22 + Index: 61 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_23 + Index: 62 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_24 + Index: 63 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_25 + Index: 64 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_26 + Index: 65 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_27 + Index: 66 + Type: MEM_STREAMING_CONNECTION 
+ Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes + + Name: dc_28 + Index: 67 + Type: MEM_STREAMING_CONNECTION + Base Address: 0x0 + Address Size: 0x0 + Bank Used: Yes +============================================================================== +Kernel: hpu_msplit_3parts_3in3 + +Definition +---------- + Signature: hpu_msplit_3parts_3in3 (void* M_AXI_BSK_0_PTR, void* M_AXI_BSK_1_PTR, void* M_AXI_BSK_2_PTR, void* M_AXI_BSK_3_PTR, void* M_AXI_BSK_4_PTR, void* M_AXI_BSK_5_PTR, void* M_AXI_BSK_6_PTR, void* M_AXI_BSK_7_PTR, void* axis_p2_p3_batch, void* axis_p2_p3_bsk_c, void* axis_p2_p3_proc_c, void* axis_p2_p3_proc_d0, void* axis_p2_p3_proc_d1, void* axis_p2_p3_proc_d2, void* axis_p2_p3_proc_d3, void* axis_p3_p2_bsk_c, void* axis_p3_p2_proc_c, void* axis_p3_p2_proc_d0, void* axis_p3_p2_proc_d1, void* axis_p3_p2_proc_d2, void* axis_p3_p2_proc_d3, void* axis_p3_p2_side) + +Ports +----- + Port: axis_p2_p3_batch + Mode: read_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p2_p3_bsk_c + Mode: read_only + Range (bytes): + Data Width: 520 bits + Port Type: stream + + Port: axis_p2_p3_proc_c + Mode: read_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p3_proc_d0 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d1 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d2 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d3 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_bsk_c + Mode: write_only + Range (bytes): + Data Width: 8 bits + Port Type: stream + + Port: axis_p3_p2_proc_c + Mode: write_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p3_p2_proc_d0 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d1 + 
Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d2 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d3 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_side + Mode: write_only + Range (bytes): + Data Width: 256 bits + Port Type: stream + + Port: m_axi_bsk_0 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_1 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_2 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_3 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_4 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_5 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_6 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_bsk_7 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: s_axil + Mode: slave + Range (bytes): 0x10000 + Data Width: 32 bits + Port Type: addressable + +-------------------------- +Instance: hpu_msplit_3parts_3in3_1 + Base Address: 0x1000000 + + Argument: M_AXI_BSK_0_PTR + Register Offset: 0x10 + Port: m_axi_bsk_0 + Memory: HBM[2] (MEM_DRAM) + + Argument: M_AXI_BSK_1_PTR + Register Offset: 0x14 + Port: m_axi_bsk_1 + Memory: HBM[3] (MEM_DRAM) + + Argument: M_AXI_BSK_2_PTR + Register Offset: 0x18 + Port: m_axi_bsk_2 + Memory: HBM[4] (MEM_DRAM) + + Argument: M_AXI_BSK_3_PTR + Register Offset: 0x1c + Port: m_axi_bsk_3 + Memory: HBM[5] (MEM_DRAM) + + Argument: M_AXI_BSK_4_PTR + Register Offset: 0x20 + Port: m_axi_bsk_4 + Memory: HBM[6] 
(MEM_DRAM) + + Argument: M_AXI_BSK_5_PTR + Register Offset: 0x24 + Port: m_axi_bsk_5 + Memory: HBM[7] (MEM_DRAM) + + Argument: M_AXI_BSK_6_PTR + Register Offset: 0x28 + Port: m_axi_bsk_6 + Memory: HBM[8] (MEM_DRAM) + + Argument: M_AXI_BSK_7_PTR + Register Offset: 0x2c + Port: m_axi_bsk_7 + Memory: HBM[9] (MEM_DRAM) + + Argument: axis_p2_p3_batch + Register Offset: 0x0 + Port: axis_p2_p3_batch + Memory: dc_15 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_bsk_c + Register Offset: 0x0 + Port: axis_p2_p3_bsk_c + Memory: dc_16 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_c + Register Offset: 0x0 + Port: axis_p2_p3_proc_c + Memory: dc_17 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d0 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d0 + Memory: dc_18 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d1 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d1 + Memory: dc_19 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d2 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d2 + Memory: dc_20 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d3 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d3 + Memory: dc_21 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_bsk_c + Register Offset: 0x0 + Port: axis_p3_p2_bsk_c + Memory: dc_22 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_c + Register Offset: 0x0 + Port: axis_p3_p2_proc_c + Memory: dc_23 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d0 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d0 + Memory: dc_24 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d1 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d1 + Memory: dc_25 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d2 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d2 + Memory: dc_26 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d3 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d3 + Memory: dc_27 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_side + Register Offset: 0x0 + Port: 
axis_p3_p2_side + Memory: dc_28 (MEM_STREAMING_CONNECTION) +Kernel: hpu_msplit_3parts_2in3 + +Definition +---------- + Signature: hpu_msplit_3parts_2in3 (void* axis_p1_p2_batch, void* axis_p1_p2_bsk_c, void* axis_p1_p2_ldg_c, void* axis_p1_p2_ldg_d, void* axis_p1_p2_mmacc_side, void* axis_p1_p2_mmfeed_c, void* axis_p1_p2_mmfeed_d0, void* axis_p1_p2_mmfeed_d1, void* axis_p1_p2_mmsxt_c, void* axis_p2_p1_bsk_c, void* axis_p2_p1_mmacc_c, void* axis_p2_p1_mmacc_d0, void* axis_p2_p1_mmacc_side, void* axis_p2_p1_mmsxt_d0, void* axis_p2_p1_side, void* axis_p2_p3_batch, void* axis_p2_p3_bsk_c, void* axis_p2_p3_proc_c, void* axis_p2_p3_proc_d0, void* axis_p2_p3_proc_d1, void* axis_p2_p3_proc_d2, void* axis_p2_p3_proc_d3, void* axis_p3_p2_bsk_c, void* axis_p3_p2_proc_c, void* axis_p3_p2_proc_d0, void* axis_p3_p2_proc_d1, void* axis_p3_p2_proc_d2, void* axis_p3_p2_proc_d3, void* axis_p3_p2_side) + +Ports +----- + Port: axis_p1_p2_batch + Mode: read_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p1_p2_bsk_c + Mode: read_only + Range (bytes): + Data Width: 520 bits + Port Type: stream + + Port: axis_p1_p2_ldg_c + Mode: read_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p1_p2_ldg_d + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmacc_side + Mode: read_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_c + Mode: read_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_d0 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_d1 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmsxt_c + Mode: read_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p2_p1_bsk_c + Mode: write_only + Range (bytes): + Data Width: 8 bits + Port Type: stream + + Port: 
axis_p2_p1_mmacc_c + Mode: write_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p1_mmacc_d0 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p1_mmacc_side + Mode: write_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p1_mmsxt_d0 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p1_side + Mode: write_only + Range (bytes): + Data Width: 256 bits + Port Type: stream + + Port: axis_p2_p3_batch + Mode: write_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p2_p3_bsk_c + Mode: write_only + Range (bytes): + Data Width: 520 bits + Port Type: stream + + Port: axis_p2_p3_proc_c + Mode: write_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p3_proc_d0 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d1 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d2 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p3_proc_d3 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_bsk_c + Mode: read_only + Range (bytes): + Data Width: 8 bits + Port Type: stream + + Port: axis_p3_p2_proc_c + Mode: read_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p3_p2_proc_d0 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d1 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d2 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_proc_d3 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p3_p2_side + Mode: read_only + Range (bytes): + Data Width: 
256 bits + Port Type: stream + +-------------------------- +Instance: hpu_msplit_3parts_2in3_1 + Base Address: not_used + + Argument: axis_p1_p2_batch + Register Offset: 0x0 + Port: axis_p1_p2_batch + Memory: dc_0 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_bsk_c + Register Offset: 0x0 + Port: axis_p1_p2_bsk_c + Memory: dc_1 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_ldg_c + Register Offset: 0x0 + Port: axis_p1_p2_ldg_c + Memory: dc_2 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_ldg_d + Register Offset: 0x0 + Port: axis_p1_p2_ldg_d + Memory: dc_3 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmacc_side + Register Offset: 0x0 + Port: axis_p1_p2_mmacc_side + Memory: dc_4 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_c + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_c + Memory: dc_5 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_d0 + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_d0 + Memory: dc_6 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_d1 + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_d1 + Memory: dc_7 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmsxt_c + Register Offset: 0x0 + Port: axis_p1_p2_mmsxt_c + Memory: dc_8 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_bsk_c + Register Offset: 0x0 + Port: axis_p2_p1_bsk_c + Memory: dc_9 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_c + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_c + Memory: dc_10 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_d0 + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_d0 + Memory: dc_11 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_side + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_side + Memory: dc_12 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmsxt_d0 + Register Offset: 0x0 + Port: axis_p2_p1_mmsxt_d0 + Memory: dc_13 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_side + Register Offset: 0x0 + Port: axis_p2_p1_side + Memory: dc_14 (MEM_STREAMING_CONNECTION) 
+ + Argument: axis_p2_p3_batch + Register Offset: 0x0 + Port: axis_p2_p3_batch + Memory: dc_15 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_bsk_c + Register Offset: 0x0 + Port: axis_p2_p3_bsk_c + Memory: dc_16 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_c + Register Offset: 0x0 + Port: axis_p2_p3_proc_c + Memory: dc_17 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d0 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d0 + Memory: dc_18 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d1 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d1 + Memory: dc_19 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d2 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d2 + Memory: dc_20 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p3_proc_d3 + Register Offset: 0x0 + Port: axis_p2_p3_proc_d3 + Memory: dc_21 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_bsk_c + Register Offset: 0x0 + Port: axis_p3_p2_bsk_c + Memory: dc_22 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_c + Register Offset: 0x0 + Port: axis_p3_p2_proc_c + Memory: dc_23 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d0 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d0 + Memory: dc_24 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d1 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d1 + Memory: dc_25 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d2 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d2 + Memory: dc_26 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_proc_d3 + Register Offset: 0x0 + Port: axis_p3_p2_proc_d3 + Memory: dc_27 (MEM_STREAMING_CONNECTION) + + Argument: axis_p3_p2_side + Register Offset: 0x0 + Port: axis_p3_p2_side + Memory: dc_28 (MEM_STREAMING_CONNECTION) +Kernel: hpu_msplit_3parts_1in3 + +Definition +---------- + Signature: hpu_msplit_3parts_1in3 (void* M_AXI_UCORE_PTR, void* M_AXI_TRC_PTR, void* M_AXI_PEM_0_PTR, void* M_AXI_PEM_1_PTR, void* M_AXI_GLWE_0_PTR, void* M_AXI_KSK_0_PTR, void* M_AXI_KSK_1_PTR, 
void* M_AXI_KSK_2_PTR, void* M_AXI_KSK_3_PTR, void* M_AXI_KSK_4_PTR, void* M_AXI_KSK_5_PTR, void* M_AXI_KSK_6_PTR, void* M_AXI_KSK_7_PTR, void* axis_p1_p2_batch, void* axis_p1_p2_bsk_c, void* axis_p1_p2_ldg_c, void* axis_p1_p2_ldg_d, void* axis_p1_p2_mmacc_side, void* axis_p1_p2_mmfeed_c, void* axis_p1_p2_mmfeed_d0, void* axis_p1_p2_mmfeed_d1, void* axis_p1_p2_mmsxt_c, void* axis_p2_p1_bsk_c, void* axis_p2_p1_mmacc_c, void* axis_p2_p1_mmacc_d0, void* axis_p2_p1_mmacc_side, void* axis_p2_p1_mmsxt_d0, void* axis_p2_p1_side) + +Ports +----- + Port: axis_p1_p2_batch + Mode: write_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p1_p2_bsk_c + Mode: write_only + Range (bytes): + Data Width: 520 bits + Port Type: stream + + Port: axis_p1_p2_ldg_c + Mode: write_only + Range (bytes): + Data Width: 32 bits + Port Type: stream + + Port: axis_p1_p2_ldg_d + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmacc_side + Mode: write_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_c + Mode: write_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_d0 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmfeed_d1 + Mode: write_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p1_p2_mmsxt_c + Mode: write_only + Range (bytes): + Data Width: 64 bits + Port Type: stream + + Port: axis_p2_p1_bsk_c + Mode: read_only + Range (bytes): + Data Width: 8 bits + Port Type: stream + + Port: axis_p2_p1_mmacc_c + Mode: read_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p1_mmacc_d0 + Mode: read_only + Range (bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p1_mmacc_side + Mode: read_only + Range (bytes): + Data Width: 16 bits + Port Type: stream + + Port: axis_p2_p1_mmsxt_d0 + Mode: read_only + Range 
(bytes): + Data Width: 2048 bits + Port Type: stream + + Port: axis_p2_p1_side + Mode: read_only + Range (bytes): + Data Width: 256 bits + Port Type: stream + + Port: m_axi_glwe_0 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_0 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_1 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_2 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_3 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_4 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_5 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_6 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_ksk_7 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_pem_0 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_pem_1 + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 512 bits + Port Type: addressable + + Port: m_axi_trc + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 32 bits + Port Type: addressable + + Port: m_axi_ucore + Mode: master + Range (bytes): 0xFFFFFFFF + Data Width: 32 bits + Port Type: addressable + + Port: s_axil + Mode: slave + Range (bytes): 0x10000 + Data Width: 32 bits + Port Type: addressable + +-------------------------- +Instance: hpu_msplit_3parts_1in3_1 + Base Address: 0x800000 + + Argument: M_AXI_UCORE_PTR + Register Offset: 0x10 + Port: m_axi_ucore + Memory: HBM[1] (MEM_DRAM) + + Argument: M_AXI_TRC_PTR + Register Offset: 0x14 + Port: m_axi_trc + Memory: 
HBM[0] (MEM_HBM) + + Argument: M_AXI_PEM_0_PTR + Register Offset: 0x18 + Port: m_axi_pem_0 + Memory: HBM[10] (MEM_DRAM) + + Argument: M_AXI_PEM_1_PTR + Register Offset: 0x1c + Port: m_axi_pem_1 + Memory: HBM[11] (MEM_DRAM) + + Argument: M_AXI_GLWE_0_PTR + Register Offset: 0x20 + Port: m_axi_glwe_0 + Memory: HBM[12] (MEM_DRAM) + + Argument: M_AXI_KSK_0_PTR + Register Offset: 0x24 + Port: m_axi_ksk_0 + Memory: HBM[24] (MEM_DRAM) + + Argument: M_AXI_KSK_1_PTR + Register Offset: 0x28 + Port: m_axi_ksk_1 + Memory: HBM[25] (MEM_DRAM) + + Argument: M_AXI_KSK_2_PTR + Register Offset: 0x2c + Port: m_axi_ksk_2 + Memory: HBM[26] (MEM_DRAM) + + Argument: M_AXI_KSK_3_PTR + Register Offset: 0x30 + Port: m_axi_ksk_3 + Memory: HBM[27] (MEM_DRAM) + + Argument: M_AXI_KSK_4_PTR + Register Offset: 0x34 + Port: m_axi_ksk_4 + Memory: HBM[28] (MEM_DRAM) + + Argument: M_AXI_KSK_5_PTR + Register Offset: 0x38 + Port: m_axi_ksk_5 + Memory: HBM[29] (MEM_DRAM) + + Argument: M_AXI_KSK_6_PTR + Register Offset: 0x3c + Port: m_axi_ksk_6 + Memory: HBM[30] (MEM_DRAM) + + Argument: M_AXI_KSK_7_PTR + Register Offset: 0x40 + Port: m_axi_ksk_7 + Memory: HBM[31] (MEM_DRAM) + + Argument: axis_p1_p2_batch + Register Offset: 0x0 + Port: axis_p1_p2_batch + Memory: dc_0 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_bsk_c + Register Offset: 0x0 + Port: axis_p1_p2_bsk_c + Memory: dc_1 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_ldg_c + Register Offset: 0x0 + Port: axis_p1_p2_ldg_c + Memory: dc_2 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_ldg_d + Register Offset: 0x0 + Port: axis_p1_p2_ldg_d + Memory: dc_3 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmacc_side + Register Offset: 0x0 + Port: axis_p1_p2_mmacc_side + Memory: dc_4 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_c + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_c + Memory: dc_5 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_d0 + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_d0 + Memory: dc_6 
(MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmfeed_d1 + Register Offset: 0x0 + Port: axis_p1_p2_mmfeed_d1 + Memory: dc_7 (MEM_STREAMING_CONNECTION) + + Argument: axis_p1_p2_mmsxt_c + Register Offset: 0x0 + Port: axis_p1_p2_mmsxt_c + Memory: dc_8 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_bsk_c + Register Offset: 0x0 + Port: axis_p2_p1_bsk_c + Memory: dc_9 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_c + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_c + Memory: dc_10 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_d0 + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_d0 + Memory: dc_11 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmacc_side + Register Offset: 0x0 + Port: axis_p2_p1_mmacc_side + Memory: dc_12 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_mmsxt_d0 + Register Offset: 0x0 + Port: axis_p2_p1_mmsxt_d0 + Memory: dc_13 (MEM_STREAMING_CONNECTION) + + Argument: axis_p2_p1_side + Register Offset: 0x0 + Port: axis_p2_p1_side + Memory: dc_14 (MEM_STREAMING_CONNECTION) +============================================================================== +Generated By +------------ + Command: v++ + Version: 2024.1 - 2024-05-20-23:21:20 (SW BUILD: 5074859) + Command Line: v++ --config /projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/cfg/hpu_msplit_3parts.cfg --connectivity.nk hpu_msplit_3parts_1in3:1 --connectivity.nk hpu_msplit_3parts_2in3:1 --connectivity.nk hpu_msplit_3parts_3in3:1 --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_bsk_c --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_c --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmsxt_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmsxt_c --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_c --connectivity.sc 
hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_d:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_d --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d0:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d0 --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d1:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d1 --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmacc_side:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmacc_side --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_batch:hpu_msplit_3parts_2in3_1.axis_p1_p2_batch --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_d0 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_c --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_bsk_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_bsk_c --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_side --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmsxt_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmsxt_d0 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_side --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d0:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d0 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d1:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d1 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d2:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d2 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d3:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d3 --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_c --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_bsk_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_bsk_c --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_batch:hpu_msplit_3parts_3in3_1.axis_p2_p3_batch --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d0:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d0 
--connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d1:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d1 --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d2:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d2 --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d3:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d3 --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_c --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_bsk_c --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_side:hpu_msplit_3parts_2in3_1.axis_p3_p2_side --connectivity.slr hpu_msplit_3parts_1in3_1:SLR0 --connectivity.slr hpu_msplit_3parts_2in3_1:SLR1 --connectivity.slr hpu_msplit_3parts_3in3_1:SLR2 --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_trc:HBM[0] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ucore:HBM[1] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_pem_0:HBM[10] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_pem_1:HBM[11] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_glwe_0:HBM[12] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_0:HBM[2] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_1:HBM[3] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_2:HBM[4] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_3:HBM[5] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_4:HBM[6] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_5:HBM[7] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_6:HBM[8] --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_7:HBM[9] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_0:HBM[24] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_1:HBM[25] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_2:HBM[26] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_3:HBM[27] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_4:HBM[28] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_5:HBM[29] --connectivity.sp 
hpu_msplit_3parts_1in3_1.m_axi_ksk_6:HBM[30] --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_7:HBM[31] --debug --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo --kernel_frequency 0:300 --link --optimize 0 --output /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin --platform xilinx_u55c_gen3x16_xdma_3_202210_1 --report_level 0 --save-temps --target hw --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin --vivado.param project.writeIntermediateCheckpoints=1 --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/constraints/hpu_msplit_3parts_impl_opt_design_pre.xdc --vivado.prop run.synth_1.STEPS.SYNTH_DESIGN.ARGS.NO_SRLEXTRACT=true --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-hier_fanout_limit 1024} --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=AlternateCLBRouting + Options: --config /projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/cfg/hpu_msplit_3parts.cfg + --connectivity.nk hpu_msplit_3parts_1in3:1 + --connectivity.nk hpu_msplit_3parts_2in3:1 + --connectivity.nk hpu_msplit_3parts_3in3:1 + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_bsk_c + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_c + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmsxt_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmsxt_c + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_c + --connectivity.sc 
hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_d:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_d + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d0:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d0 + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d1:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d1 + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_mmacc_side:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmacc_side + --connectivity.sc hpu_msplit_3parts_1in3_1.axis_p1_p2_batch:hpu_msplit_3parts_2in3_1.axis_p1_p2_batch + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_d0 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_c + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_bsk_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_bsk_c + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_side + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmsxt_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmsxt_d0 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_side + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d0:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d0 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d1:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d1 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d2:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d2 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d3:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d3 + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_c + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_bsk_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_bsk_c + --connectivity.sc hpu_msplit_3parts_2in3_1.axis_p2_p3_batch:hpu_msplit_3parts_3in3_1.axis_p2_p3_batch + --connectivity.sc 
hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d0:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d0 + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d1:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d1 + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d2:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d2 + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d3:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d3 + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_c + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_bsk_c + --connectivity.sc hpu_msplit_3parts_3in3_1.axis_p3_p2_side:hpu_msplit_3parts_2in3_1.axis_p3_p2_side + --connectivity.slr hpu_msplit_3parts_1in3_1:SLR0 + --connectivity.slr hpu_msplit_3parts_2in3_1:SLR1 + --connectivity.slr hpu_msplit_3parts_3in3_1:SLR2 + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_trc:HBM[0] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ucore:HBM[1] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_pem_0:HBM[10] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_pem_1:HBM[11] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_glwe_0:HBM[12] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_0:HBM[2] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_1:HBM[3] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_2:HBM[4] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_3:HBM[5] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_4:HBM[6] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_5:HBM[7] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_6:HBM[8] + --connectivity.sp hpu_msplit_3parts_3in3_1.m_axi_bsk_7:HBM[9] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_0:HBM[24] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_1:HBM[25] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_2:HBM[26] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_3:HBM[27] + --connectivity.sp 
hpu_msplit_3parts_1in3_1.m_axi_ksk_4:HBM[28] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_5:HBM[29] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_6:HBM[30] + --connectivity.sp hpu_msplit_3parts_1in3_1.m_axi_ksk_7:HBM[31] + --debug + --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo + --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo + --input_files /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo + --kernel_frequency 0:300 + --link + --optimize 0 + --output /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin + --platform xilinx_u55c_gen3x16_xdma_3_202210_1 + --report_level 0 + --save-temps + --target hw + --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin + --vivado.param project.writeIntermediateCheckpoints=1 + --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/constraints/hpu_msplit_3parts_impl_opt_design_pre.xdc + --vivado.prop run.synth_1.STEPS.SYNTH_DESIGN.ARGS.NO_SRLEXTRACT=true + --vivado.prop run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-hier_fanout_limit 1024} + --vivado.prop run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high + --vivado.prop run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=AlternateCLBRouting +============================================================================== +User Added Key Value Pairs +-------------------------- + +============================================================================== diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.link_summary b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.link_summary new file mode 100644 index 000000000..3d71a76a6 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_msplit_3parts.xclbin.link_summary @@ -0,0 +1,1377 @@ + +{ + 
"thisFile": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin.link_summary", + "connectId": "", + "serverToken": "", + "timestamp": "0" +} + + +{ + "type": "ET_CmdStep", + "dateTimestamp": "Tue Mar 25 21:43:59 2025", + "timestampMillis": "1742935439546", + "buildStep": { + "cmdId": "e88f97ad-c344-46ae-84b0-dceb8646f1dd", + "name": "v++", + "logFile": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/link.steps.log", + "commandLine": "/opt/xilinx/Vitis/2024.1/bin/unwrapped/lnx64.o/v++ --vivado.prop \"run.__KERNEL__.{STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS}={-directive sdx_optimization_effort_high}\" --advanced.misc \"report=type report_timing_summary name impl_report_timing_summary_route_design_summary steps {route_design} runs {impl_1} options {-max_paths 10}\" --advanced.misc \"report=type report_timing_summary name impl_report_timing_summary_post_route_phys_opt_design_summary steps {post_route_phys_opt_design} runs {impl_1} options {-max_paths 10}\" -l -g -t hw --platform xilinx_u55c_gen3x16_xdma_3_202210_1 --save-temps --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin -o /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin --config /projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/cfg/hpu_msplit_3parts.cfg --vivado.prop run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/constraints/hpu_msplit_3parts_impl_opt_design_pre.xdc --kernel_frequency 0:300 /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo ", + "args": [ + "-l", + "-g", + "-t", + "hw", + "--platform", + "xilinx_u55c_gen3x16_xdma_3_202210_1", + "--save-temps", + "--temp_dir", + 
"/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin", + "-o", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin", + "--config", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/cfg/hpu_msplit_3parts.cfg", + "--vivado.prop", + "run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/constraints/hpu_msplit_3parts_impl_opt_design_pre.xdc", + "--kernel_frequency", + "0:300", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo" + ], + "iniFiles": [ + { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/cfg/hpu_msplit_3parts.cfg", + "content": "# General configuration knobs\n# equivalent of --platform knob\n# platform=\n# equivalent of --log_dir knob\n# logdir=\n# equivalent of --report_dir knob\n# report_dir=\n# equivalent of --target knob\n#target=hw\n\n# Enable debug mode\ndebug=1\nsave-temps=1\n\n# Enable link mode\n# link=1\n\n# Vivado properties\n[vivado]\nparam=project.writeIntermediateCheckpoints=1\nprop=run.synth_1.STEPS.SYNTH_DESIGN.ARGS.NO_SRLEXTRACT=true\nprop=run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-hier_fanout_limit 1024}\nprop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high\nprop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=AlternateCLBRouting\n\n[connectivity]\n# Number of CU per xo\nnk=hpu_msplit_3parts_1in3:1\nnk=hpu_msplit_3parts_2in3:1\nnk=hpu_msplit_3parts_3in3:1\n\n# SLR assignement\nslr=hpu_msplit_3parts_1in3_1:SLR0\nslr=hpu_msplit_3parts_2in3_1:SLR1\nslr=hpu_msplit_3parts_3in3_1:SLR2\n\n# Axi4 memory connection\nsp=hpu_msplit_3parts_1in3_1.m_axi_trc:HBM[0]\n# Note that if the following is modified, do not forget to change the ucore 
config\nsp=hpu_msplit_3parts_1in3_1.m_axi_ucore:HBM[1]\n\nsp=hpu_msplit_3parts_1in3_1.m_axi_pem_0:HBM[10]\nsp=hpu_msplit_3parts_1in3_1.m_axi_pem_1:HBM[11]\nsp=hpu_msplit_3parts_1in3_1.m_axi_glwe_0:HBM[12]\n\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_0:HBM[2]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_1:HBM[3]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_2:HBM[4]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_3:HBM[5]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_4:HBM[6]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_5:HBM[7]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_6:HBM[8]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_7:HBM[9]\n\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_0:HBM[24]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_1:HBM[25]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_2:HBM[26]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_3:HBM[27]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_4:HBM[28]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_5:HBM[29]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_6:HBM[30]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_7:HBM[31]\n\n\n# AXI stream\n# part1 -\u003e part2\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_bsk_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmsxt_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmsxt_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_d:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_d\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d0:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d0\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d1:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d1\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmacc_side:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmacc_side\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_batch:hpu_msplit_3parts_2in3_1.axis_p1_p2_batch\n\n# part2 -\u003e 
part1\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_bsk_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_bsk_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_side\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmsxt_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmsxt_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_side\n\n# part2 -\u003e part3\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d0:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d1:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d1\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d2:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d2\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d3:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d3\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_bsk_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_bsk_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_batch:hpu_msplit_3parts_3in3_1.axis_p2_p3_batch\n# part3 -\u003e part2\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d0:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d0\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d1:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d1\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d2:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d2\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d3:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d3\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_c\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_bsk_c\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_side:hpu_msplit_3parts_2in3_1.axis_p3_p2_side\n" + } + ], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 
25 21:43:59 2025", + "timestampMillis": "1742935439546", + "status": { + "cmdId": "e88f97ad-c344-46ae-84b0-dceb8646f1dd", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_FlowMetaData", + "dateTimestamp": "Tue Mar 25 21:44:03 2025", + "timestampMillis": "1742935443837", + "buildSummary": { + "hardwarePlatform": "xilinx_u55c_gen3x16_xdma_3_202210_1.xpfm", + "hardwareDsa": "", + "platformDirectory": "/opt/xilinx/platforms/xilinx_u55c_gen3x16_xdma_3_202210_1", + "runtime": "OpenCL", + "systemConfig": "Linux", + "flow": "BF_LINK", + "target": "TT_HW", + "binaryContainer": { + "base": { + "type": "BT_UKNOWN", + "name": "hpu_msplit_3parts", + "file": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin", + "reports": [], + "uuid": "" + }, + "kernels": [] + }, + "kernels": [ + { + "base": { + "type": "KERNEL", + "name": "hpu_msplit_3parts_1in3", + "file": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo", + "reports": [], + "uuid": "" + }, + "sources": [], + "psSources": [], + "cuNames": [ + "hpu_msplit_3parts_1in3_1" + ], + "type": "RTL", + "frequency": 0, + "freqUnits": "" + }, + { + "base": { + "type": "KERNEL", + "name": "hpu_msplit_3parts_2in3", + "file": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo", + "reports": [], + "uuid": "" + }, + "sources": [], + "psSources": [], + "cuNames": [ + "hpu_msplit_3parts_2in3_1" + ], + "type": "RTL", + "frequency": 0, + "freqUnits": "" + }, + { + "base": { + "type": "KERNEL", + "name": "hpu_msplit_3parts_3in3", + "file": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo", + "reports": [], + "uuid": "" + }, + "sources": [], + "psSources": [], + "cuNames": [ + "hpu_msplit_3parts_3in3_1" + ], + "type": "RTL", + "frequency": 0, + "freqUnits": "" + } + ], + "toolVersion": "Vitis V++ Compiler Release 2024.1. 
SW Build 5074859 on 2024-05-20-23:21:20" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Tue Mar 25 21:44:03 2025", + "timestampMillis": "1742935443951", + "buildStep": { + "cmdId": "33b1fc21-fde3-4a78-8553-b2ccdebcbbf2", + "name": "system_link", + "logFile": "", + "commandLine": "system_link --xo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo --xo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo --xo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo -keep --config /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/syslinkConfig.ini --xpfm /opt/xilinx/platforms/xilinx_u55c_gen3x16_xdma_3_202210_1/xilinx_u55c_gen3x16_xdma_3_202210_1.xpfm --target hw --output_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/sys_link", + "args": [ + "--xo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_1in3.xo", + "--xo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_2in3.xo", + "--xo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts_3in3.xo", + "-keep", + "--config", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/syslinkConfig.ini", + "--xpfm", + "/opt/xilinx/platforms/xilinx_u55c_gen3x16_xdma_3_202210_1/xilinx_u55c_gen3x16_xdma_3_202210_1.xpfm", + "--target", + "hw", + "--output_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int", + "--temp_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/sys_link" + ], + "iniFiles": [ + { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/syslinkConfig.ini", + "content": 
"nk=hpu_msplit_3parts_1in3:1\nnk=hpu_msplit_3parts_2in3:1\nnk=hpu_msplit_3parts_3in3:1\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_bsk_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmsxt_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmsxt_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_c:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_c\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_ldg_d:hpu_msplit_3parts_2in3_1.axis_p1_p2_ldg_d\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d0:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d0\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmfeed_d1:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmfeed_d1\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_mmacc_side:hpu_msplit_3parts_2in3_1.axis_p1_p2_mmacc_side\nsc=hpu_msplit_3parts_1in3_1.axis_p1_p2_batch:hpu_msplit_3parts_2in3_1.axis_p1_p2_batch\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_bsk_c:hpu_msplit_3parts_1in3_1.axis_p2_p1_bsk_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_side\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmsxt_d0:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmsxt_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p1_mmacc_side:hpu_msplit_3parts_1in3_1.axis_p2_p1_mmacc_side\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d0:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d0\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d1:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d1\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d2:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d2\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_d3:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_d3\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_proc_c:hpu_msplit_3parts_3in3_1.axis_p2_p3_proc_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_bsk_c:hpu_msplit_3parts_3in3_1.axis
_p2_p3_bsk_c\nsc=hpu_msplit_3parts_2in3_1.axis_p2_p3_batch:hpu_msplit_3parts_3in3_1.axis_p2_p3_batch\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d0:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d0\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d1:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d1\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d2:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d2\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_d3:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_d3\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_proc_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_proc_c\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_bsk_c:hpu_msplit_3parts_2in3_1.axis_p3_p2_bsk_c\nsc=hpu_msplit_3parts_3in3_1.axis_p3_p2_side:hpu_msplit_3parts_2in3_1.axis_p3_p2_side\nsp=hpu_msplit_3parts_1in3_1.m_axi_trc:HBM[0]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ucore:HBM[1]\nsp=hpu_msplit_3parts_1in3_1.m_axi_pem_0:HBM[10]\nsp=hpu_msplit_3parts_1in3_1.m_axi_pem_1:HBM[11]\nsp=hpu_msplit_3parts_1in3_1.m_axi_glwe_0:HBM[12]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_0:HBM[2]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_1:HBM[3]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_2:HBM[4]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_3:HBM[5]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_4:HBM[6]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_5:HBM[7]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_6:HBM[8]\nsp=hpu_msplit_3parts_3in3_1.m_axi_bsk_7:HBM[9]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_0:HBM[24]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_1:HBM[25]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_2:HBM[26]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_3:HBM[27]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_4:HBM[28]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_5:HBM[29]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_6:HBM[30]\nsp=hpu_msplit_3parts_1in3_1.m_axi_ksk_7:HBM[31]\nslr=hpu_msplit_3parts_1in3_1:SLR0\nslr=hpu_msplit_3parts_2in3_1:SLR1\nslr=hpu_msplit_3parts_3in3_1:SLR2\n\n" + } + ], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:03 
2025", + "timestampMillis": "1742935443952", + "status": { + "cmdId": "33b1fc21-fde3-4a78-8553-b2ccdebcbbf2", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:26 2025", + "timestampMillis": "1742935466920", + "status": { + "cmdId": "33b1fc21-fde3-4a78-8553-b2ccdebcbbf2", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Tue Mar 25 21:44:26 2025", + "timestampMillis": "1742935466923", + "buildStep": { + "cmdId": "b848552b-dd20-45da-819c-c68595e8a991", + "name": "cf2sw", + "logFile": "", + "commandLine": "cf2sw -sdsl /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/sdsl.dat -rtd /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/cf2sw.rtd -nofilter /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/cf2sw_full.rtd -xclbin /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.xml -o /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.1.xml", + "args": [ + "-sdsl", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/sdsl.dat", + "-rtd", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/cf2sw.rtd", + "-nofilter", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/cf2sw_full.rtd", + "-xclbin", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.xml", + "-o", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.1.xml" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:26 2025", + "timestampMillis": "1742935466923", + "status": { + "cmdId": "b848552b-dd20-45da-819c-c68595e8a991", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", 
+ "timestampMillis": "1742935478613", + "status": { + "cmdId": "b848552b-dd20-45da-819c-c68595e8a991", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478615", + "buildStep": { + "cmdId": "72bbf168-fd61-4e55-a3f8-502351704ed5", + "name": "rtd2_system_diagram", + "logFile": "", + "commandLine": "rtd2SystemDiagram", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478615", + "status": { + "cmdId": "72bbf168-fd61-4e55-a3f8-502351704ed5", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478799", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/systemDiagramModel.json", + "name": "", + "fileType": "JSON", + "reportType": "SYSTEM_DIAGRAM", + "cmdId": "" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478799", + "status": { + "cmdId": "72bbf168-fd61-4e55-a3f8-502351704ed5", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478801", + "buildStep": { + "cmdId": "96427ed0-9564-4a4a-817d-5767b19576ce", + "name": "vpl", + "logFile": "", + "commandLine": "vpl -t hw -f xilinx_u55c_gen3x16_xdma_3_202210_1 -s -g --kernel_frequency 0:300 --remote_ip_cache /projects/baroux/Fpga/fpga_u55c_syn/xrt/.ipcache --output_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int --log_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/logs/link --report_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link --config 
/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/vplConfig.ini -k /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/kernel_info.dat --webtalk_flag Vitis --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link --no-info --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_3in3_1_0 --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_2in3_1_0 --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_1in3_1_0 --messageDb /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/run_link/vpl.pb /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/dr.bd.tcl", + "args": [ + "-t", + "hw", + "-f", + "xilinx_u55c_gen3x16_xdma_3_202210_1", + "-s", + "-g", + "--kernel_frequency", + "0:300", + "--remote_ip_cache", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/.ipcache", + "--output_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int", + "--log_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/logs/link", + "--report_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link", + "--config", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/vplConfig.ini", + "-k", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/kernel_info.dat", + "--webtalk_flag", + "Vitis", + "--temp_dir", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link", + "--no-info", + "--iprepo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_3in3_1_0", + "--iprepo", + 
"/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_2in3_1_0", + "--iprepo", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_1in3_1_0", + "--messageDb", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/run_link/vpl.pb", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/dr.bd.tcl" + ], + "iniFiles": [ + { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/vplConfig.ini", + "content": "[advanced]\nmisc=report=type report_timing_summary name impl_report_timing_summary_route_design_summary steps {route_design} runs {impl_1} options {-max_paths 10}\nmisc=report=type report_timing_summary name impl_report_timing_summary_post_route_phys_opt_design_summary steps {post_route_phys_opt_design} runs {impl_1} options {-max_paths 10}\nparam=compiler.enablePerformanceTrace=1\nparam=hw_emu.enableDebugWaveform=1\nparam=hw_emu.enableProfiling=1\nparam=compiler.vppCurrentWorkingDir=/projects/baroux/Fpga/fpga_u55c_syn/xrt\nmisc=BinaryName=hpu_msplit_3parts\n\n[connectivity]\nnk=hpu_msplit_3parts_1in3:1:hpu_msplit_3parts_1in3_1\nnk=hpu_msplit_3parts_2in3:1:hpu_msplit_3parts_2in3_1\nnk=hpu_msplit_3parts_3in3:1:hpu_msplit_3parts_3in3_1\n\n[vivado]\nprop=run.__KERNEL__.{STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS}={-directive sdx_optimization_effort_high}\nparam=project.writeIntermediateCheckpoints=1\nprop=run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=/projects/baroux/Fpga/fpga_u55c_syn/xrt/kernel/xilinx_u55c_gen3x16_xdma_3_202210_1/constraints/hpu_msplit_3parts_impl_opt_design_pre.xdc\nprop=run.synth_1.STEPS.SYNTH_DESIGN.ARGS.NO_SRLEXTRACT=true\nprop=run.impl_1.{STEPS.OPT_DESIGN.ARGS.MORE OPTIONS}={-hier_fanout_limit 
1024}\nprop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=AltSpreadLogic_high\nprop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=AlternateCLBRouting\n\n" + } + ], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:38 2025", + "timestampMillis": "1742935478801", + "status": { + "cmdId": "96427ed0-9564-4a4a-817d-5767b19576ce", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_CmdStep", + "dateTimestamp": "Tue Mar 25 21:44:39 2025", + "timestampMillis": "1742935479829", + "buildStep": { + "cmdId": "d20442de-939c-4a6f-9b9a-ddfda3d86cb7", + "name": "vpl", + "logFile": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/link.steps.log", + "commandLine": "/opt/xilinx/Vitis/2024.1/bin/unwrapped/lnx64.o/vpl -t hw -f xilinx_u55c_gen3x16_xdma_3_202210_1 -s -g --kernel_frequency 0:300 --remote_ip_cache /projects/baroux/Fpga/fpga_u55c_syn/xrt/.ipcache --output_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int --log_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/logs/link --report_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link --config /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/vplConfig.ini -k /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/kernel_info.dat --webtalk_flag Vitis --temp_dir /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link --no-info --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_3in3_1_0 --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_2in3_1_0 --iprepo /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xo/ip_repo/zama_ai_RTLKernel_hpu_msplit_3parts_1in3_1_0 --messageDb 
/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/run_link/vpl.pb /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/dr.bd.tcl ", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/run_link" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:39 2025", + "timestampMillis": "1742935479829", + "status": { + "cmdId": "d20442de-939c-4a6f-9b9a-ddfda3d86cb7", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_VivadoProject", + "dateTimestamp": "Tue Mar 25 21:44:43 2025", + "timestampMillis": "1742935483403", + "vivadoProject": { + "openDir": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl", + "openScript": "openprj.tcl", + "relativeProject": "prj/prj.xpr" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Tue Mar 25 21:44:43 2025", + "timestampMillis": "1742935483404", + "buildStep": { + "cmdId": "ce88c619-cd2a-42ba-8a22-4f799c78e22f", + "name": "vivado", + "logFile": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/vivado.log", + "commandLine": "vivado -log vivado.log -applog -m64 -messageDb vivado.pb -mode batch -source vpl.tcl -notrace", + "args": [ + "-log", + "vivado.log", + "-applog", + " -m64", + "-messageDb", + "vivado.pb", + "-mode", + "batch", + "-source", + "vpl.tcl", + "-notrace" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/run_link" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 21:44:43 2025", + "timestampMillis": "1742935483404", + "status": { + "cmdId": "ce88c619-cd2a-42ba-8a22-4f799c78e22f", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Tue Mar 25 21:47:29 2025", + "timestampMillis": "1742935649601", + "report": { + "path": 
"/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/automation_summary_pre_synthesis.txt", + "name": "", + "fileType": "TEXT", + "reportType": "VITIS_DESIGN_FLOW", + "cmdId": "" + } +} + + +{ + "type": "ET_CmdStep", + "dateTimestamp": "Tue Mar 25 22:02:26 2025", + "timestampMillis": "1742936546572", + "buildStep": { + "cmdId": "9382dc96-73d2-4ae0-bb24-681279d2b32e", + "name": "vivado.impl", + "logFile": "", + "commandLine": "", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 22:02:26 2025", + "timestampMillis": "1742936546572", + "status": { + "cmdId": "9382dc96-73d2-4ae0-bb24-681279d2b32e", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_CmdStep", + "dateTimestamp": "Tue Mar 25 22:02:26 2025", + "timestampMillis": "1742936546572", + "buildStep": { + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17", + "name": "vivado.impl.impl_1", + "logFile": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1/runme.log", + "commandLine": "", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Tue Mar 25 22:02:26 2025", + "timestampMillis": "1742936546572", + "status": { + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 01:52:48 2025", + "timestampMillis": "1742950368758", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1/system_diagram.json", + "name": "", + "fileType": "JSON", + "reportType": "SYSTEM_DIAGRAM_PLUS", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 01:58:41 2025", + "timestampMillis": 
"1742950721280", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/kernel_service.json", + "name": "", + "fileType": "JSON", + "reportType": "KERNEL_SERVICE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 01:58:41 2025", + "timestampMillis": "1742950721282", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/kernel_service.pb", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "KERNEL_SERVICE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:05:09 2025", + "timestampMillis": "1742951109343", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1/dr_timing_summary.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY_FAIL", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:05:09 2025", + "timestampMillis": "1742951109346", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1/dr_timing_summary.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY_FAIL", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:05:09 2025", + "timestampMillis": "1742951109348", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/vivado/vpl/prj/prj.runs/impl_1/dr_timing_summary.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY_FAIL", + "cmdId": "" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002184", + "status": { + "cmdId": "ce88c619-cd2a-42ba-8a22-4f799c78e22f", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + 
"timestampMillis": "1742952002202", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_slr_util_placed.pb", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_SLR_UTIL_PLACED", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002202", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_routed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_UTILIZATION_ROUTE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002203", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_routed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "KERNEL_UTILIZATION_ROUTE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002203", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_routed.pb", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_UTILIZATION_ROUTE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002203", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_slr_util_routed.pb", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_SLR_UTIL_ROUTED", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002204", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_placed.pb", + "name": "", + 
"fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_UTILIZATION_PLACEMENT", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002204", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_placed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "KERNEL_UTILIZATION_PLACEMENT", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002204", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_synthed.xutil", + "name": "", + "fileType": "XUTIL", + "reportType": "KERNEL_UTILIZATION_SYNTHESIS", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002205", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_slr_util_placed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_SLR_UTIL_PLACED", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002205", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_synthed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_UTILIZATION_SYNTHESIS", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002205", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_slr_util_routed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_SLR_UTIL_ROUTED", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": 
"1742952002206", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_routed.xutil", + "name": "", + "fileType": "XUTIL", + "reportType": "KERNEL_UTILIZATION_ROUTE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002206", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_synthed.pb", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_UTILIZATION_SYNTHESIS", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002206", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_full_util_placed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_UTILIZATION_PLACEMENT", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002207", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_synthed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "KERNEL_UTILIZATION_SYNTHESIS", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002207", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_kernel_util_placed.xutil", + "name": "", + "fileType": "XUTIL", + "reportType": "KERNEL_UTILIZATION_PLACEMENT", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002267", + "report": { + "path": 
"/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/syn/ulp_hpu_msplit_3parts_3in3_1_0_synth_1_ulp_hpu_msplit_3parts_3in3_1_0_utilization_synth.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_UTILIZATION", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002267", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/syn/ulp_hpu_msplit_3parts_2in3_1_0_synth_1_ulp_hpu_msplit_3parts_2in3_1_0_utilization_synth.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_UTILIZATION", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002267", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/syn/ulp_hpu_msplit_3parts_1in3_1_0_synth_1_ulp_hpu_msplit_3parts_1in3_1_0_utilization_synth.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_UTILIZATION", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002268", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_system_diagram.json", + "name": "", + "fileType": "JSON", + "reportType": "SYSTEM_DIAGRAM_PLUS", + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002285", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY", + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 
2025", + "timestampMillis": "1742952002300", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpx", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY", + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002301", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpv", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_REPORT_TIMING_SUMMARY_CONCISE", + "cmdId": "d4e342ad-5c5e-49f7-bcaa-4ca00eccbd17" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002318", + "status": { + "cmdId": "d20442de-939c-4a6f-9b9a-ddfda3d86cb7", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002346", + "status": { + "cmdId": "96427ed0-9564-4a4a-817d-5767b19576ce", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002348", + "buildStep": { + "cmdId": "2ecccd10-c47f-4aaa-b5cd-d6c8647b572f", + "name": "rtdgen", + "logFile": "", + "commandLine": "rtdgen", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002348", + "status": { + "cmdId": "2ecccd10-c47f-4aaa-b5cd-d6c8647b572f", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002350", + "report": { + "path": 
"/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts_xml.rtd", + "name": "", + "fileType": "JSON", + "reportType": "XCLBIN_INFO", + "cmdId": "" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002351", + "buildStep": { + "cmdId": "d50b9e7c-2808-435d-bc1a-5f415fd1cfd9", + "name": "cf2sw", + "logFile": "", + "commandLine": "cf2sw -a /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/address_map.xml -sdsl /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/sdsl.dat -xclbin /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.xml -rtd /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.rtd -o /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.xml", + "args": [ + "-a", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/address_map.xml", + "-sdsl", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/sdsl.dat", + "-xclbin", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/xclbin_orig.xml", + "-rtd", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.rtd", + "-o", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.xml" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:02 2025", + "timestampMillis": "1742952002351", + "status": { + "cmdId": "d50b9e7c-2808-435d-bc1a-5f415fd1cfd9", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013044", + "status": { + "cmdId": "d50b9e7c-2808-435d-bc1a-5f415fd1cfd9", + "state": "CS_PASSED" + } +} + + 
+{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013044", + "buildStep": { + "cmdId": "f7f79c44-5798-4e5b-a797-b6bceb7569e7", + "name": "rtdgen", + "logFile": "", + "commandLine": "writeSystemDiagram", + "args": [ + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.rtd", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/systemDiagramModelSlrBaseAddress.json" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013045", + "status": { + "cmdId": "f7f79c44-5798-4e5b-a797-b6bceb7569e7", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013049", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/systemDiagramModelSlrBaseAddress.json", + "name": "", + "fileType": "JSON", + "reportType": "SYSTEM_DIAGRAM_PLUS", + "cmdId": "" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013049", + "status": { + "cmdId": "f7f79c44-5798-4e5b-a797-b6bceb7569e7", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013049", + "buildStep": { + "cmdId": "779ee271-74da-4d52-a347-203fb158d528", + "name": "rtdgen", + "logFile": "", + "commandLine": "writeAutomationSummary", + "args": [ + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/automation_summary.txt" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013049", + "status": { + "cmdId": "779ee271-74da-4d52-a347-203fb158d528", + 
"state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013051", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/automation_summary.txt", + "name": "", + "fileType": "TEXT", + "reportType": "VITIS_DESIGN_FLOW", + "cmdId": "" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013051", + "status": { + "cmdId": "779ee271-74da-4d52-a347-203fb158d528", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013051", + "status": { + "cmdId": "2ecccd10-c47f-4aaa-b5cd-d6c8647b572f", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013053", + "buildStep": { + "cmdId": "187eb425-0099-4350-a876-7bf49df83606", + "name": "xclbinutil", + "logFile": "", + "commandLine": "xclbinutil --add-section BITSTREAM:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/partial.bit --force --target hw --key-value SYS:dfx_enable:true --add-section :JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.rtd --append-section :JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/appendSection.rtd --add-section CLOCK_FREQ_TOPOLOGY:JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts_xml.rtd --add-section BUILD_METADATA:JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts_build.rtd --add-section EMBEDDED_METADATA:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.xml --add-section 
SYSTEM_METADATA:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/systemDiagramModelSlrBaseAddress.json --key-value SYS:PlatformVBNV:xilinx_u55c_gen3x16_xdma_3_202210_1 --output /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin", + "args": [ + "--add-section", + "BITSTREAM:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/partial.bit", + "--force", + "--target", + "hw", + "--key-value", + "SYS:dfx_enable:true", + "--add-section", + ":JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.rtd", + "--append-section", + ":JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/appendSection.rtd", + "--add-section", + "CLOCK_FREQ_TOPOLOGY:JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts_xml.rtd", + "--add-section", + "BUILD_METADATA:JSON:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts_build.rtd", + "--add-section", + "EMBEDDED_METADATA:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/hpu_msplit_3parts.xml", + "--add-section", + "SYSTEM_METADATA:RAW:/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/link/int/systemDiagramModelSlrBaseAddress.json", + "--key-value", + "SYS:PlatformVBNV:xilinx_u55c_gen3x16_xdma_3_202210_1", + "--output", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013053", + "status": { + "cmdId": "187eb425-0099-4350-a876-7bf49df83606", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013326", + "status": { + "cmdId": 
"187eb425-0099-4350-a876-7bf49df83606", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013328", + "buildStep": { + "cmdId": "d69687b2-69c8-414e-b41c-dae94f910cd3", + "name": "xclbinutilinfo", + "logFile": "", + "commandLine": "xclbinutil --quiet --force --info /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin.info --input /projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin", + "args": [ + "--quiet", + "--force", + "--info", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin.info", + "--input", + "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/hpu_msplit_3parts.xclbin" + ], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:13 2025", + "timestampMillis": "1742952013328", + "status": { + "cmdId": "d69687b2-69c8-414e-b41c-dae94f910cd3", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014017", + "status": { + "cmdId": "d69687b2-69c8-414e-b41c-dae94f910cd3", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_SubCmdStep", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014020", + "buildStep": { + "cmdId": "77aea7eb-2523-4ea9-a609-b4d45e330b6a", + "name": "generate_sc_driver", + "logFile": "", + "commandLine": "", + "args": [], + "iniFiles": [], + "cwd": "/projects/baroux/Fpga/fpga_u55c_syn/xrt" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014020", + "status": { + "cmdId": "77aea7eb-2523-4ea9-a609-b4d45e330b6a", + "state": "CS_RUNNING" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014020", + "status": { + "cmdId": "77aea7eb-2523-4ea9-a609-b4d45e330b6a", + "state": 
"CS_PASSED" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014024", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/system_estimate_hpu_msplit_3parts.xtxt", + "name": "", + "fileType": "TEXT", + "reportType": "GLOBAL_SYSTEM_ESTIMATE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014027", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/logs/optraceViewer.html", + "name": "", + "fileType": "HTML", + "reportType": "OPERATION_TRACE", + "cmdId": "" + } +} + + +{ + "type": "ET_Status", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014027", + "status": { + "cmdId": "e88f97ad-c344-46ae-84b0-dceb8646f1dd", + "state": "CS_PASSED" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014092", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/reports/link/v++_link_hpu_msplit_3parts_guidance.html", + "name": "", + "fileType": "HTML", + "reportType": "GLOBAL_RULECHECK_GUIDANCE", + "cmdId": "" + } +} + + +{ + "type": "ET_Report", + "dateTimestamp": "Wed Mar 26 02:20:14 2025", + "timestampMillis": "1742952014092", + "report": { + "path": "/projects/baroux/Fpga/fpga_u55c_syn/xrt/output/hw/_tmp_vitis_xclbin/v++_link_hpu_msplit_3parts_guidance.pb3", + "name": "", + "fileType": "BINARY_PROTOBUF", + "reportType": "GLOBAL_RULECHECK_GUIDANCE", + "cmdId": "" + } +} + diff --git a/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_regif_core.toml b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_regif_core.toml new file mode 100644 index 000000000..bdfc2b104 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/u55c_gf64/hpu_regif_core.toml @@ -0,0 +1,622 @@ +# This is a sample example of register-map 
definition + +module_name="hpu_regif_core" +description="Hpu top-level register interface. Used by the host to retrieved RTL information, configure it and issue commands." +word_size_b = 32 +offset = 0x00 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.Xrt] +description="Vitis Required registers" +offset= 0x0 + + # Currently not in use -> Placeholder only +[section.Xrt.register.reserved] + description="Xrt reserved" + default={Cst=0x00} + owner="User" + read_access="Read" + write_access="Write" + +# ===================================================================================================================== +[section.info] +description="Contain all the RTL parameters used that have impact on associated SW" +offset= 0x10 + +[section.info.register.version] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="VERSION"} + +[section.info.register.ntt_architecture] + description="NTT architecture" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="NTT_CORE_ARCH"} + +[section.info.register.ntt_structure] + description="NTT structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix = { size_b=8, offset_b=0 , default={Param="R"}, description="NTT radix"} + field.psi = { size_b=8, offset_b=8 , default={Param="PSI"}, description="NTT psi"} + field.div = { size_b=8, offset_b=16, default={Param="BWD_PSI_DIV"}, description="NTT backward div"} + field.delta = { size_b=8, offset_b=24, default={Param="DELTA"}, description="NTT network delta (for wmm arch)"} + +[section.info.register.ntt_rdx_cut] + description="NTT radix cuts, in log2 unit (for gf64 arch)" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix_cut0 = { size_b=4, offset_b=0 , 
default={Param="NTT_RDX_CUT_S_0"}, description="NTT radix cut #0"} + field.radix_cut1 = { size_b=4, offset_b=4 , default={Param="NTT_RDX_CUT_S_1"}, description="NTT radix cut #1"} + field.radix_cut2 = { size_b=4, offset_b=8 , default={Param="NTT_RDX_CUT_S_2"}, description="NTT radix cut #2"} + field.radix_cut3 = { size_b=4, offset_b=12, default={Param="NTT_RDX_CUT_S_3"}, description="NTT radix cut #3"} + field.radix_cut4 = { size_b=4, offset_b=16, default={Param="NTT_RDX_CUT_S_4"}, description="NTT radix cut #4"} + field.radix_cut5 = { size_b=4, offset_b=20, default={Param="NTT_RDX_CUT_S_5"}, description="NTT radix cut #5"} + field.radix_cut6 = { size_b=4, offset_b=24, default={Param="NTT_RDX_CUT_S_6"}, description="NTT radix cut #6"} + field.radix_cut7 = { size_b=4, offset_b=28, default={Param="NTT_RDX_CUT_S_7"}, description="NTT radix cut #7"} + +[section.info.register.ntt_pbs] + description="Maximum number of PBS in the NTT pipeline" + owner="Parameter" + read_access="Read" + write_access="None" + field.batch_pbs_nb = { size_b=8, offset_b=0 , default={Param="BATCH_PBS_NB"}, description="Maximum number of PBS in the NTT pipe"} + field.total_pbs_nb = { size_b=8, offset_b=8 , default={Param="TOTAL_PBS_NB"}, description="Maximum number of PBS stored in PEP buffer"} + +[section.info.register.ntt_modulo] + description="Code associated to the NTT prime" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="MOD_NTT_NAME"} + +[section.info.register.application] + description="Code associated with the application" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="APPLICATION_NAME"} + +[section.info.register.ks_structure] + description="Key-switch structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.x = { size_b=8, offset_b=0 , default={Param="LBX"}, description="Number of coefficients on X dimension"} + field.y = { size_b=8, offset_b=8 , default={Param="LBY"}, 
description="Number of coefficients on Y dimension"} + field.z = { size_b=8, offset_b=16, default={Param="LBZ"}, description="Number of coefficients on Z dimension"} + +[section.info.register.ks_crypto_param] + description="Key-switch crypto parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.mod_ksk_w = { size_b=8, offset_b=0 , default={Param="MOD_KSK_W"}, description="Width of KSK modulo"} + field.ks_l = { size_b=8, offset_b=8 , default={Param="KS_L"}, description="Number of KS decomposition level"} + field.ks_b = { size_b=8, offset_b=16, default={Param="KS_B_W"}, description="Width of KS decomposition base"} + +[section.info.register.regf_structure] + description="Register file structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.reg_nb = { size_b=8, offset_b=0 , default={Param="REGF_REG_NB"}, description="Number of registers in regfile"} + field.coef_nb = { size_b=8, offset_b=8 , default={Param="REGF_COEF_NB"}, description="Number of coefficients at regfile interface"} + +[section.info.register.isc_structure] + description="Instruction scheduler structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.depth = { size_b=8, offset_b=0 , default={Param="ISC_DEPTH"}, description="Number of slots in ISC lookahead buffer."} + field.min_iop_size = { size_b=8, offset_b=8 , default={Param="MIN_IOP_SIZE"}, description="Minimum number of DOp per IOp to prevent sync_id overflow."} + +[section.info.register.pe_properties] + description="Processing elements parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.alu_nb = { size_b=8, offset_b=24 , default={Param="PEA_ALU_NB"}, description="Number of coefficients processed in parallel in pe_alu"} + field.pep_regf_period = { size_b=8, offset_b=16 , default={Param="PEP_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEP and regfile"} + field.pem_regf_period 
= { size_b=8, offset_b=8 , default={Param="PEM_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEM and regfile"} + field.pea_regf_period = { size_b=8, offset_b=0 , default={Param="PEA_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEA and regfile"} + +[section.info.register.bsk_structure] + description="BSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_cut_nb = { size_b=8, offset_b=8 , default={Param="BSK_CUT_NB"}, description="BSK cut nb"} + +[section.info.register.ksk_structure] + description="KSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.ksk_cut_nb = { size_b=8, offset_b=8 , default={Param="KSK_CUT_NB"}, description="KSK cut nb"} + +[section.info.register.hbm_axi4_nb] + description="Number of AXI4 connections to HBM" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_pc = { size_b=8, offset_b=0 , default={Param="BSK_PC"}, description="Number of HBM connections for BSK"} + field.ksk_pc = { size_b=8, offset_b=8, default={Param="KSK_PC"}, description="Number of HBM connections for KSK"} + field.pem_pc = { size_b=8, offset_b=16, default={Param="PEM_PC"}, description="Number of HBM connections for ciphertexts (PEM)"} + field.glwe_pc = { size_b=8, offset_b=24, default={Param="GLWE_PC"}, description="Number of HBM connections for GLWE"} + +[section.info.register.hbm_axi4_dataw_pem] + description="Ciphertext HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_PEM_DATA_W"} + +[section.info.register.hbm_axi4_dataw_glwe] + description="GLWE HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_GLWE_DATA_W"} + +[section.info.register.hbm_axi4_dataw_bsk] + description="BSK HBM AXI4 connection data width" + owner="Parameter" + 
read_access="Read" + write_access="None" + default={Param="AXI4_BSK_DATA_W"} + +[section.info.register.hbm_axi4_dataw_ksk] + description="KSK HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_KSK_DATA_W"} + +# ===================================================================================================================== +[section.bpip] +offset= 0x200 +description="BPIP configuration" + +[section.bpip.register.use] + description="(1) Use BPIP mode, (0) use IPIP mode (default)" + owner="User" + read_access="Read" + write_access="Write" + field.use_bpip = { size_b=1, offset_b=0 , default={Cst=1}, description="use"} + field.use_opportunism = { size_b=1, offset_b=1 , default={Cst=0}, description="use opportunistic PBS flush"} + +[section.bpip.register.timeout] + description="Timeout for BPIP mode" + owner="User" + read_access="Read" + write_access="Write" + default={Cst=0xffffffff} + +# ===================================================================================================================== +[section.hbm_axi4_addr_1in3] +offset= 0x400 +description="HBM AXI4 connection address offset" + +[section.hbm_axi4_addr_1in3.register.ct] + description="Address offset for each ciphertext HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb","_pc1_lsb", "_pc1_msb"] + +[section.hbm_axi4_addr_1in3.register.glwe] + description="Address offset for each GLWE HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + + +[section.hbm_axi4_addr_1in3.register.ksk] + description="Address offset for each KSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb"] + 
+ [section.hbm_axi4_addr_1in3.register.trc] + description="Address offset for each trace HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + + +# ===================================================================================================================== +[section.hbm_axi4_addr_3in3] +description="HBM AXI4 connection address offset" + +[section.hbm_axi4_addr_3in3.register.bsk] + description="Address offset for each BSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb"] + + +# ===================================================================================================================== +[section.status_1in3] +description="HPU status of part 1in3" +offset= 0x800 + +[section.status_1in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 1in3"} + +# ===================================================================================================================== +[section.status_3in3] +description="HPU status of parts 2in3 and 3in3" + +[section.status_3in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 3in3"} + +# ===================================================================================================================== +[section.ksk_avail] +description="KSK availability configuration" +offset= 0x1000 + +[section.ksk_avail.register.avail] + description="KSK available bit" + owner="User" + 
read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.ksk_avail.register.reset] + description="KSK reset sequence" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.bsk_avail] +description="BSK availability configuration" + +[section.bsk_avail.register.avail] + description="BSK available bit" + owner="User" + read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.bsk_avail.register.reset] + description="BSK reset sequence" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.runtime_1in3] +description="Runtime information" +offset= 0x2000 + +[section.runtime_1in3.register.pep_cmux_loop] + description="PEP: CMUX iteration loop number" + owner="Kernel" + read_access="Read" + write_access="None" + field.br_loop = { size_b=15, offset_b=0 , default={Cst=0}, description="PBS current BR-loop"} + field.br_loop_c = { size_b=1, offset_b=15 , default={Cst=0}, description="PBS current BR-loop parity"} + field.ks_loop = { size_b=15, offset_b=16 , default={Cst=0}, description="KS current KS-loop"} + field.ks_loop_c = { size_b=1, offset_b=31 , default={Cst=0}, description="KS current KS-loop parity"} + +[section.runtime_1in3.register.pep_pointer_0] + description="PEP: pointers (part 1)" + owner="Kernel" + 
read_access="Read" + write_access="None" + field.pool_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pool_rp"} + field.pool_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pool_wp"} + field.ldg_pt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ldg_pt"} + field.ldb_pt = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ldb_pt"} + +[section.runtime_1in3.register.pep_pointer_1] + description="PEP: pointers (part 2)" + owner="Kernel" + read_access="Read" + write_access="None" + field.ks_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP ks_in_rp"} + field.ks_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP ks_in_wp"} + field.ks_out_rp = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ks_out_rp"} + field.ks_out_wp = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ks_out_wp"} + +[section.runtime_1in3.register.pep_pointer_2] + description="PEP: pointers (part 3)" + owner="Kernel" + read_access="Read" + write_access="None" + field.pbs_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pbs_in_rp"} + field.pbs_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pbs_in_wp"} + field.ipip_flush_last_pbs_in_loop = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP IPIP flush last pbs_in_loop"} + +[section.runtime_1in3.register.isc_latest_instruction] + description="ISC: 4 latest instructions received ([0] is the most recent)" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_0","_1","_2","_3"] + +[section.runtime_1in3.register.pep_seq_bpip_batch_cnt] + description="PEP: BPIP batch counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_flush_cnt] + description="PEP: BPIP batch triggered by a flush counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + 
+[section.runtime_1in3.register.pep_seq_bpip_batch_timeout_cnt] + description="PEP: BPIP batch triggered by a timeout counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_waiting_batch_cnt] + description="PEP: BPIP batch that waits the trigger counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_filling_cnt] + description="PEP: Count batch with filled with a given number of CT (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_1","_2","_3","_4","_5","_6","_7","_8","_9","_10","_11","_12","_13","_14","_15","_16"] + +[section.runtime_1in3.register.pep_seq_ld_ack_cnt] + description="PEP: load BLWE ack counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_cmux_not_full_batch_cnt] + description="PEP: not full batch CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_ipip_flush_cnt] + description="PEP: IPIP flush CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldb_rcp_dur] + description="PEP: load BLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_req_dur] + description="PEP: load GLWE request max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_rcp_dur] + description="PEP: load GLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + 
+[section.runtime_1in3.register.pep_load_ksk_rcp_dur] + description="PEP: load KSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + + +[section.runtime_1in3.register.pep_mmacc_sxt_rcp_dur] + description="PEP: MMACC SXT reception duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_req_dur] + description="PEP: MMACC SXT request duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_cmd_wait_b_dur] + description="PEP: MMACC SXT command wait for b duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_inst_cnt] + description="PEP: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ack_cnt] + description="PEP: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_inst_cnt] + description="PEM: load input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_ack_cnt] + description="PEM: load instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_inst_cnt] + description="PEM: store input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_ack_cnt] + description="PEM: 
store instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_inst_cnt] + description="PEA: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_ack_cnt] + description="PEA: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_inst_cnt] + description="ISC: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_ack_cnt] + description="ISC: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_info_0] + description="PEM: load first data)" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_0","_pc0_1","_pc0_2","_pc0_3","_pc1_0","_pc1_1","_pc1_2","_pc1_3"] + +[section.runtime_1in3.register.pem_load_info_1] + description="PEM: load first address" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_lsb","_pc0_msb","_pc1_lsb","_pc1_msb"] + +[section.runtime_1in3.register.pem_store_info_0] + description="PEM: store info 0)" + owner="Kernel" + read_access="Read" + write_access="None" + field.cmd_vld = { size_b=1, offset_b=0 , default={Cst=0}, description="PEM_ST cmd vld"} + field.cmd_rdy = { size_b=1, offset_b=1 , default={Cst=0}, description="PEM_ST cmd rdy"} + field.pem_regf_rd_req_vld = { size_b=1, offset_b=2 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_vld"} + field.pem_regf_rd_req_rdy = { size_b=1, offset_b=3 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_rdy"} + field.brsp_fifo_in_vld = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST brsp_fifo_in_vld"} + 
field.brsp_fifo_in_rdy = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST brsp_fifo_in_rdy"} + field.rcp_fifo_in_vld = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST rcp_fifo_in_vld"} + field.rcp_fifo_in_rdy = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST rcp_fifo_in_rdy"} + field.r2_axi_vld = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST r2_axi_vld"} + field.r2_axi_rdy = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST r2_axi_rdy"} + field.c0_enough_location = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST c0_enough_location"} + +[section.runtime_1in3.register.pem_store_info_1] + description="PEM: store info 1" + owner="Kernel" + read_access="Read" + write_access="None" + field.s0_cmd_vld = { size_b=4, offset_b=0 , default={Cst=0}, description="PEM_ST s0_cmd_vld"} + field.s0_cmd_rdy = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST s0_cmd_rdy"} + field.m_axi_bvalid = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST m_axi_bvalid"} + field.m_axi_bready = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST m_axi_bready"} + field.m_axi_wvalid = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST m_axi_wvalid"} + field.m_axi_wready = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST m_axi_wready"} + field.m_axi_awvalid = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST m_axi_awvalid"} + field.m_axi_awready = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST m_axi_awready"} + +[section.runtime_1in3.register.pem_store_info_2] + description="PEM: store info 2" + owner="Kernel" + read_access="Read" + write_access="None" + field.c0_free_loc_cnt = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST c0_free_loc_cnt"} + field.brsp_bresp_cnt = { size_b=16, offset_b=16 , default={Cst=0}, description="PEM_ST brsp_bresp_cnt"} + +[section.runtime_1in3.register.pem_store_info_3] + 
description="PEM: store info 3" + owner="Kernel" + read_access="Read" + write_access="None" + field.brsp_ack_seen = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST brsp_ack_seen"} + field.c0_cmd_cnt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEM_ST c0_cmd_cnt"} + + +# ===================================================================================================================== +[section.runtime_3in3] +description="Runtime information" + +[section.runtime_3in3.register.pep_load_bsk_rcp_dur] + description="PEP: load BSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + +[section.runtime_3in3.register.pep_bskif_req_info_0] + description="PEP: BSK_IF: requester info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_br_loop_rp = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK read pointer"} + field.req_br_loop_wp = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK write pointer"} + +[section.runtime_3in3.register.pep_bskif_req_info_1] + description="PEP: BSK_IF: requester info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_prf_br_loop = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK prefetch pointer"} + field.req_parity = { size_b=1, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK pointer parity"} + field.req_assigned = { size_b=1, offset_b=31 , default={Cst=0}, description="PEP BSK_IF requester assignment"} + +# ===================================================================================================================== +[section.WorkAck] +description="Purpose of this section" +offset= 0x8000 + +[section.WorkAck.register.workq] + description="Insert 
work in workq and read status" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.WorkAck.register.ackq] + description="Pop ack from in ackq" + owner="Kernel" + read_access="ReadNotify" + write_access="None" diff --git a/backends/tfhe-hpu-backend/config_store/v80/Readme.md b/backends/tfhe-hpu-backend/config_store/v80/Readme.md new file mode 100644 index 000000000..ecf5014fc --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/Readme.md @@ -0,0 +1,74 @@ +NB: Versal doesn't have the pdi embedded in the configuration. Instead, the user is in charge of uploading the pdi to the FPGA flash. +Thus, a given configuration could work with multiple pdi. + +# Fpga version @250MHz +This configuration is based on the following Fpga commit: +``` +commit ad668f931eff0c281a0848d43360da0b8813539a (HEAD -> dev/hpu_v80, origin/dev/hpu_v80, origin/baroux/dev/hpu_v80, baroux/dev/hpu_v80) +Merge: 1489024a f308f067 +Author: Baptiste Roux +Date: Fri Feb 14 19:02:53 2025 +0100 + + [MERGE] 'dev/hpu' into baroux/dev/hpu_v80 + + Retrieved CI bugfix from dev/hpu +``` +Tagged as `aved_v1.0` + +Built with the following command: (i.e. 
versal/run_syn_hpu_msplit_3parts_psi32.sh) +``` +TOP=top_hpu_assembly +TOP_MSPLIT=TOP_MSPLIT_1 +TOP_BATCH=TOP_BATCH_TOPhpu_BPBS12_TPBS32 +TOP_PCMAX=TOP_PCMAX_pem2_glwe1_bsk16_ksk16 +TOP_PC=TOP_PC_pem2_glwe1_bsk8_ksk16 +APPLICATION=APPLI_msg2_carry2_pfail64_132b_gaussian_1f72dba +NTT_MOD=NTT_MOD_goldilocks +NTT_CORE_ARCH=NTT_CORE_ARCH_gf64 +NTT_CORE_R_PSI=NTT_CORE_R2_PSI32 +NTT_CORE_RDX_CUT=NTT_CORE_RDX_CUT_n5c6 +NTT_CORE_DIV=NTT_CORE_DIV_1 +BSK_SLOT_CUT=BSK_SLOT8_CUT8 +KSK_SLOT_CUT=KSK_SLOT8_CUT16 +KSLB=KSLB_x3y64z3 +HPU_PART=HPU_PART_gf64 +AXI_DATA_W=AXI_DATA_W_256 +FPGA=FPGA_v80 + +just build $TOP new "-F TOP_MSPLIT $TOP_MSPLIT -F TOP_BATCH $TOP_BATCH -F TOP_PCMAX $TOP_PCMAX -F TOP_PC $TOP_PC -F APPLICATION $APPLICATION -F NTT_MOD $NTT_MOD -F NTT_CORE_ARCH $NTT_CORE_ARCH -F NTT_CORE_R_PSI $NTT_CORE_R_PSI -F NTT_CORE_RDX_CUT $NTT_CORE_RDX_CUT -F NTT_CORE_DIV $NTT_CORE_DIV -F BSK_SLOT_CUT $BSK_SLOT_CUT -F KSK_SLOT_CUT $KSK_SLOT_CUT -F KSLB $KSLB -F HPU_PART $HPU_PART -F AXI_DATA_W $AXI_DATA_W -F FPGA $FPGA" | tee build_out.log +``` + +# Fpga version @350MHz +This configuration is based on the following Fpga commit: +``` +commit d29dbeaccf09adfe0ee13e326f4633e14726b020 (HEAD -> baroux/dev/hpu_v80_2024.2, origin/baroux/dev/hpu_v80_2024.2) +Author: pgardratzama +Date: Tue Feb 11 16:12:10 2025 +0100 + + adds script to synthetize HPU 1 part PSI32 +``` +Mainly the same commit as above, with flow modifications from Pierre Gardrat to support Vivado 2024.2. +NB: Based on an unofficial branch and thus not tagged + +Built with the following command: (i.e. 
versal/run_syn_hpu_1part_psi32.sh) +``` +TOP=fpga_top_hpu +TOP_MSPLIT=TOP_MSPLIT_1 +TOP_BATCH=TOP_BATCH_TOPhpu_BPBS12_TPBS32 +TOP_PCMAX=TOP_PCMAX_pem2_glwe1_bsk16_ksk16 +TOP_PC=TOP_PC_pem2_glwe1_bsk8_ksk16 +APPLICATION=APPLI_msg2_carry2_pfail64_132b_gaussian_1f72dba +NTT_MOD=NTT_MOD_goldilocks +NTT_CORE_ARCH=NTT_CORE_ARCH_gf64 +NTT_CORE_R_PSI=NTT_CORE_R2_PSI32 +NTT_CORE_RDX_CUT=NTT_CORE_RDX_CUT_n5c6 +NTT_CORE_DIV=NTT_CORE_DIV_1 +BSK_SLOT_CUT=BSK_SLOT8_CUT8 +KSK_SLOT_CUT=KSK_SLOT8_CUT16 +KSLB=KSLB_x3y64z3 +HPU_PART=HPU_PART_gf64 +AXI_DATA_W=AXI_DATA_W_256 +FPGA=FPGA_v80 + +just build $TOP new "-F TOP_MSPLIT $TOP_MSPLIT -F TOP_BATCH $TOP_BATCH -F TOP_PCMAX $TOP_PCMAX -F TOP_PC $TOP_PC -F APPLICATION $APPLICATION -F NTT_MOD $NTT_MOD -F NTT_CORE_ARCH $NTT_CORE_ARCH -F NTT_CORE_R_PSI $NTT_CORE_R_PSI -F NTT_CORE_RDX_CUT $NTT_CORE_RDX_CUT -F NTT_CORE_DIV $NTT_CORE_DIV -F BSK_SLOT_CUT $BSK_SLOT_CUT -F KSK_SLOT_CUT $KSK_SLOT_CUT -F KSLB $KSLB -F HPU_PART $HPU_PART -F AXI_DATA_W $AXI_DATA_W -F FPGA $FPGA" | tee build_out.log +``` diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_0.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_0.asm new file mode 100644 index 000000000..838beed9e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_0.asm @@ -0,0 +1,15 @@ +# CUST_0 +# Simple IOp to check the xfer between Hpu/Cpu +# Construct constant in dest slot -> 249 (0xf9) +SUB R0 R0 R0 +ADDS R0 R0 1 +ST TD[0].0 R0 +SUB R1 R1 R1 +ADDS R1 R1 2 +ST TD[0].1 R1 +SUB R2 R2 R2 +ADDS R2 R2 3 +ST TD[0].2 R2 +SUB R3 R3 R3 +ADDS R3 R3 3 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_1.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_1.asm new file mode 100644 index 000000000..3679e2c5f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_1.asm @@ -0,0 +1,11 @@ +# CUST_1 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_a +LD R0 TS[0].0 +LD 
R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_10.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_10.asm new file mode 100644 index 000000000..f591d66b3 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_10.asm @@ -0,0 +1,25 @@ +; CUST_10 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 + Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +ADD R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- Src[0].2 + Src[1].2 +LD R9 TS[0].2 +LD R10 TS[1].2 +ADD R8 R9 R10 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 + Src[1].3 +LD R13 TS[0].3 +LD R14 TS[1].3 +ADD R12 R13 R14 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_16.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_16.asm new file mode 100644 index 000000000..0b4cfe80f --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_16.asm @@ -0,0 +1,6 @@ +# CUST_16 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a.0) +LD R0 TS[0].0 +PBS_F R0 R0 PbsNone +ST TD[0].0 R0 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_17.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_17.asm new file mode 100644 index 000000000..bdb6711a7 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_17.asm @@ -0,0 +1,15 @@ +# CUST_17 +# Simple IOp to check PBS behavior +# Dest <- PBSNone(Src_a) +LD R0 TS[0].0 +PBS R0 R0 PbsNone +ST TD[0].0 R0 +LD R1 TS[0].1 +PBS R1 R1 PbsNone +ST TD[0].1 R1 +LD R2 TS[0].2 +PBS R2 R2 PbsNone +ST TD[0].2 R2 +LD R3 TS[0].3 +PBS_F R3 R3 PbsNone +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_18.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_18.asm new file mode 100644 
index 000000000..c4b9a46a0 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_18.asm @@ -0,0 +1,23 @@ +; CUST_18 +; Simple IOp to check extraction pattern +; Correct result: +; * Dst[0,1] <- Src[0][0,1] +; * Dst[2,3] <- Src[1][0,1] + +; Pack Src[0][0,1] with a Mac and extract Carry/Msg in Dst[0][0,1] +LD R0 TS[0].0 +LD R1 TS[0].1 +MAC R3 R1 R0 4 +PBS R4 R3 PbsMsgOnly +PBS R5 R3 PbsCarryInMsg +ST TD[0].0 R4 +ST TD[0].1 R5 + +; Pack Src[1][0,1] with a Mac and extract Carry/Msg in Dst[0][2,3] +LD R10 TS[1].0 +LD R11 TS[1].1 +MAC R13 R11 R10 4 +PBS R14 R13 PbsMsgOnly +PBS R15 R13 PbsCarryInMsg +ST TD[0].2 R14 +ST TD[0].3 R15 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_19.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_19.asm new file mode 100644 index 000000000..0974347fa --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_19.asm @@ -0,0 +1,19 @@ +; CUST_19 +; Simple IOp to check PbsMl2 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- 0 +; * Dst[0][2] <- Src[0][0] +1 +; * Dst[0][3] <- 0 +; i.e Cust_19(0x2) => 0x32 + +; Construct a 0 for destination padding +SUB R16 R16 R16 + +; Apply PbsMl2 on Src[0] result goes in dest[0][0-3] (0-padded) +LD R0 TS[0].0 +PBS_ML2_F R0 R0 PbsTestMany2 +ST TD[0].0 R0 +ST TD[0].1 R16 +ST TD[0].2 R1 +ST TD[0].3 R16 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_2.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_2.asm new file mode 100644 index 000000000..bc8e0175e --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_2.asm @@ -0,0 +1,11 @@ +# CUST_2 +# Simple IOp to check the xfer between Hpu/Cpu +# Dest <- Src_b +LD R0 TS[1].0 +LD R1 TS[1].1 +LD R2 TS[1].2 +LD R3 TS[1].3 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_20.asm 
b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_20.asm new file mode 100644 index 000000000..5f29f8ee5 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_20.asm @@ -0,0 +1,22 @@ +; CUST_20 +; Simple IOp to check PbsMl4 +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; i.e Cust_20(0x0) => 0xe4 + +SUB R16 R16 R16 +ST TD[0].0 R0 +ST TD[0].1 R0 +ST TD[0].2 R0 +ST TD[0].3 R0 + +; Apply PbsMl4 on Src[0] result goes in dest[0][0-3] +LD R0 TS[0].0 +PBS_ML4_F R0 R0 PbsTestMany4 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_21.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_21.asm new file mode 100644 index 000000000..5a601bbe6 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_21.asm @@ -0,0 +1,24 @@ +; CUST_21 +; Simple IOp to check PbsMl8 +; WARN: This operation requires 16b ct width +; Correct result: +; * Dst[0][0] <- Src[0][0] +; * Dst[0][1] <- Src[0][0] +1 +; * Dst[0][2] <- Src[0][0] +2 +; * Dst[0][3] <- Src[0][0] +3 +; * Dst[0][4] <- Src[0][0] +4 +; * Dst[0][5] <- Src[0][0] +5 +; * Dst[0][6] <- Src[0][0] +6 +; * Dst[0][7] <- Src[0][0] +7 + +; Apply PbsMl8 on Src[0] result goes in dest[0][0-7] +LD R0 TS[0].0 +PBS_ML8_F R0 R0 PbsTestMany8 +ST TD[0].0 R0 +ST TD[0].1 R1 +ST TD[0].2 R2 +ST TD[0].3 R3 +ST TD[0].4 R4 +ST TD[0].5 R5 +ST TD[0].6 R6 +ST TD[0].7 R7 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_3.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_3.asm new file mode 100644 index 000000000..d13ca243c --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_3.asm @@ -0,0 +1,16 @@ +# CUST_3 +# Simple IOp to check isc behavior +# Generate obvious deps and check that isc correctly issued the dop +# Correct result must be Dest <- Src[0] +LD R0 
TS[0].0 +LD R1 TS[0].1 +LD R2 TS[0].2 +LD R3 TS[0].3 +PBS R4 R0 PbsNone +ST TD[0].0 R4 +PBS R4 R1 PbsNone +ST TD[0].1 R4 +PBS R4 R2 PbsNone +ST TD[0].2 R4 +PBS_F R4 R3 PbsNone +ST TD[0].3 R4 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_4.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_4.asm new file mode 100644 index 000000000..192068543 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_4.asm @@ -0,0 +1,264 @@ +# CUST_4 +# Just to check if this batch times out +LD R0 TS[0].31 +LD R1 TS[1].31 +LD R3 TS[0].27 +LD R4 TS[1].27 +LD R6 TS[0].30 +LD R7 TS[1].30 +LD R9 TS[0].28 +LD R10 TS[1].28 +LD R12 TS[0].29 +LD R13 TS[1].29 +LD R15 TS[0].23 +LD R16 TS[1].23 +LD R18 TS[0].26 +LD R19 TS[1].26 +LD R21 TS[0].24 +LD R22 TS[1].24 +LD R24 TS[0].20 +LD R25 TS[1].20 +LD R27 TS[0].13 +LD R28 TS[1].13 +LD R30 TS[0].25 +LD R31 TS[1].25 +LD R33 TS[0].22 +LD R34 TS[1].22 +LD R36 TS[0].17 +LD R37 TS[1].17 +LD R39 TS[0].19 +LD R40 TS[1].19 +LD R42 TS[0].15 +LD R43 TS[1].15 +LD R45 TS[0].12 +LD R46 TS[1].12 +LD R48 TS[0].7 +LD R49 TS[1].7 +LD R51 TS[0].6 +LD R52 TS[1].6 +LD R54 TS[0].10 +LD R55 TS[1].10 +LD R57 TS[0].14 +LD R58 TS[1].14 +LD R60 TS[0].11 +LD R61 TS[1].11 +ADD R2 R0 R1 +ADD R5 R3 R4 +LD R63 TS[0].18 +LD R3 TS[1].18 +ADD R8 R6 R7 +ST TH.0 R6 +ST TH.1 R7 +ADD R11 R9 R10 +ST TH.2 R11 +LD R9 TH.2 +ADD R14 R12 R13 +ST TH.3 R12 +ST TH.4 R13 +ADD R17 R15 R16 +ST TH.5 R17 +ADD R20 R18 R19 +ST TH.6 R18 +ST TH.7 R19 +LD R15 TH.5 +ADD R23 R21 R22 +ST TH.8 R23 +LD R21 TH.8 +ADD R26 R24 R25 +ST TH.9 R24 +ST TH.10 R25 +ADD R29 R27 R28 +ST TH.11 R29 +LD R27 TH.11 +ADD R32 R30 R31 +ST TH.12 R30 +ST TH.13 R31 +ADD R35 R33 R34 +ST TH.14 R35 +ADD R38 R36 R37 +ST TH.15 R36 +ST TH.16 R37 +LD R33 TH.14 +PBS_ML2 R0 R2 PbsManyGenProp +PBS_ML2 R6 R5 PbsManyGenProp +PBS_ML2 R10 R9 PbsManyGenProp +PBS_ML2 R12 R8 PbsManyGenProp +PBS_ML2 R16 R14 PbsManyGenProp +PBS_ML2 R18 R15 PbsManyGenProp +PBS_ML2 R22 R21 PbsManyGenProp 
+PBS_ML2 R24 R20 PbsManyGenProp +PBS_ML2 R28 R27 PbsManyGenProp +PBS_ML2 R30 R26 PbsManyGenProp +PBS_ML2 R34 R32 PbsManyGenProp +PBS_ML2_F R36 R33 PbsManyGenProp +ADD R41 R39 R40 +LD R39 TS[0].16 +LD R40 TS[1].16 +ST TH.17 R38 +ST TH.18 R33 +LD R33 TS[0].1 +ST TH.19 R32 +LD R32 TS[1].1 +ST TH.20 R26 +ST TH.21 R27 +LD R27 TS[0].21 +ST TH.22 R20 +LD R20 TS[1].21 +ST TH.23 R21 +ST TH.24 R15 +LD R15 TS[0].0 +ST TH.25 R14 +LD R14 TS[1].0 +ST TH.26 R8 +ST TH.27 R9 +LD R9 TS[0].3 +ST TH.28 R5 +LD R5 TS[1].3 +ST TH.29 R2 +ADD R44 R42 R43 +LD R42 TS[0].2 +LD R43 TS[1].2 +ST TH.30 R41 +ADD R47 R45 R46 +LD R45 TS[0].9 +LD R46 TS[1].9 +ST TH.31 R44 +ADD R50 R48 R49 +LD R48 TS[0].5 +LD R49 TS[1].5 +ST TH.32 R47 +ADD R53 R51 R52 +LD R51 TS[0].4 +LD R52 TS[1].4 +ST TH.33 R50 +ADD R56 R54 R55 +LD R54 TS[0].8 +LD R55 TS[1].8 +ST TH.34 R53 +ADD R59 R57 R58 +ADD R62 R60 R61 +ADD R4 R63 R3 +ADD R38 R39 R40 +ADD R26 R33 R32 +ADD R21 R27 R20 +ADD R8 R15 R14 +ADD R2 R9 R5 +ADD R41 R42 R43 +ADD R44 R45 R46 +ADD R47 R48 R49 +ADD R50 R51 R52 +ADD R53 R54 R55 +MAC R57 R11 R7 2 +LD R58 TH.31 +LD R63 TH.32 +LD R3 TH.17 +ST TH.35 R41 +LD R39 TH.30 +ST TH.36 R21 +ST TH.37 R47 +ST TH.38 R53 +ST TH.39 R44 +ST TH.40 R50 +ST TH.41 R0 +LD R27 TH.35 +ST TH.42 R12 +ST TH.43 R13 +LD R9 TH.39 +ST TH.44 R16 +ST TH.45 R17 +LD R5 TH.37 +ST TH.46 R18 +ST TH.47 R19 +ST TH.48 R6 +LD R6 TH.40 +ST TH.49 R22 +ST TH.50 R23 +ST TH.51 R10 +LD R10 TH.38 +ST TH.52 R24 +ST TH.53 R25 +ST TH.54 R28 +LD R28 TH.33 +ST TH.55 R30 +ST TH.56 R31 +ST TH.57 R29 +LD R29 TH.36 +ST TH.58 R34 +ST TH.59 R35 +ST TH.60 R36 +LD R36 TH.34 +PBS_ML2 R60 R58 PbsManyGenProp +PBS_ML2 R32 R38 PbsManyGenProp +PBS_ML2 R14 R63 PbsManyGenProp +PBS_ML2 R42 R8 PbsManyGenProp +PBS_ML2 R48 R3 PbsManyGenProp +PBS_ML2 R54 R62 PbsManyGenProp +PBS_ML2 R40 R39 PbsManyGenProp +PBS_ML2 R20 R4 PbsManyGenProp +PBS_ML2 R46 R59 PbsManyGenProp +PBS_ML2 R52 R26 PbsManyGenProp +PBS_ML2 R44 R56 PbsManyGenProp +PBS_ML2_F R50 R2 PbsManyGenProp +LD R11 TH.45 +ST TH.61 
R37 +ST TH.62 R2 +LD R2 TH.53 +ST TH.63 R56 +LD R56 TH.59 +ST TH.64 R26 +ST TH.65 R59 +LD R59 TH.43 +ST TH.66 R4 +MAC R37 R11 R57 4 +MAC R26 R2 R56 2 +MAC R4 R59 R11 2 +MAC R2 R4 R57 4 +MAC R59 R33 R61 2 +LD R58 TH.57 +LD R62 TH.56 +ADDS R4 R42 0 +MAC R38 R47 R58 2 +MAC R63 R49 R59 4 +MAC R8 R21 R49 2 +MULS R3 R43 2 +ADDS R3 R3 0 +MAC R39 R62 R41 2 +MAC R42 R8 R59 4 +MAC R21 R53 R3 4 +PBS_ML2 R0 R27 PbsManyGenProp +PBS_ML2 R12 R9 PbsManyGenProp +PBS_ML2 R16 R5 PbsManyGenProp +PBS_ML2 R18 R6 PbsManyGenProp +PBS_ML2 R22 R10 PbsManyGenProp +PBS_ML2 R24 R28 PbsManyGenProp +PBS_ML2 R30 R29 PbsManyGenProp +PBS_ML2 R34 R36 PbsManyGenProp +PBS R11 R2 PbsReduceCarryPad +PBS R33 R4 PbsGenPropAdd +PBS R47 R3 PbsReduceCarry2 +PBS_F R49 R42 PbsReduceCarryPad +MAC R43 R1 R53 2 +ST TD[0].0 R33 +LD R29 TH.61 +MAC R8 R47 R52 4 +ADDS R27 R11 1 +MAC R9 R31 R39 4 +ADDS R5 R49 1 +MAC R6 R43 R3 4 +MAC R10 R45 R13 2 +MAC R28 R23 R25 2 +MAC R36 R29 R31 2 +MAC R2 R19 R51 2 +MAC R4 R35 R17 2 +MAC R1 R13 R28 4 +MAC R53 R10 R28 4 +MAC R47 R36 R39 4 +MAC R52 R17 R2 4 +MAC R11 R4 R2 4 +PBS R62 R21 PbsReduceCarry3 +PBS R42 R8 PbsGenPropAdd +PBS R33 R6 PbsReduceCarryPad +PBS R49 R53 PbsReduceCarryPad +PBS R43 R47 PbsReduceCarryPad +PBS_F R3 R11 PbsReduceCarryPad +MAC R45 R62 R0 4 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_8.asm b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_8.asm new file mode 100644 index 000000000..c02eee9cd --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_8.asm @@ -0,0 +1,19 @@ +; CUST_8 +; Simple IOp to check the ALU operation +; Dst[0].0 <- Src[0].0 + Src[1].0 +LD R1 TS[0].0 +LD R2 TS[1].0 +ADD R0 R1 R2 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Src[1].1 +LD R5 TS[0].1 +LD R6 TS[1].1 +SUB R4 R5 R6 +ST TD[0].1 R4 + +; Dst[0].2 <- Src[0].2 + (Src[1].2 *4) +LD R9 TS[0].2 +LD R10 TS[1].2 +MAC R8 R9 R10 4 +ST TD[0].2 R8 diff --git a/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_9.asm 
b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_9.asm new file mode 100644 index 000000000..5e5cc4129 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/custom_iop/cust_9.asm @@ -0,0 +1,21 @@ +; CUST_9 +; Simple IOp to check the ALU Scalar operation +; Dst[0].0 <- Src[0].0 + Imm[0].0 +LD R1 TS[0].0 +ADDS R0 R1 TI[0].0 +ST TD[0].0 R0 + +; Dst[0].1 <- Src[0].1 - Imm[0].1 +LD R5 TS[0].1 +SUBS R4 R5 TI[0].1 +ST TD[0].1 R4 + +; Dst[0].2 <- Imm[0].2 - Src[0].2 +LD R9 TS[0].2 +SSUB R8 R9 TI[0].2 +ST TD[0].2 R8 + +; Dst[0].3 <- Src[0].3 * Imm[0].3 +LD R13 TS[0].3 +MULS R12 R13 TI[0].3 +ST TD[0].3 R12 diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml new file mode 100644 index 000000000..a5fd2df84 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_config.toml @@ -0,0 +1,112 @@ + +[fpga] + regmap=["${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_cfg_3in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_1in3.toml", + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_regif_core_prc_3in3.toml"] + polling_us=10 +[fpga.ffi.V80] + ami_id=1 # First ami device in the list + qdma_h2c="/dev/qdma${V80_PCIE_DEV}001-MM-1" + qdma_c2h="/dev/qdma${V80_PCIE_DEV}001-MM-2" + +[rtl] + bpip_use = true + bpip_use_opportunism = true + bpip_timeout = 100_000 + +[board] + ct_mem = 32768 + ct_pc = [ + {Hbm= {pc=32}}, + {Hbm= {pc=33}}, + ] + heap_size = 16384 + + + lut_mem = 256 + lut_pc = {Hbm={pc=34}} + + fw_size= 16777216 # i.e. 
16 MiB + fw_pc = {Ddr= {offset= 0x3900_0000}} # NB: Allocation must take place in the discrete DDR + + bsk_pc = [ + {Hbm={pc=8}}, + {Hbm={pc=12}}, + {Hbm={pc=24}}, + {Hbm={pc=28}}, + {Hbm={pc=40}}, + {Hbm={pc=44}}, + {Hbm={pc=56}}, + {Hbm={pc=60}} + ] + + ksk_pc = [ + {Hbm={pc=0}}, + {Hbm={pc=1}}, + {Hbm={pc=2}}, + {Hbm={pc=3}}, + {Hbm={pc=4}}, + {Hbm={pc=5}}, + {Hbm={pc=6}}, + {Hbm={pc=7}}, + {Hbm={pc=16}}, + {Hbm={pc=17}}, + {Hbm={pc=18}}, + {Hbm={pc=19}}, + {Hbm={pc=20}}, + {Hbm={pc=21}}, + {Hbm={pc=22}}, + {Hbm={pc=23}} + ] + + trace_pc = {Hbm={pc=35}} + trace_depth = 32 # In MB + +[firmware] + #implementation = "Ilp" + implementation = "Llt" + integer_w=[2,4,6,8,10,12,14,16,32,64,128] + min_batch_size = 11 + kogge_cfg = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml" + custom_iop.'IOP[0]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_0.asm" + custom_iop.'IOP[1]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_1.asm" + custom_iop.'IOP[2]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_2.asm" + custom_iop.'IOP[3]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_3.asm" + custom_iop.'IOP[4]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_4.asm" + custom_iop.'IOP[8]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_8.asm" + custom_iop.'IOP[9]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_9.asm" + custom_iop.'IOP[16]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_16.asm" + custom_iop.'IOP[17]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_17.asm" + custom_iop.'IOP[18]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_18.asm" + custom_iop.'IOP[19]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_19.asm" + custom_iop.'IOP[20]' = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_20.asm" + custom_iop.'IOP[21]' =
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/custom_iop/cust_21.asm" + +[firmware.op_cfg.default] + fill_batch_fifo = true + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MUL] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.MULS] + fill_batch_fifo = false + min_batch_size = false + use_tiers = false + flush_behaviour = "Patient" + flush = true + +[firmware.op_cfg.by_op.ERC_20] + fill_batch_fifo = true + min_batch_size = false + use_tiers = true + flush_behaviour = "Patient" + flush = true diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_1in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_1in3.toml new file mode 100644 index 000000000..bfdb80263 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_1in3.toml @@ -0,0 +1,256 @@ +module_name="hpu_regif_core_cfg_1in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." +word_size_b = 32 +offset = 0x00 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_cfg_1in3] +description="entry_cfg_1in3 section with known value used for debug." 
+offset= 0x0 + +[section.entry_cfg_1in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x01010101} + +[section.entry_cfg_1in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x11111111} + +[section.entry_cfg_1in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x21212121} + + +[section.entry_cfg_1in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x31313131} + +# ===================================================================================================================== +[section.info] +description="RTL architecture parameters" +offset= 0x10 + +[section.info.register.version] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="VERSION"} + +[section.info.register.ntt_architecture] + description="NTT architecture" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="NTT_CORE_ARCH"} + +[section.info.register.ntt_structure] + description="NTT structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix = { size_b=8, offset_b=0 , default={Param="R"}, description="NTT radix"} + field.psi = { size_b=8, offset_b=8 , default={Param="PSI"}, description="NTT psi"} + field.div = { size_b=8, offset_b=16, default={Param="BWD_PSI_DIV"}, description="NTT backward div"} + field.delta = { size_b=8, offset_b=24, default={Param="DELTA"}, description="NTT network delta (for wmm arch)"} + +[section.info.register.ntt_rdx_cut] + description="NTT radix cuts, in log2 unit (for gf64 arch)" + owner="Parameter" + read_access="Read" + write_access="None" + field.radix_cut0 = { size_b=4, offset_b=0 , default={Param="NTT_RDX_CUT_S_0"}, description="NTT 
radix cut #0"} + field.radix_cut1 = { size_b=4, offset_b=4 , default={Param="NTT_RDX_CUT_S_1"}, description="NTT radix cut #1"} + field.radix_cut2 = { size_b=4, offset_b=8 , default={Param="NTT_RDX_CUT_S_2"}, description="NTT radix cut #2"} + field.radix_cut3 = { size_b=4, offset_b=12, default={Param="NTT_RDX_CUT_S_3"}, description="NTT radix cut #3"} + field.radix_cut4 = { size_b=4, offset_b=16, default={Param="NTT_RDX_CUT_S_4"}, description="NTT radix cut #4"} + field.radix_cut5 = { size_b=4, offset_b=20, default={Param="NTT_RDX_CUT_S_5"}, description="NTT radix cut #5"} + field.radix_cut6 = { size_b=4, offset_b=24, default={Param="NTT_RDX_CUT_S_6"}, description="NTT radix cut #6"} + field.radix_cut7 = { size_b=4, offset_b=28, default={Param="NTT_RDX_CUT_S_7"}, description="NTT radix cut #7"} + +[section.info.register.ntt_pbs] + description="Maximum number of PBS in the NTT pipeline" + owner="Parameter" + read_access="Read" + write_access="None" + field.batch_pbs_nb = { size_b=8, offset_b=0 , default={Param="BATCH_PBS_NB"}, description="Maximum number of PBS in the NTT pipe"} + field.total_pbs_nb = { size_b=8, offset_b=8 , default={Param="TOTAL_PBS_NB"}, description="Maximum number of PBS stored in PEP buffer"} + +[section.info.register.ntt_modulo] + description="Code associated to the NTT prime" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="MOD_NTT_NAME"} + +[section.info.register.application] + description="Code associated with the application" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="APPLICATION_NAME"} + +[section.info.register.ks_structure] + description="Key-switch structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.x = { size_b=8, offset_b=0 , default={Param="LBX"}, description="Number of coefficients on X dimension"} + field.y = { size_b=8, offset_b=8 , default={Param="LBY"}, description="Number of coefficients on Y dimension"} + field.z = { 
size_b=8, offset_b=16, default={Param="LBZ"}, description="Number of coefficients on Z dimension"} + +[section.info.register.ks_crypto_param] + description="Key-switch crypto parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.mod_ksk_w = { size_b=8, offset_b=0 , default={Param="MOD_KSK_W"}, description="Width of KSK modulo"} + field.ks_l = { size_b=8, offset_b=8 , default={Param="KS_L"}, description="Number of KS decomposition level"} + field.ks_b = { size_b=8, offset_b=16, default={Param="KS_B_W"}, description="Width of KS decomposition base"} + +[section.info.register.regf_structure] + description="Register file structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.reg_nb = { size_b=8, offset_b=0 , default={Param="REGF_REG_NB"}, description="Number of registers in regfile"} + field.coef_nb = { size_b=8, offset_b=8 , default={Param="REGF_COEF_NB"}, description="Number of coefficients at regfile interface"} + +[section.info.register.isc_structure] + description="Instruction scheduler structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.depth = { size_b=8, offset_b=0 , default={Param="ISC_DEPTH"}, description="Number of slots in ISC lookahead buffer."} + field.min_iop_size = { size_b=8, offset_b=8 , default={Param="MIN_IOP_SIZE"}, description="Minimum number of DOp per IOp to prevent sync_id overflow."} + +[section.info.register.pe_properties] + description="Processing elements parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.alu_nb = { size_b=8, offset_b=24 , default={Param="PEA_ALU_NB"}, description="Number of coefficients processed in parallel in pe_alu"} + field.pep_regf_period = { size_b=8, offset_b=16 , default={Param="PEP_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEP and regfile"} + field.pem_regf_period = { size_b=8, offset_b=8 , default={Param="PEM_REGF_PERIOD"}, 
description="Number of cycles between 2 consecutive data transfer between PEM and regfile"} + field.pea_regf_period = { size_b=8, offset_b=0 , default={Param="PEA_REGF_PERIOD"}, description="Number of cycles between 2 consecutive data transfer between PEA and regfile"} + +[section.info.register.bsk_structure] + description="BSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_cut_nb = { size_b=8, offset_b=8 , default={Param="BSK_CUT_NB"}, description="BSK cut nb"} + +[section.info.register.ksk_structure] + description="KSK manager structure parameters" + owner="Parameter" + read_access="Read" + write_access="None" + field.ksk_cut_nb = { size_b=8, offset_b=8 , default={Param="KSK_CUT_NB"}, description="KSK cut nb"} + +[section.info.register.hbm_axi4_nb] + description="Number of AXI4 connections to HBM" + owner="Parameter" + read_access="Read" + write_access="None" + field.bsk_pc = { size_b=8, offset_b=0 , default={Param="BSK_PC"}, description="Number of HBM connections for BSK"} + field.ksk_pc = { size_b=8, offset_b=8, default={Param="KSK_PC"}, description="Number of HBM connections for KSK"} + field.pem_pc = { size_b=8, offset_b=16, default={Param="PEM_PC"}, description="Number of HBM connections for ciphertexts (PEM)"} + field.glwe_pc = { size_b=8, offset_b=24, default={Param="GLWE_PC"}, description="Number of HBM connections for GLWE"} + +[section.info.register.hbm_axi4_dataw_pem] + description="Ciphertext HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_PEM_DATA_W"} + +[section.info.register.hbm_axi4_dataw_glwe] + description="GLWE HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_GLWE_DATA_W"} + +[section.info.register.hbm_axi4_dataw_bsk] + description="BSK HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + 
default={Param="AXI4_BSK_DATA_W"} + +[section.info.register.hbm_axi4_dataw_ksk] + description="KSK HBM AXI4 connection data width" + owner="Parameter" + read_access="Read" + write_access="None" + default={Param="AXI4_KSK_DATA_W"} + + +# ===================================================================================================================== +[section.hbm_axi4_addr_1in3] +offset= 0x1000 +description="HBM AXI4 connection address offset" + +[section.hbm_axi4_addr_1in3.register.ct] + description="Address offset for each ciphertext HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb","_pc1_lsb", "_pc1_msb"] + +[section.hbm_axi4_addr_1in3.register.glwe] + description="Address offset for each GLWE HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + + +[section.hbm_axi4_addr_1in3.register.ksk] + description="Address offset for each KSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"] + + [section.hbm_axi4_addr_1in3.register.trc] + description="Address offset for each trace HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb"] + +# ===================================================================================================================== +[section.bpip] +offset= 0x2000 +description="BPIP configuration" + +[section.bpip.register.use] + description="(1) Use BPIP mode, (0) use IPIP mode (default)" + owner="User" + read_access="Read" + 
write_access="Write" + field.use_bpip = { size_b=1, offset_b=0 , default={Cst=1}, description="use"} + field.use_opportunism = { size_b=1, offset_b=1 , default={Cst=0}, description="use opportunistic PBS flush"} + +[section.bpip.register.timeout] + description="Timeout for BPIP mode" + owner="User" + read_access="Read" + write_access="Write" + default={Cst=0xffffffff} diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml new file mode 100644 index 000000000..4afc095ab --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_cfg_3in3.toml @@ -0,0 +1,51 @@ +module_name="hpu_regif_core_cfg_3in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." +word_size_b = 32 +offset = 0x20000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_cfg_3in3] +description="entry_cfg_3in3 section with known value used for debug." 
+offset= 0x0 + +[section.entry_cfg_3in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x03030303} + +[section.entry_cfg_3in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x13131313} + +[section.entry_cfg_3in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x23232323} + +[section.entry_cfg_3in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x33333333} + +# ===================================================================================================================== +[section.hbm_axi4_addr_3in3] +description="HBM AXI4 connection address offset" +offset= 0x10 + +[section.hbm_axi4_addr_3in3.register.bsk] + description="Address offset for each BSK HBM AXI4 connection" + owner="User" + read_access="Read" + write_access="Write" + duplicate=["_pc0_lsb", "_pc0_msb", "_pc1_lsb", "_pc1_msb", "_pc2_lsb", "_pc2_msb", "_pc3_lsb", "_pc3_msb", "_pc4_lsb", "_pc4_msb", "_pc5_lsb", "_pc5_msb", "_pc6_lsb", "_pc6_msb", "_pc7_lsb", "_pc7_msb", "_pc8_lsb", "_pc8_msb", "_pc9_lsb", "_pc9_msb", "_pc10_lsb", "_pc10_msb", "_pc11_lsb", "_pc11_msb", "_pc12_lsb", "_pc12_msb", "_pc13_lsb", "_pc13_msb", "_pc14_lsb", "_pc14_msb", "_pc15_lsb", "_pc15_msb"] diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_1in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_1in3.toml new file mode 100644 index 000000000..ef20175f8 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_1in3.toml @@ -0,0 +1,336 @@ +module_name="hpu_regif_core_prc_1in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." 
+word_size_b = 32 +offset = 0x10000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_prc_1in3] +description="entry_prc_1in3 section with known value used for debug." +offset= 0x0 + +[section.entry_prc_1in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x02020202} + +[section.entry_prc_1in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x12121212} + +[section.entry_prc_1in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x22222222} + +[section.entry_prc_1in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x32323232} + +# ===================================================================================================================== +[section.status_1in3] +description="HPU status of part 1in3" +offset= 0x10 + +[section.status_1in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 1in3"} + +# ===================================================================================================================== +[section.ksk_avail] +description="KSK availability configuration" +offset= 0x1000 + +[section.ksk_avail.register.avail] + description="KSK available bit" + owner="User" + read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.ksk_avail.register.reset] + description="KSK reset sequence" + owner="Kernel" + read_access="Read" + 
write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.runtime_1in3] +description="Runtime information" +offset= 0x2000 + +[section.runtime_1in3.register.pep_cmux_loop] + description="PEP: CMUX iteration loop number" + owner="Kernel" + read_access="Read" + write_access="None" + field.br_loop = { size_b=15, offset_b=0 , default={Cst=0}, description="PBS current BR-loop"} + field.br_loop_c = { size_b=1, offset_b=15 , default={Cst=0}, description="PBS current BR-loop parity"} + field.ks_loop = { size_b=15, offset_b=16 , default={Cst=0}, description="KS current KS-loop"} + field.ks_loop_c = { size_b=1, offset_b=31 , default={Cst=0}, description="KS current KS-loop parity"} + +[section.runtime_1in3.register.pep_pointer_0] + description="PEP: pointers (part 1)" + owner="Kernel" + read_access="Read" + write_access="None" + field.pool_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pool_rp"} + field.pool_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pool_wp"} + field.ldg_pt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ldg_pt"} + field.ldb_pt = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ldb_pt"} + +[section.runtime_1in3.register.pep_pointer_1] + description="PEP: pointers (part 2)" + owner="Kernel" + read_access="Read" + write_access="None" + field.ks_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP ks_in_rp"} + field.ks_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP ks_in_wp"} + field.ks_out_rp = { size_b=8, offset_b=16 , default={Cst=0}, description="PEP ks_out_rp"} + field.ks_out_wp = { size_b=8, offset_b=24 , default={Cst=0}, description="PEP ks_out_wp"} + +[section.runtime_1in3.register.pep_pointer_2] + 
description="PEP: pointers (part 3)" + owner="Kernel" + read_access="Read" + write_access="None" + field.pbs_in_rp = { size_b=8, offset_b=0 , default={Cst=0}, description="PEP pbs_in_rp"} + field.pbs_in_wp = { size_b=8, offset_b=8 , default={Cst=0}, description="PEP pbs_in_wp"} + field.ipip_flush_last_pbs_in_loop = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP IPIP flush last pbs_in_loop"} + +[section.runtime_1in3.register.isc_latest_instruction] + description="ISC: 4 latest instructions received ([0] is the most recent)" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_0","_1","_2","_3"] + +[section.runtime_1in3.register.pep_seq_bpip_batch_cnt] + description="PEP: BPIP batch counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_flush_cnt] + description="PEP: BPIP batch triggered by a flush counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_timeout_cnt] + description="PEP: BPIP batch triggered by a timeout counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_waiting_batch_cnt] + description="PEP: BPIP batch that waits the trigger counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_bpip_batch_filling_cnt] + description="PEP: Count of batches filled with a given number of CT (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_1","_2","_3","_4","_5","_6","_7","_8","_9","_10","_11","_12","_13","_14","_15","_16"] + +[section.runtime_1in3.register.pep_seq_ld_ack_cnt] + description="PEP: load BLWE ack counter (Could be reset by user)" + owner="Kernel" + read_access="Read" +
write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_cmux_not_full_batch_cnt] + description="PEP: not full batch CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_seq_ipip_flush_cnt] + description="PEP: IPIP flush CMUX counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldb_rcp_dur] + description="PEP: load BLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_req_dur] + description="PEP: load GLWE request max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ldg_rcp_dur] + description="PEP: load GLWE reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_load_ksk_rcp_dur] + description="PEP: load KSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + + +[section.runtime_1in3.register.pep_mmacc_sxt_rcp_dur] + description="PEP: MMACC SXT reception duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_req_dur] + description="PEP: MMACC SXT request duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_mmacc_sxt_cmd_wait_b_dur] + description="PEP: MMACC SXT command wait for b duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + 
+[section.runtime_1in3.register.pep_inst_cnt] + description="PEP: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pep_ack_cnt] + description="PEP: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_inst_cnt] + description="PEM: load input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_ack_cnt] + description="PEM: load instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_inst_cnt] + description="PEM: store input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_store_ack_cnt] + description="PEM: store instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_inst_cnt] + description="PEA: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pea_ack_cnt] + description="PEA: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_inst_cnt] + description="ISC: input instruction counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.isc_ack_cnt] + description="ISC: instruction acknowledge counter (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + +[section.runtime_1in3.register.pem_load_info_0] + 
description="PEM: load first data" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_0","_pc0_1","_pc0_2","_pc0_3","_pc1_0","_pc1_1","_pc1_2","_pc1_3"] + +[section.runtime_1in3.register.pem_load_info_1] + description="PEM: load first address" + owner="Kernel" + read_access="Read" + write_access="None" + duplicate=["_pc0_lsb","_pc0_msb","_pc1_lsb","_pc1_msb"] + +[section.runtime_1in3.register.pem_store_info_0] + description="PEM: store info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.cmd_vld = { size_b=1, offset_b=0 , default={Cst=0}, description="PEM_ST cmd vld"} + field.cmd_rdy = { size_b=1, offset_b=1 , default={Cst=0}, description="PEM_ST cmd rdy"} + field.pem_regf_rd_req_vld = { size_b=1, offset_b=2 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_vld"} + field.pem_regf_rd_req_rdy = { size_b=1, offset_b=3 , default={Cst=0}, description="PEM_ST pem_regf_rd_req_rdy"} + field.brsp_fifo_in_vld = { size_b=4, offset_b=4 , default={Cst=0}, description="PEM_ST brsp_fifo_in_vld"} + field.brsp_fifo_in_rdy = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST brsp_fifo_in_rdy"} + field.rcp_fifo_in_vld = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST rcp_fifo_in_vld"} + field.rcp_fifo_in_rdy = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST rcp_fifo_in_rdy"} + field.r2_axi_vld = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST r2_axi_vld"} + field.r2_axi_rdy = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST r2_axi_rdy"} + field.c0_enough_location = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST c0_enough_location"} + +[section.runtime_1in3.register.pem_store_info_1] + description="PEM: store info 1" + owner="Kernel" + read_access="Read" + write_access="None" + field.s0_cmd_vld = { size_b=4, offset_b=0 , default={Cst=0}, description="PEM_ST s0_cmd_vld"} + field.s0_cmd_rdy = { size_b=4, offset_b=4 , default={Cst=0},
description="PEM_ST s0_cmd_rdy"} + field.m_axi_bvalid = { size_b=4, offset_b=8 , default={Cst=0}, description="PEM_ST m_axi_bvalid"} + field.m_axi_bready = { size_b=4, offset_b=12 , default={Cst=0}, description="PEM_ST m_axi_bready"} + field.m_axi_wvalid = { size_b=4, offset_b=16 , default={Cst=0}, description="PEM_ST m_axi_wvalid"} + field.m_axi_wready = { size_b=4, offset_b=20 , default={Cst=0}, description="PEM_ST m_axi_wready"} + field.m_axi_awvalid = { size_b=4, offset_b=24 , default={Cst=0}, description="PEM_ST m_axi_awvalid"} + field.m_axi_awready = { size_b=4, offset_b=28 , default={Cst=0}, description="PEM_ST m_axi_awready"} + +[section.runtime_1in3.register.pem_store_info_2] + description="PEM: store info 2" + owner="Kernel" + read_access="Read" + write_access="None" + field.c0_free_loc_cnt = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST c0_free_loc_cnt"} + field.brsp_bresp_cnt = { size_b=16, offset_b=16 , default={Cst=0}, description="PEM_ST brsp_bresp_cnt"} + +[section.runtime_1in3.register.pem_store_info_3] + description="PEM: store info 3" + owner="Kernel" + read_access="Read" + write_access="None" + field.brsp_ack_seen = { size_b=16, offset_b=0 , default={Cst=0}, description="PEM_ST brsp_ack_seen"} + field.c0_cmd_cnt = { size_b=8, offset_b=16 , default={Cst=0}, description="PEM_ST c0_cmd_cnt"} diff --git a/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_3in3.toml b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_3in3.toml new file mode 100644 index 000000000..627f140c1 --- /dev/null +++ b/backends/tfhe-hpu-backend/config_store/v80/hpu_regif_core_prc_3in3.toml @@ -0,0 +1,100 @@ +module_name="hpu_regif_core_prc_3in3" +description="HPU top-level register interface. Used by the host to retrieve design information, and to configure it." 
+word_size_b = 32 +offset = 0x30000 +range = 0x10000 +ext_pkg = ["axi_if_common_param_pkg", "axi_if_shell_axil_pkg"] + +# ===================================================================================================================== +[section.entry_prc_3in3] +description="entry_prc_3in3 section with known value used for debug." +offset= 0x0 + +[section.entry_prc_3in3.register.dummy_val0] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x04040404} + +[section.entry_prc_3in3.register.dummy_val1] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x14141414} + +[section.entry_prc_3in3.register.dummy_val2] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x24242424} + +[section.entry_prc_3in3.register.dummy_val3] + description="RTL version" + owner="Parameter" + read_access="Read" + write_access="None" + default={Cst=0x34343434} + +# ===================================================================================================================== +[section.status_3in3] +description="HPU status of parts 2in3 and 3in3" +offset= 0x10 + +[section.status_3in3.register.error] + description="Error register (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + field.pbs = { size_b=32, offset_b=0 , default={Cst=0}, description="HPU error part 3in3"} + +# ===================================================================================================================== +[section.bsk_avail] +description="BSK availability configuration" +offset= 0x1000 + +[section.bsk_avail.register.avail] + description="BSK available bit" + owner="User" + read_access="Read" + write_access="Write" + field.avail = { size_b=1, offset_b=0 , default={Cst=0}, description="avail"} + +[section.bsk_avail.register.reset] + description="BSK reset sequence" + owner="Kernel" + 
read_access="Read" + write_access="WriteNotify" + field.request = { size_b=1, offset_b=0 , default={Cst=0}, description="request"} + field.done = { size_b=1, offset_b=31 , default={Cst=0}, description="done"} + +# ===================================================================================================================== +[section.runtime_3in3] +description="Runtime information" +offset= 0x2000 + +[section.runtime_3in3.register.pep_load_bsk_rcp_dur] + description="PEP: load BSK slice reception max duration (Could be reset by user)" + owner="Kernel" + read_access="Read" + write_access="WriteNotify" + duplicate=["_pc0","_pc1","_pc2","_pc3","_pc4","_pc5","_pc6","_pc7","_pc8","_pc9","_pc10","_pc11","_pc12","_pc13","_pc14","_pc15"] + +[section.runtime_3in3.register.pep_bskif_req_info_0] + description="PEP: BSK_IF: requester info 0" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_br_loop_rp = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK read pointer"} + field.req_br_loop_wp = { size_b=16, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK write pointer"} + +[section.runtime_3in3.register.pep_bskif_req_info_1] + description="PEP: BSK_IF: requester info 1" + owner="Kernel" + read_access="Read" + write_access="None" + field.req_prf_br_loop = { size_b=16, offset_b=0 , default={Cst=0}, description="PEP BSK_IF requester BSK prefetch pointer"} + field.req_parity = { size_b=1, offset_b=16 , default={Cst=0}, description="PEP BSK_IF requester BSK pointer parity"} + field.req_assigned = { size_b=1, offset_b=31 , default={Cst=0}, description="PEP BSK_IF requester assignment"} diff --git a/backends/tfhe-hpu-backend/figures/tfhe-hpu-backend.excalidraw.png b/backends/tfhe-hpu-backend/figures/tfhe-hpu-backend.excalidraw.png new file mode 100644 index 000000000..ea0f7571e Binary files /dev/null and b/backends/tfhe-hpu-backend/figures/tfhe-hpu-backend.excalidraw.png differ diff --git 
a/backends/tfhe-hpu-backend/python/README b/backends/tfhe-hpu-backend/python/README new file mode 100644 index 000000000..9a5d3893f --- /dev/null +++ b/backends/tfhe-hpu-backend/python/README @@ -0,0 +1,12 @@ +This contains a small library to read trace files retrieved from the hardware or the mockup. + +To run, please add the lib directory to your PYTHONPATH: + +export PYTHONPATH=$(readlink -m ./lib) + +Make sure you start from a fresh python virtual environment and install the requirements in +requirements.txt: + +python -m venv new_env +source new_env/bin/activate +pip3 install -r requirements.txt diff --git a/backends/tfhe-hpu-backend/python/bin/demo.py b/backends/tfhe-hpu-backend/python/bin/demo.py new file mode 100755 index 000000000..c00bdddb4 --- /dev/null +++ b/backends/tfhe-hpu-backend/python/bin/demo.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +from pandas import DataFrame +from isctrace.analysis import Refilled, Retired, Trace + +freq_mhz = 300 + +iops = Trace.from_hw("data/trace.json") + +def analyze_iop(iop): + retired = Retired(iop) + + # Print the retired instructions as a table + print(retired.to_df().to_string()) + + # Print a batch latency table + latency_table = retired.pbs_latency_table(freq_mhz=freq_mhz).drop(columns='data') + print(latency_table) + + # And the runtime + runtime = retired.runtime_us(freq_mhz=freq_mhz) + print(f"batches: {latency_table['count'].sum()}") + print(f"Runtime: {runtime}us") + +if __name__ == "__main__": + analyze_iop(iops[0]) + +# vim: fdm=marker diff --git a/backends/tfhe-hpu-backend/python/data/trace.json b/backends/tfhe-hpu-backend/python/data/trace.json new file mode 100644 index 000000000..065585eb7 --- /dev/null +++ b/backends/tfhe-hpu-backend/python/data/trace.json @@ -0,0 +1,7628 @@ +[ + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": 
"MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 1350109813 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109832 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 1350109839 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109858 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109884 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109910 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109936 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 1350109943 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109962 + }, + { + "state": { + "pdg": 
false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R6 @0x5 ", + "timestamp": 1350109969 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350109988 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110014 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110040 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110066 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 1350110073 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110092 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 1350110099 + }, + { + "state": { + "pdg": 
false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110118 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110144 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110170 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110196 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 1350110203 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 1350110216 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 1350110235 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110254 + }, + { + "state": { + "pdg": 
false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 16, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 1350110261 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110280 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110306 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110332 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110358 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110384 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110410 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110436 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110462 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + 
"vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110488 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110514 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 1350110534 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 1350110638 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 1350110657 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 1350110664 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 1350110915 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, 
+ "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 1350110934 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R6 @0x5 ", + "timestamp": 1350111192 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R6 @0x5 ", + "timestamp": 1350111211 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 1350111218 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 1350111490 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 1350111509 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + 
"LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 1350111928 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 1350111947 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 1350111954 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 1350112212 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 1350112231 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 16, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 1350112524 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 16, + 
"slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 1350112543 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 1350112550 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 1350112794 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 1350112813 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 1350112820 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 1350112924 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 
7 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 1350114869 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 1350114888 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 1350114895 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 1350114999 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 1350116951 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 1350116970 + }, + { + 
"state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 1350116977 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 1350117081 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 1350119033 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 1350119052 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS_F": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R18 R17 PbsCmpSign ", + "timestamp": 1350119059 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS_F": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 8 + } 
+ } + }, + "insn_asm": "PBS_F R18 R17 PbsCmpSign ", + "timestamp": 1350119156 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 1350407127 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 1350407134 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 1350407209 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 1350407216 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 1350407298 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": { 
+ "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 1350407305 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS_F": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R18 R17 PbsCmpSign ", + "timestamp": 1350407380 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 1 ", + "timestamp": 1350407387 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 1350409262 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 1350409281 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 1350411338 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 
7 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 1350411357 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 1350411364 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 1350413421 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 1350413440 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 1 ", + "timestamp": 1350415504 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 1 ", + "timestamp": 
1350415523 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 1350415530 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 1350417580 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 1350417599 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 1350417606 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 1350417710 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + 
"mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 1350419662 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 1350419681 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS_F": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R23 R22 PbsCmpReduce ", + "timestamp": 1350419688 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS_F": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R23 R22 PbsCmpReduce ", + "timestamp": 1350419785 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 1350707770 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS_F": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R23 R22 PbsCmpReduce ", + "timestamp": 1350707789 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 
7 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + "src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 1350707796 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + "src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 1350709923 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + "src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 1350709942 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "PBS_F": { + "dst_rid": 25, + "src_rid": 24, + "gid": 29, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R25 R24 PbsCmpLtMrg ", + "timestamp": 1350709949 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS_F": { + "dst_rid": 25, + "src_rid": 24, + "gid": 29, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R25 R24 PbsCmpLtMrg ", + "timestamp": 1350710046 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "PBS_F": { + "dst_rid": 25, + "src_rid": 24, + "gid": 29, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R25 R24 PbsCmpLtMrg ", + "timestamp": 
1350997905 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "ST": { + "rid": 25, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R25 ", + "timestamp": 1350997912 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RDUNLOCK", + "insn": { + "ST": { + "rid": 25, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R25 ", + "timestamp": 1350998338 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "RETIRE", + "insn": { + "ST": { + "rid": 25, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R25 ", + "timestamp": 1350998357 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "ISSUE", + "insn": { + "SYNC": { + "sid": 65535, + "opcode": { + "optype": "SYNC", + "subtype": 0 + } + } + }, + "insn_asm": "SYNC 65535 ", + "timestamp": 1350998364 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481733 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x3 ", + "timestamp": 503481740 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 
503481759 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x7 ", + "timestamp": 503481766 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481785 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481811 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481837 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481863 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481889 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481915 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 7, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R7 @0x2 ", + "timestamp": 503481922 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + 
}, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481941 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 8, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R8 @0x6 ", + "timestamp": 503481948 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481967 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503481993 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482019 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482045 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482071 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482097 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 14, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R14 @0x1 ", + "timestamp": 503482104 + }, + { + "state": { + "pdg": false, + 
"rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482123 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x5 ", + "timestamp": 503482130 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x3 ", + "timestamp": 503482143 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x3 ", + "timestamp": 503482162 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482181 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482207 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482233 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482259 + }, + { + "state": { + "pdg": false, + "rd_pdg": 
false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482285 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482311 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 21, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R21 @0x0 ", + "timestamp": 503482318 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482337 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 22, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R22 @0x4 ", + "timestamp": 503482344 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482363 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482389 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482415 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 1, 
+ "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x7 ", + "timestamp": 503482428 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x7 ", + "timestamp": 503482447 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482466 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 503482473 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482492 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482518 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482544 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 503482564 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + 
"LD": { + "rid": 7, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R7 @0x2 ", + "timestamp": 503482710 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 7, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R7 @0x2 ", + "timestamp": 503482729 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 8, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R8 @0x6 ", + "timestamp": 503483141 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 8, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R8 @0x6 ", + "timestamp": 503483160 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 9, + "src0_rid": 7, + "src1_rid": 8, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R9 R7 R8 ", + "timestamp": 503483167 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 14, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R14 @0x1 ", + "timestamp": 503483439 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 14, + "slot": { + "Addr": 
1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R14 @0x1 ", + "timestamp": 503483458 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x5 ", + "timestamp": 503483723 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x5 ", + "timestamp": 503483742 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 16, + "src0_rid": 14, + "src1_rid": 15, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R16 R14 R15 ", + "timestamp": 503483749 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 21, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R21 @0x0 ", + "timestamp": 503484021 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 21, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R21 @0x0 ", + "timestamp": 503484040 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 22, + "slot": { + "Addr": 4 + }, + "opcode": { + 
"optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R22 @0x4 ", + "timestamp": 503484466 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 22, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R22 @0x4 ", + "timestamp": 503484485 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 23, + "src0_rid": 21, + "src1_rid": 22, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R23 R21 R22 ", + "timestamp": 503484492 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 503484603 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 503484622 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 503484629 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 
3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 503484733 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 9, + "src0_rid": 7, + "src1_rid": 8, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R9 R7 R8 ", + "timestamp": 503486678 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 9, + "src0_rid": 7, + "src1_rid": 8, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R9 R7 R8 ", + "timestamp": 503486697 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 10, + "src_rid": 9, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R10 R9 PbsCmpSign ", + "timestamp": 503486704 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 10, + "src_rid": 9, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R10 R9 PbsCmpSign ", + "timestamp": 503486808 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 16, + "src0_rid": 14, + "src1_rid": 15, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R16 R14 R15 ", + "timestamp": 503488760 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 
0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 16, + "src0_rid": 14, + "src1_rid": 15, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R16 R14 R15 ", + "timestamp": 503488779 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 17, + "src_rid": 16, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R17 R16 PbsCmpSign ", + "timestamp": 503488786 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 17, + "src_rid": 16, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R17 R16 PbsCmpSign ", + "timestamp": 503488890 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 23, + "src0_rid": 21, + "src1_rid": 22, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R23 R21 R22 ", + "timestamp": 503490842 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 23, + "src0_rid": 21, + "src1_rid": 22, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R23 R21 R22 ", + "timestamp": 503490861 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 24, + "src_rid": 23, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R24 R23 PbsCmpSign ", + "timestamp": 503490868 + }, 
+ { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 24, + "src_rid": 23, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R24 R23 PbsCmpSign ", + "timestamp": 503490965 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 503872701 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 503872708 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 10, + "src_rid": 9, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R10 R9 PbsCmpSign ", + "timestamp": 503872776 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 11, + "src_rid": 10, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R11 R10 1 ", + "timestamp": 503872783 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 17, + "src_rid": 16, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS 
R17 R16 PbsCmpSign ", + "timestamp": 503872858 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 18, + "src_rid": 17, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R18 R17 1 ", + "timestamp": 503872865 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 24, + "src_rid": 23, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R24 R23 PbsCmpSign ", + "timestamp": 503872940 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 25, + "src_rid": 24, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R25 R24 1 ", + "timestamp": 503872947 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 503874836 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 503874855 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 5, + "src_rid": 4, + "msg_cst": { + 
"Cst": 4 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R5 R4 4 ", + "timestamp": 503874862 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 11, + "src_rid": 10, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R11 R10 1 ", + "timestamp": 503876912 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 11, + "src_rid": 10, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R11 R10 1 ", + "timestamp": 503876931 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 18, + "src_rid": 17, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R18 R17 1 ", + "timestamp": 503878995 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 18, + "src_rid": 17, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R18 R17 1 ", + "timestamp": 503879014 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 25, + "src_rid": 24, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R25 R24 1 ", + "timestamp": 503881078 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, 
+ "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 25, + "src_rid": 24, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R25 R24 1 ", + "timestamp": 503881097 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 5, + "src_rid": 4, + "msg_cst": { + "Cst": 4 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R5 R4 4 ", + "timestamp": 503883154 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 5, + "src_rid": 4, + "msg_cst": { + "Cst": 4 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R5 R4 4 ", + "timestamp": 503883173 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 6, + "src_rid": 5, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R6 R5 PbsCmpReduce ", + "timestamp": 503883180 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 6, + "src_rid": 5, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R6 R5 PbsCmpReduce ", + "timestamp": 503883277 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 6, + "src_rid": 5, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R6 R5 PbsCmpReduce ", + "timestamp": 504270837 + }, + { + "state": { + "pdg": false, + 
"rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 12, + "src0_rid": 6, + "src1_rid": 11, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R12 R6 R11 4 ", + "timestamp": 504270844 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 12, + "src0_rid": 6, + "src1_rid": 11, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R12 R6 R11 4 ", + "timestamp": 504272971 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 12, + "src0_rid": 6, + "src1_rid": 11, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R12 R6 R11 4 ", + "timestamp": 504272990 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpReduce ", + "timestamp": 504272997 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpReduce ", + "timestamp": 504273094 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + 
"insn_asm": "PBS R13 R12 PbsCmpReduce ", + "timestamp": 504660668 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 19, + "src0_rid": 13, + "src1_rid": 18, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R19 R13 R18 4 ", + "timestamp": 504660675 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 19, + "src0_rid": 13, + "src1_rid": 18, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R19 R13 R18 4 ", + "timestamp": 504662802 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 19, + "src0_rid": 13, + "src1_rid": 18, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R19 R13 R18 4 ", + "timestamp": 504662821 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 20, + "src_rid": 19, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R20 R19 PbsCmpReduce ", + "timestamp": 504662828 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 20, + "src_rid": 19, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R20 R19 PbsCmpReduce ", + "timestamp": 504662925 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + 
"dst_rid": 20, + "src_rid": 19, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R20 R19 PbsCmpReduce ", + "timestamp": 505050499 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 26, + "src0_rid": 20, + "src1_rid": 25, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R26 R20 R25 4 ", + "timestamp": 505050506 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 26, + "src0_rid": 20, + "src1_rid": 25, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R26 R20 R25 4 ", + "timestamp": 505052633 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 26, + "src0_rid": 20, + "src1_rid": 25, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R26 R20 R25 4 ", + "timestamp": 505052652 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 27, + "src_rid": 26, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R27 R26 PbsCmpReduce ", + "timestamp": 505052659 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 27, + "src_rid": 26, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R27 R26 PbsCmpReduce ", + "timestamp": 505052756 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + 
"vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 27, + "src_rid": 26, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R27 R26 PbsCmpReduce ", + "timestamp": 505440337 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 28, + "src_rid": 27, + "gid": 12, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R28 R27 PbsCmpGt ", + "timestamp": 505440344 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 28, + "src_rid": 27, + "gid": 12, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R28 R27 PbsCmpGt ", + "timestamp": 505440441 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 28, + "src_rid": 27, + "gid": 12, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R28 R27 PbsCmpGt ", + "timestamp": 505828029 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "ST": { + "rid": 28, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R28 ", + "timestamp": 505828036 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RDUNLOCK", + "insn": { + "ST": { + "rid": 28, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R28 ", + "timestamp": 505828462 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + 
"vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "RETIRE", + "insn": { + "ST": { + "rid": 28, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R28 ", + "timestamp": 505828481 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 8 + }, + "cmd": "ISSUE", + "insn": { + "SYNC": { + "sid": 65535, + "opcode": { + "optype": "SYNC", + "subtype": 0 + } + } + }, + "insn_asm": "SYNC 65535 ", + "timestamp": 505828488 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809042997 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 2809043004 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043023 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 2809043030 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043049 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, 
+ "timestamp": 2809043075 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 6 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043101 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043127 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 2809043134 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043153 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R6 @0x5 ", + "timestamp": 2809043160 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043179 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043205 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043231 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + 
"rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043257 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 2809043264 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043283 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 2809043290 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043309 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043335 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043361 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043387 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + 
"opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 2809043394 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 2809043407 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 0, + "slot": { + "Addr": 0 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R0 @0x0 ", + "timestamp": 2809043426 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043445 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "LD": { + "rid": 16, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 2809043452 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043471 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043497 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043523 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + 
"rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043549 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043575 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043601 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043627 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043653 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043679 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 2809043692 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 1, + "slot": { + "Addr": 4 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R1 @0x4 ", + "timestamp": 2809043711 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043730 + }, 
+ { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 2809043737 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043756 + }, + { + "state": { + "pdg": false, + "rd_pdg": false, + "vld": false, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 7 + }, + "cmd": "REFILL", + "insn": null, + "insn_asm": null, + "timestamp": 2809043776 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 2809044104 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 5, + "slot": { + "Addr": 1 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R5 @0x1 ", + "timestamp": 2809044123 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R6 @0x5 ", + "timestamp": 2809044381 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 6, + "slot": { + "Addr": 5 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + 
"insn_asm": "LD R6 @0x5 ", + "timestamp": 2809044400 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 2809044407 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 2809044672 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 10, + "slot": { + "Addr": 2 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R10 @0x2 ", + "timestamp": 2809044691 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 2809045103 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 11, + "slot": { + "Addr": 6 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R11 @0x6 ", + "timestamp": 2809045122 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, 
+ "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 2809045129 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 2809045380 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 15, + "slot": { + "Addr": 3 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R15 @0x3 ", + "timestamp": 2809045399 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "LD": { + "rid": 16, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 2809045692 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "LD": { + "rid": 16, + "slot": { + "Addr": 7 + }, + "opcode": { + "optype": "MEM", + "subtype": 0 + } + } + }, + "insn_asm": "LD R16 @0x7 ", + "timestamp": 2809045711 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 2809045718 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + 
} + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 2809045864 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 2, + "src0_rid": 0, + "src1_rid": 1, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R2 R0 R1 ", + "timestamp": 2809045883 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 2809045890 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 2809045994 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 2809047946 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 7, + "src0_rid": 5, + "src1_rid": 6, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R7 R5 R6 ", + "timestamp": 2809047965 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, 
+ "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 2809047972 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 2809048076 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 2809050028 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 12, + "src0_rid": 10, + "src1_rid": 11, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R12 R10 R11 ", + "timestamp": 2809050047 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 2809050054 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 2809050158 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, 
+ "cmd": "RDUNLOCK", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 2809052103 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "SUB": { + "dst_rid": 17, + "src0_rid": 15, + "src1_rid": 16, + "mul_factor": 0, + "opcode": { + "optype": "ARITH", + "subtype": 2 + } + } + }, + "insn_asm": "SUB R17 R15 R16 ", + "timestamp": 2809052122 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R18 R17 PbsCmpSign ", + "timestamp": 2809052129 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R18 R17 PbsCmpSign ", + "timestamp": 2809052226 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 3, + "src_rid": 2, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R3 R2 PbsCmpSign ", + "timestamp": 2809434193 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 8, + "src_rid": 7, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R8 R7 PbsCmpSign ", + "timestamp": 2809434212 + }, + { + "state": { + "pdg": true, + "rd_pdg": 
false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 13, + "src_rid": 12, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R13 R12 PbsCmpSign ", + "timestamp": 2809434231 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 18, + "src_rid": 17, + "gid": 10, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R18 R17 PbsCmpSign ", + "timestamp": 2809434250 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 2809434257 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 2809434264 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 2809434271 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 
1 ", + "timestamp": 2809434278 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 2809436384 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 4, + "src_rid": 3, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R4 R3 1 ", + "timestamp": 2809436403 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 2809438467 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 9, + "src_rid": 8, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R9 R8 1 ", + "timestamp": 2809438486 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 2809438493 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": 
{ + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 2809440543 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 14, + "src_rid": 13, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R14 R13 1 ", + "timestamp": 2809440562 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 1 ", + "timestamp": 2809442626 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "ADDS": { + "dst_rid": 19, + "src_rid": 18, + "msg_cst": { + "Cst": 1 + }, + "opcode": { + "optype": "ARITH", + "subtype": 9 + } + } + }, + "insn_asm": "ADDS R19 R18 1 ", + "timestamp": 2809442645 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 2809442652 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 2809444702 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + 
"wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 20, + "src0_rid": 9, + "src1_rid": 4, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R20 R9 R4 4 ", + "timestamp": 2809444721 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 2809444728 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 2809444832 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 2809446784 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 22, + "src0_rid": 19, + "src1_rid": 14, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R22 R19 R14 4 ", + "timestamp": 2809446803 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R23 R22 
PbsCmpReduce ", + "timestamp": 2809446810 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R23 R22 PbsCmpReduce ", + "timestamp": 2809446907 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 21, + "src_rid": 20, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R21 R20 PbsCmpReduce ", + "timestamp": 2809832535 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 23, + "src_rid": 22, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R23 R22 PbsCmpReduce ", + "timestamp": 2809832617 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + "src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 2809832624 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + "src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 2809834751 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "MAC": { + "dst_rid": 24, + "src0_rid": 23, + 
"src1_rid": 21, + "mul_factor": 4, + "opcode": { + "optype": "ARITH", + "subtype": 5 + } + } + }, + "insn_asm": "MAC R24 R23 R21 4 ", + "timestamp": 2809834770 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS": { + "dst_rid": 25, + "src_rid": 24, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R25 R24 PbsCmpReduce ", + "timestamp": 2809834777 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS": { + "dst_rid": 25, + "src_rid": 24, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R25 R24 PbsCmpReduce ", + "timestamp": 2809834874 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "PBS": { + "dst_rid": 25, + "src_rid": 24, + "gid": 11, + "opcode": { + "optype": "PBS", + "subtype": 0 + } + } + }, + "insn_asm": "PBS R25 R24 PbsCmpReduce ", + "timestamp": 2810222448 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "PBS_F": { + "dst_rid": 26, + "src_rid": 25, + "gid": 13, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R26 R25 PbsCmpGte ", + "timestamp": 2810222455 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "PBS_F": { + "dst_rid": 26, + "src_rid": 25, + "gid": 13, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R26 R25 PbsCmpGte ", + "timestamp": 2810222552 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + 
"cmd": "RETIRE", + "insn": { + "PBS_F": { + "dst_rid": 26, + "src_rid": 25, + "gid": 13, + "opcode": { + "optype": "PBS", + "subtype": 8 + } + } + }, + "insn_asm": "PBS_F R26 R25 PbsCmpGte ", + "timestamp": 2810511944 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "ST": { + "rid": 26, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R26 ", + "timestamp": 2810511951 + }, + { + "state": { + "pdg": true, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RDUNLOCK", + "insn": { + "ST": { + "rid": 26, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R26 ", + "timestamp": 2810512461 + }, + { + "state": { + "pdg": true, + "rd_pdg": false, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "RETIRE", + "insn": { + "ST": { + "rid": 26, + "slot": { + "Addr": 8 + }, + "opcode": { + "optype": "MEM", + "subtype": 1 + } + } + }, + "insn_asm": "ST @0x8 R26 ", + "timestamp": 2810512480 + }, + { + "state": { + "pdg": false, + "rd_pdg": true, + "vld": true, + "wr_lock": 0, + "rd_lock": 0, + "sync_id": 9 + }, + "cmd": "ISSUE", + "insn": { + "SYNC": { + "sid": 65535, + "opcode": { + "optype": "SYNC", + "subtype": 0 + } + } + }, + "insn_asm": "SYNC 65535 ", + "timestamp": 2810512487 + } +] diff --git a/backends/tfhe-hpu-backend/python/lib/example.json b/backends/tfhe-hpu-backend/python/lib/example.json new file mode 100644 index 000000000..0516eeabc --- /dev/null +++ b/backends/tfhe-hpu-backend/python/lib/example.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3701d5e7d53eef6478a1b03a2c8e32cf5d20c1eb6829e754fe1ced4a0a16bed +size 693363 diff --git a/backends/tfhe-hpu-backend/python/lib/isctrace/__init__.py 
# An abstraction layer that can be used to analyze both mockup and hardware
# traces

import sys
import logging
from collections import defaultdict
from itertools import tee, chain, starmap
from operator import attrgetter, sub
from typing import Iterable, Iterator

import numpy as np
from pandas import DataFrame


def delta(a: Iterable[float]):
    """Return an iterator over the first-order differences of ``a``.

    The first value is ``a[0] - 0``; every following value is an element
    minus its predecessor.
    """
    a, b = tee(a, 2)
    # Lagged copy of the sequence: a single 0 followed by the original items.
    b = chain(range(0, 1), b)
    return starmap(sub, zip(a, b))


def group_by_time(it, timef, threshold):
    """Split ``it`` into consecutive batches (lists) based on time gaps.

    ``timef(obj)`` gives the time of an item. An item joins the current batch
    while its distance to the previously seen item is below ``threshold``;
    otherwise the current batch is yielded and a new one is started.
    Nothing is yielded for an empty input.
    """
    it = iter(it)  # accept any iterable, not only iterators
    try:
        first = next(it)
    except StopIteration:
        return
    batch = [first]
    ptime = timef(first)
    for obj in it:
        time = timef(obj)
        if time - ptime < threshold:
            batch.append(obj)
        else:
            yield batch
            batch = [obj]
        ptime = time
    yield batch


class BaseEvent:
    """Common base of every event payload carried by an ``Event``."""

    def as_dict(self):
        """Return ``{'event': <class name>}``."""
        return {'event': self.__class__.__name__}


class InsnEvent(BaseEvent):
    """Event payload that refers to one instruction (``insn`` may be None)."""

    def __init__(self, insn):
        self.insn = insn

    def as_dict(self):
        """Return the base dict extended with ``str(self.insn)``."""
        ret = super().as_dict()
        ret.update({'insn': str(self.insn)})
        return ret


class Refill(InsnEvent):
    """Payload of a refill event."""


class Issue(InsnEvent):
    """Payload of an instruction-issue event."""


class Retire(InsnEvent):
    """Payload of an instruction-retire event."""


class RdUnlock(InsnEvent):
    """Payload of a read-unlock event."""


class ReqTimeout(BaseEvent):
    """Payload of a timeout request; stores the given stamp."""

    def __init__(self, stamp):
        self.timestamp = stamp

    def as_dict(self):
        ret = super().as_dict()
        ret.update({'data': f"{self.__dict__}"})
        return ret


class Timeout(BaseEvent):
    """Payload of a timeout event (no data)."""

    def __init__(self):
        pass


class DelTimeout(BaseEvent):
    """Payload of a timeout-deletion event (no data)."""

    def __init__(self):
        pass


class BatchStart(BaseEvent):
    """Payload of a batch-start event: stores ``pe_id`` and ``issued``."""

    def __init__(self, pe_id, issued):
        self.pe_id = pe_id
        self.issued = issued

    def as_dict(self):
        ret = super().as_dict()
        ret.update({'data': f"{self.__dict__}"})
        return ret


class Event:
    """A trace event: a timestamp plus a payload exposing ``as_dict()``."""

    def __init__(self, timestamp, data):
        self.timestamp = timestamp
        self.data = data

    def as_dict(self):
        """Return the payload dict merged with the ``timestamp`` key."""
        ret = {'timestamp': self.timestamp}
        ret.update(self.data.as_dict())
        return ret


class Instruction:
    """A simplified instruction: an opcode string and its formatted args.

    Instances are hashable so they can be used as dictionary keys when an
    issue is matched with the corresponding retire.
    """

    def __init__(self, opcode, args):
        self.opcode = opcode
        self.args = args

    def is_flush(self):
        """True when the opcode ends with ``_F``."""
        return self.opcode.endswith("_F")

    def is_pbs(self):
        """True when the opcode starts with ``PBS``."""
        return self.opcode.startswith("PBS")

    def as_dict(self):
        return self.__dict__

    def __str__(self):
        return f"{self.opcode} {self.args}"

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # Compare the textual form itself: comparing hashes would make two
        # different instructions equal whenever their hashes collide.
        if not isinstance(other, Instruction):
            return NotImplemented
        return str(self) == str(other)


class Batch:
    """A list of per-instruction stats handled as one unit.

    ``latency`` is the explicit value when one is given, otherwise the
    latency of the last element (``None`` for an empty batch).
    """

    def __init__(self, insns, latency=None):
        self._insns = insns
        if latency is None and len(insns):
            latency = insns[-1].latency
        self.latency = latency

    def reltime(self):
        """Largest ``reltime`` among the elements of the batch."""
        return max(map(lambda x: x.reltime, self._insns))

    def __len__(self):
        return len(self._insns)

    def __getitem__(self, k):
        return self._insns[k]


class Latency:
    """Accumulator class for instruction latency."""

    def __init__(self):
        self.acc = []
        self.data = set()

    def append(self, other, data):
        """Record ``other.latency`` and remember ``data`` in a set."""
        self.acc.append(other.latency)
        self.data.add(data)

    def as_dict(self):
        """Summarize the recorded latencies (min/avg/max/sum/count/data).

        NaN (and None) entries are ignored. When nothing usable was recorded
        the numeric fields are the string 'NA' and ``count`` is 0.
        """
        npa = np.array(self.acc, dtype=float)  # None becomes NaN here
        # NaN never compares equal to anything, so it has to be masked out
        # with isnan() rather than with an (in)equality test.
        npa = npa[~np.isnan(npa)]
        if npa.size:
            return {"min": npa.min(), "avg": npa.mean(),
                    "max": npa.max(), "sum": npa.sum(),
                    "count": len(npa), "data": self.data}
        return {"min": 'NA', "avg": 'NA',
                "max": 'NA', "sum": 'NA',
                "count": 0, "data": self.data}


class InstructionStats:
    """Timing figures attached to one instruction of a filtered trace."""

    def __init__(self, insn, latency, timestamp, delta, reltime):
        self.timestamp = timestamp
        self.latency = latency
        self.delta = delta
        self.reltime = reltime
        self.insn = insn

    def as_dict(self):
        """Return the timing fields, plus the instruction fields if any."""
        ret = {
            'timestamp': self.timestamp,
            'latency': self.latency,
            'delta': self.delta,
            'reltime': self.reltime,
        }
        if self.insn is not None:
            ret.update(self.insn.as_dict())
        return ret


def peek(it: Iterable):
    """Return ``(iterator, first)`` without consuming the returned iterator.

    ``first`` is None when ``it`` is empty.
    """
    ret, copy = tee(iter(it), 2)
    try:
        val = next(copy)
    except StopIteration:
        val = None
    return ret, val


class Retired:
    """Per-instruction stats of the retire events of a trace.

    The stats are computed once in the constructor and stored in a list.
    """
    BATCH_THRESHOLD = 150000

    def __init__(self, trace: Iterable['Event']):
        self._events = list(self._filter(trace))

    @staticmethod
    def _filter(events: Iterable['Event']):
        """Yield one ``InstructionStats`` per Retire event.

        Latency is the distance to the matching Issue event, or NaN when no
        Issue was seen for that instruction. ``delta`` is the distance to the
        previous yielded retire and ``reltime`` the distance to the first
        event of the trace.
        """
        issue_stamps = {}
        events, first = peek(events)
        if first is None:
            return
        first_stamp = prev_stamp = first.timestamp
        for event in events:
            kind = event.data.__class__
            if kind is Issue:
                issue_stamps[event.data.insn] = event.timestamp
            elif kind is Retire:
                insn = event.data.insn
                timestamp = event.timestamp
                issued_at = issue_stamps.pop(insn, None)
                latency = np.nan if issued_at is None else timestamp - issued_at
                yield InstructionStats(insn, latency, timestamp,
                                       timestamp - prev_stamp,
                                       timestamp - first_stamp)
                prev_stamp = timestamp

    def __iter__(self):
        return iter(self._events)

    def to_df(self):
        """Return the stats as a DataFrame indexed by timestamp."""
        return DataFrame.from_records([x.as_dict() for x in self],
                                      index='timestamp')

    def runtime_us(self, freq_mhz) -> float:
        """Span between first and last stat divided by ``freq_mhz``.

        Returns 0.0 when there are no stats.
        """
        if not self._events:
            return 0.0
        return (self._events[-1].timestamp - self._events[0].timestamp)/freq_mhz

    def pbs_batches(self, threshold = BATCH_THRESHOLD):
        """Group the PBS stats by time proximity and return the batches.

        Each batch latency is the difference between its ``reltime()`` and
        the one of the previous batch (0 for the first batch).
        """
        # Stats without an instruction (insn is None) cannot be PBS.
        pbs = (s for s in self if s.insn is not None and s.insn.is_pbs())
        batches = [Batch(group) for group in
                   group_by_time(pbs, attrgetter('timestamp'), threshold)]
        for batch, latency in zip(batches, delta(x.reltime() for x in batches)):
            batch.latency = latency
        return batches

    def pbs_latency_table(self, freq_mhz = 350, threshold = BATCH_THRESHOLD):
        """Return a DataFrame of batch latency statistics per batch size.

        The min/avg/max/sum columns are divided by ``freq_mhz``. An empty
        DataFrame is returned when there is no PBS batch.
        """
        pbs_latency_table = defaultdict(Latency, {})
        for batch in self.pbs_batches(threshold):
            pbs_latency_table[len(batch)].append(batch, batch[0].reltime)
        table = {i: x.as_dict() for i, x in pbs_latency_table.items()}
        df = DataFrame.from_dict(table, orient="index")
        df.index.name = 'batch size'
        if df.empty:
            return df
        clk_cols = ['min', 'avg', 'max', 'sum']
        df.loc[:, clk_cols] = df.loc[:, clk_cols].apply(lambda x: x/freq_mhz)
        return df.sort_index()

    def pbs_flushes(self):
        """Yield batches of PBS stats, cut after every flush instruction.

        PBS stats remaining after the last flush are yielded as a final
        batch.
        """
        batch = []
        for stat in self:
            insn = stat.insn  # the opcode predicates live on the instruction
            if insn is None:
                continue
            if insn.is_pbs():
                batch.append(stat)

            if insn.is_flush() and batch:
                yield Batch(batch)
                batch = []

        if len(batch):
            yield Batch(batch)


class Issued(Retired):
    """Same interface as ``Retired`` but built from ``match_class`` events.

    The latency of every stat is None.
    """
    match_class = Issue

    @classmethod
    def _filter(cls, events: Iterable):
        events, first = peek(events)
        if first is None:
            return
        first_stamp = prev_stamp = first.timestamp
        for event in events:
            if event.data.__class__ is not cls.match_class:
                continue
            timestamp = event.timestamp
            yield InstructionStats(event.data.insn, None, timestamp,
                                   timestamp - prev_stamp,
                                   timestamp - first_stamp)
            prev_stamp = timestamp


class Refilled(Issued):
    """``Issued`` variant that selects Refill events."""
    match_class = Refill


class Trace:
    """A materialized list of ``Event`` objects."""

    def __init__(self, events: Iterable['Event']):
        self._events = list(events)

    def __iter__(self):
        return iter(self._events)

    def __len__(self):
        return len(self._events)

    def to_df(self):
        """Return the events as a DataFrame indexed by timestamp.

        A ``reltime`` column holds the offset from the first index value.
        """
        df = DataFrame.from_records([x.as_dict() for x in self],
                                    index='timestamp')
        df['reltime'] = df.index - df.index[0]
        return df
# Formatting helpers that turn decoded trace instructions (one-key dicts of
# the form {opcode: fields}) into printable operands.


class BaseInstruction:
    """Instruction wrapper whose operands are the string form of ``data``."""

    def __init__(self, data):
        self.data = data

    def args(self):
        return str(self.data)

    def __str__(self):
        return f'{self.__class__.__name__} {self.args()}'


class NamedInstruction:
    """Fallback for opcodes without a dedicated formatter.

    Keeps the opcode name and returns the raw payload as operands.
    """

    def __init__(self, name, args):
        self.name = name
        self._args = args

    def args(self):
        return self._args

    def __str__(self):
        return f'{self.name} {self.args()}'


class _Fields(BaseInstruction):
    """Expose the keys of the instruction dict as instance attributes."""

    def __init__(self, d):
        # Copy the fields instead of adopting the dict itself, so that
        # attribute writes on the instance never leak into the caller's data.
        self.__dict__.update(d)


class PBS(_Fields):
    def args(self):
        return f'R{self.dst_rid} R{self.src_rid} @{self.gid}'


class LD(_Fields):
    def args(self):
        return f'R{self.rid} @{hex(self.slot["Addr"])}'


class ST(_Fields):
    def args(self):
        return f'@{hex(self.slot["Addr"])} R{self.rid}'


class MAC(_Fields):
    def args(self):
        return f'R{self.dst_rid} R{self.src0_rid} ' +\
               f'R{self.src1_rid} X{self.mul_factor} '


class ADD(_Fields):
    def args(self):
        return f'R{self.dst_rid} R{self.src0_rid} R{self.src1_rid}'


class ADDS(_Fields):
    def args(self):
        return f'R{self.dst_rid} R{self.src_rid} {self.msg_cst["Cst"]}'


class SUB(_Fields):
    def args(self):
        return f'R{self.dst_rid} R{self.src0_rid} R{self.src1_rid}'


class SSUB(_Fields):
    def args(self):
        return f'R{self.dst_rid} {self.msg_cst["Cst"]} R{self.src_rid}'


class SYNC(_Fields):
    def args(self):
        return f"{self.sid}"


# Opcodes sharing the operand layout of another instruction.
PBS_ML2 = PBS
PBS_ML4 = PBS
PBS_ML8 = PBS
PBS_F = PBS
PBS_ML2_F = PBS
PBS_ML4_F = PBS
PBS_ML8_F = PBS
MULS = ADDS
SUBS = ADDS

# Explicit opcode -> formatter table. Looking opcodes up in globals() would
# also match unrelated module-level names (helper classes, imports, ...).
_OPCODES = {
    'PBS': PBS,
    'PBS_ML2': PBS_ML2,
    'PBS_ML4': PBS_ML4,
    'PBS_ML8': PBS_ML8,
    'PBS_F': PBS_F,
    'PBS_ML2_F': PBS_ML2_F,
    'PBS_ML4_F': PBS_ML4_F,
    'PBS_ML8_F': PBS_ML8_F,
    'LD': LD,
    'ST': ST,
    'MAC': MAC,
    'ADD': ADD,
    'ADDS': ADDS,
    'MULS': MULS,
    'SUB': SUB,
    'SSUB': SSUB,
    'SUBS': SUBS,
    'SYNC': SYNC,
}


class Insn:
    """Decode a one-key ``{opcode: fields}`` dict.

    ``opcode`` is the key and ``data`` the matching formatter instance, or a
    ``NamedInstruction`` when the opcode has no dedicated formatter.
    """

    def __init__(self, insn):
        self.opcode, data = next(iter(insn.items()))
        formatter = _OPCODES.get(self.opcode)
        if formatter is None:
            self.data = NamedInstruction(self.opcode, data)
        else:
            self.data = formatter(data)

    def to_analysis(self):
        """Return the ``analysis.Instruction`` built from opcode and args."""
        # Imported here so the formatters do not need the sibling module
        # until a conversion is actually requested.
        from . import analysis
        return analysis.Instruction(self.opcode, self.data.args())
# Loader for hardware traces stored as a JSON list of event records.
import json
from collections import defaultdict
from itertools import accumulate, chain, islice, tee
from operator import attrgetter
from typing import Iterator
import logging

import numpy as np
from pandas import DataFrame

from . import analysis, fmt


class Event:
    """A trace event, built from one record of the JSON trace.

    ``cmd``, ``insn_asm`` and ``timestamp`` are required keys. ``insn`` and
    ``state.sync_id`` are optional and default to None.
    """
    # Maps the record's ``cmd`` to the analysis payload to build.
    EVENT_MAP = {
        "ISSUE": lambda x: analysis.Issue(fmt.Insn(x.insn).to_analysis()),
        "RETIRE": lambda x: analysis.Retire(fmt.Insn(x.insn).to_analysis()),
        "RDUNLOCK": lambda x: analysis.RdUnlock(fmt.Insn(x.insn).to_analysis()),
        "REFILL": lambda x: analysis.Refill(None),
    }

    def __init__(self, trace_dict):
        self.cmd = trace_dict['cmd']
        self.insn_asm = trace_dict['insn_asm']
        self.timestamp = trace_dict['timestamp']
        # Optional so that the placeholder built by default() is valid.
        self.insn = trace_dict.get('insn')
        self.sync_id = (trace_dict.get('state') or {}).get('sync_id')

    def as_dict(self):
        return self.__dict__

    @staticmethod
    def default():
        """Return a placeholder event (cmd "NONE", timestamp 0, no insn)."""
        return Event({"cmd": "NONE", "insn_asm": "", "timestamp": 0})

    def to_analysis(self) -> 'analysis.Event':
        """Convert to an ``analysis.Event``.

        Raises KeyError when ``cmd`` has no entry in ``EVENT_MAP``.
        """
        return analysis.Event(
            timestamp=self.timestamp,
            data=self.EVENT_MAP[self.cmd](self))


class Trace:
    """A collection of hardware events."""

    def __init__(self, events):
        self._events = events

    @staticmethod
    def from_json(filename):
        """Load a trace from a JSON file holding a list of event records."""
        with open(filename, 'r') as fd:
            return Trace([Event(x) for x in json.load(fd)])

    def __iter__(self):
        return iter(self._events)

    def __len__(self):
        return len(self._events)

    # Tries to split the event stream in IOP boundaries
    def iops(self):
        """Yield one ``Trace`` per ``sync_id``, each time a SYNC is seen.

        Events are grouped by ``sync_id``; a group is yielded (and dropped)
        when an event whose instruction opcode is "SYNC" is appended to it.
        A warning is logged if events remain ungrouped at the end.
        """
        id_map = defaultdict(list, {})
        for event in self:
            id_map[event.sync_id].append(event)
            opcode = next(iter(event.insn.keys())) if event.insn is not None else None

            if opcode == "SYNC":
                yield Trace(id_map[event.sync_id])
                del id_map[event.sync_id]

        if len(id_map):
            # logging.warn is a deprecated alias; warning() is the real API.
            logging.warning("The trace contains incomplete IOPs")

    def to_analysis(self) -> 'analysis.Trace':
        """Convert every event and wrap the result in ``analysis.Trace``."""
        return analysis.Trace(x.to_analysis() for x in self)


def from_hw(filename) -> 'list[analysis.Trace]':
    """Load ``filename`` and return one ``analysis.Trace`` per IOP."""
    return [x.to_analysis() for x in Trace.from_json(filename).iops()]


# Register a factory function directly in the analysis module
setattr(analysis.Trace, 'from_hw', from_hw)
a/backends/tfhe-hpu-backend/python/lib/isctrace/mockup.py b/backends/tfhe-hpu-backend/python/lib/isctrace/mockup.py new file mode 100644 index 000000000..d9da62029 --- /dev/null +++ b/backends/tfhe-hpu-backend/python/lib/isctrace/mockup.py @@ -0,0 +1,93 @@ +# A Library to load mockup traces +import json + +import pandas + +from . import analysis, fmt + + +class ArgId: + def __init__(self, d): + self.__dict__ = d + +class Instruction: + def __init__(self, d): + self.__dict__.update(d) + self.dst_id = ArgId(self.dst_id) + self.srca_id = ArgId(self.srca_id) + self.srcb_id = ArgId(self.srcb_id) + self.insn = fmt.Insn(d['op']) + + def __str__(self): + return str(self.insn) + +class Slot: + def __init__(self, d): + self.insn_data = Instruction(d['inst']) + self.state = d['state'] + + def __str__(self): + return str(self.insn_data) + + def to_analysis(self): + return self.insn_data.insn.to_analysis() + +# The only two subtypes +class Query: + def __init__(self, event): + self.__dict__.update(event) + self.slot = Slot(self.slot) + self.subtype = self.cmd + self.desc = str(self.slot) + def to_analysis(self): + return getattr(analysis, self.subtype)(self.slot.to_analysis()) + +class ReqTimeout: + def __init__(self, timestamp): + self.timestamp = timestamp + def to_analysis(self): + return analysis.ReqTimeout(self.timestamp) + +class BatchStart: + def __init__(self, d): + self.pe_id = d['pe_id'] + self.issued = d['issued'] + def to_analysis(self): + return analysis.BatchStart(self.pe_id, self.issued) + +class NamedEvent: + def __init__(self, name): + self.name = name + def to_analysis(self): + return getattr(analysis, self.name)() + +class Event: + def __init__(self, trace_dict): + self.timestamp = trace_dict['timestamp'] + event = trace_dict['event'] + + if event.__class__ == dict: + key = next(iter(event.keys())) + self.event = globals()[key](event[key]) + else: + self.event = NamedEvent(event) + + def to_analysis(self): + return analysis.Event( + timestamp=self.timestamp, 
+ data=self.event.to_analysis()) + +class Trace: + def __init__(self, jsonfile): + with open(jsonfile, 'r') as fd: + self.traces = list(map(Event, json.load(fd))) + def __iter__(self): + return iter(self.traces) + def to_analysis(self): + return analysis.Trace((x.to_analysis() for x in self)) + +def from_mockup(filename: str) -> 'analysis.Trace': + return Trace(filename).to_analysis() + +# Register a from directly in analysis code +setattr(analysis.Trace, 'from_mockup', from_mockup) diff --git a/backends/tfhe-hpu-backend/python/requirements.txt b/backends/tfhe-hpu-backend/python/requirements.txt new file mode 100644 index 000000000..d2fe66b85 --- /dev/null +++ b/backends/tfhe-hpu-backend/python/requirements.txt @@ -0,0 +1,3 @@ +pandas == 2.2.3 +numpy == 1.26.3 +logging diff --git a/backends/tfhe-hpu-backend/src/asm/dop/arg.rs b/backends/tfhe-hpu-backend/src/asm/dop/arg.rs new file mode 100644 index 000000000..84a48d801 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/arg.rs @@ -0,0 +1,519 @@ +//! +//! Gather DOp argument in a common type +//! Provides a FromStr implementation for parsing + +use crate::asm::CtId; + +use super::field::{ImmId, MemId, RegId, SyncId}; +use super::*; +use lazy_static::lazy_static; + +/// Minimum asm arg width to have aligned field +pub const ARG_MIN_WIDTH: usize = 16; +pub const DOP_MIN_WIDTH: usize = 10; + +/// Generic arguments +/// Used to pack argument under the same type +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Arg { + Reg(RegId), + Mem(MemId), + Imm(ImmId), + Pbs(Pbs), + Sync(SyncId), +} + +/// Use Display trait to convert into asm human readable file +/// Simply defer to inner type display impl while forcing the display width +impl std::fmt::Display for Arg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Arg::Reg(inner) => write!(f, "{inner: write!(f, "{inner: write!(f, "{inner: write!(f, "{inner: write!(f, "{inner: Result { + lazy_static! 
{ + static ref DOP_ARG_RE: regex::Regex = regex::Regex::new( + r"(?^R(?[0-9]+))|(?^@((?0x[0-9a-fA-F]+)|(?[0-9]+)))|(?^(?TS|TD|TH)(\[(?\d+)\])*\.(?\d+))|(?^((?0x[0-9a-fA-F]+)|(?[0-9]+)))|(?^TI\[(?\d+)\]\.(?\d+))|(?^Pbs(?(\S+)))" + ) + .expect("Invalid regex"); + } + + if let Some(caps) = DOP_ARG_RE.captures(s) { + if let Some(_register) = caps.name("register") { + let rid = caps["rid"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + Ok(Arg::Reg(RegId(rid))) + } else if let Some(_mem_addr) = caps.name("mem_addr") { + let cid = if let Some(raw_cid) = caps.name("cid") { + raw_cid + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + let raw_hex_cid = caps.name("hex_cid").unwrap(); + u16::from_str_radix(&raw_hex_cid.as_str()[2..], 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + }; + Ok(Arg::Mem(MemId::Addr(CtId(cid)))) + } else if let Some(_mem_tmpl) = caps.name("mem_tmpl") { + let tid = if let Some(raw_tid) = caps.name("mt_id") { + Some( + raw_tid + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?, + ) + } else { + None + }; + let bid = caps["mt_bid"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + + match &caps["mt_orig"] { + "TS" => { + if tid.is_none() { + return Err(ParsingError::InvalidArg(format!("Memory template Src must have following format `TS[tid].bid` (parsed {})",&caps["mem_tmpl"]))); + } + Ok(Arg::Mem(MemId::Src { + tid: tid.unwrap(), + bid: bid as u8, + })) + } + "TD" => { + if tid.is_none() { + return Err(ParsingError::InvalidArg(format!("Memory template Dst must have following format `TD[tid].bid` (parsed {})",&caps["mem_tmpl"]))); + } + Ok(Arg::Mem(MemId::Dst { + tid: tid.unwrap(), + bid: bid as u8, + })) + } + "TH" => { + if tid.is_some() { + return Err(ParsingError::InvalidArg(format!("Memory template Heap must have following 
format `TH.bid` (parsed {})",&caps["mem_tmpl"]))); + } + Ok(Arg::Mem(MemId::Heap { bid })) + } + _ => Err(ParsingError::InvalidArg(format!( + "Invalid memory template argument {}", + &caps["mem_tmpl"] + ))), + } + } else if let Some(_imm_cst) = caps.name("imm_cst") { + let cst = if let Some(raw_cst) = caps.name("cst") { + raw_cst + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + let raw_hex_cst = caps.name("hex_cst").unwrap(); + u16::from_str_radix(&raw_hex_cst.as_str()[2..], 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + }; + Ok(Arg::Imm(ImmId::Cst(cst))) + } else if let Some(_imm_var) = caps.name("imm_var") { + let tid = caps["it_id"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + let bid = caps["it_bid"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + Ok(Arg::Imm(ImmId::Var { tid, bid })) + } else if let Some(_pbs) = caps.name("pbs") { + Ok(Arg::Pbs(Pbs::from_str(&caps["pbs_name"])?)) + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } +} + +pub trait FromAsm +where + Self: Sized, +{ + fn from_args(opcode: u8, args: &[arg::Arg]) -> Result; +} + +#[enum_dispatch] +pub trait ToAsm +where + Self: Sized, +{ + fn name(&self) -> &'static str { + std::any::type_name_of_val(self) + } + + fn args(&self) -> Vec { + let mut arg = self.dst(); + arg.extend_from_slice(self.src().as_slice()); + arg + } + fn dst(&self) -> Vec; + fn src(&self) -> Vec; +} + +#[enum_dispatch] +pub trait IsFlush +where + Self: Sized, +{ + fn is_flush(&self) -> bool { + false + } +} + +pub trait ToFlush +where + Self: Sized + Clone, +{ + fn to_flush(&self) -> Self { + self.clone() + } +} + +impl FromAsm for field::PeArithInsn { + fn from_args(opcode: u8, args: &[arg::Arg]) -> 
Result { + if (args.len() != 3) && (args.len() != 4) { + return Err(ParsingError::ArgNumber(3, args.len())); + } + + let dst_rid = match args[0] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[0].clone(), + )) + } + }; + let src0_rid = match args[1] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[1].clone(), + )) + } + }; + let src1_rid = match args[2] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[2].clone(), + )) + } + }; + + let mul_factor = if let Some(arg) = args.get(3) { + match arg { + Arg::Imm(ImmId::Cst(id)) => MulFactor(*id as u8), + _ => { + return Err(ParsingError::ArgType( + "Arg::Imm::Cst".to_string(), + args[3].clone(), + )) + } + } + } else { + MulFactor(0) + }; + + Ok(Self { + opcode: Opcode::from(opcode), + mul_factor, + src1_rid, + src0_rid, + dst_rid, + }) + } +} + +impl ToAsm for PeArithInsn { + fn dst(&self) -> Vec { + vec![arg::Arg::Reg(self.dst_rid)] + } + fn src(&self) -> Vec { + let mut src = vec![arg::Arg::Reg(self.src0_rid), arg::Arg::Reg(self.src1_rid)]; + if self.mul_factor != MulFactor(0) { + src.push(arg::Arg::Imm(ImmId::Cst(self.mul_factor.0 as u16))); + } + src + } +} + +impl FromAsm for field::PeArithMsgInsn { + fn from_args(opcode: u8, args: &[arg::Arg]) -> Result { + if args.len() != 3 { + return Err(ParsingError::ArgNumber(3, args.len())); + } + + let dst_rid = match args[0] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[0].clone(), + )) + } + }; + let src_rid = match args[1] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[1].clone(), + )) + } + }; + let msg_cst = match args[2] { + Arg::Imm(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Imm".to_string(), + args[2].clone(), + )) + } + }; + + Ok(Self { + opcode: Opcode::from(opcode), + msg_cst, + 
src_rid, + dst_rid, + }) + } +} + +impl ToAsm for PeArithMsgInsn { + fn dst(&self) -> Vec { + vec![arg::Arg::Reg(self.dst_rid)] + } + fn src(&self) -> Vec { + vec![arg::Arg::Reg(self.src_rid), arg::Arg::Imm(self.msg_cst)] + } +} + +impl FromAsm for field::PeMemInsn { + fn from_args(opcode: u8, args: &[arg::Arg]) -> Result { + if args.len() != 2 { + return Err(ParsingError::ArgNumber(2, args.len())); + } + + let (rid, mid) = match opcode { + _x if _x == u8::from(opcode::Opcode::LD()) => { + let rid = match args[0] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[0].clone(), + )) + } + }; + let slot = match args[1] { + Arg::Mem(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Mem".to_string(), + args[1].clone(), + )) + } + }; + (rid, slot) + } + _x if _x == u8::from(opcode::Opcode::ST()) => { + let slot = match args[0] { + Arg::Mem(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Mem".to_string(), + args[0].clone(), + )) + } + }; + + let rid = match args[1] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[1].clone(), + )) + } + }; + (rid, slot) + } + _ => { + return Err(ParsingError::Unmatch( + "PeMemInsn expect LD/ST opcode".to_string(), + )) + } + }; + + Ok(Self { + opcode: Opcode::from(opcode), + slot: mid, + rid, + }) + } +} + +impl ToAsm for PeMemInsn { + fn dst(&self) -> Vec { + match self.opcode { + _x if _x == opcode::Opcode::LD() => vec![Arg::Reg(self.rid)], + _x if _x == opcode::Opcode::ST() => vec![Arg::Mem(self.slot)], + _ => panic!("Unsupported opcode for PeMemInsn"), + } + } + fn src(&self) -> Vec { + match self.opcode { + _x if _x == opcode::Opcode::LD() => vec![Arg::Mem(self.slot)], + _x if _x == opcode::Opcode::ST() => vec![Arg::Reg(self.rid)], + _ => panic!("Unsupported opcode for PeMemInsn"), + } + } +} + +impl FromAsm for field::PePbsInsn { + fn from_args(opcode: u8, args: &[arg::Arg]) -> Result { + if args.len() 
!= 3 { + return Err(ParsingError::ArgNumber(3, args.len())); + } + + let dst_rid = match args[0] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[0].clone(), + )) + } + }; + let src_rid = match args[1] { + Arg::Reg(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Reg".to_string(), + args[1].clone(), + )) + } + }; + let pbs_lut = match &args[2] { + Arg::Pbs(id) => id, + _ => { + return Err(ParsingError::ArgType( + "Arg::Pbs".to_string(), + args[2].clone(), + )) + } + }; + + Ok(Self { + opcode: Opcode::from(opcode), + gid: pbs_lut.gid(), + src_rid, + dst_rid, + }) + } +} + +impl ToAsm for PePbsInsn { + fn dst(&self) -> Vec { + vec![Arg::Reg(self.dst_rid)] + } + fn src(&self) -> Vec { + vec![ + Arg::Reg(self.src_rid), + Arg::Pbs(Pbs::from_hex(self.gid).unwrap()), + ] + } +} + +impl FromAsm for field::PeSyncInsn { + /// Build a `PeSyncInsn` from an optional sync-id argument. + /// + /// Accepts zero or one argument; without argument the sync id defaults to `SyncId(0)`. + /// The id may be given as `Arg::Sync` or as a constant immediate (`Arg::Imm(ImmId::Cst)`): + /// the textual argument parser has no sync pattern and yields a bare number as an immediate. + /// + /// Returns `ParsingError::ArgNumber` when more than one argument is given and + /// `ParsingError::ArgType` when the argument is neither a sync id nor a constant. + fn from_args(opcode: u8, args: &[arg::Arg]) -> Result { + if (args.len() != 1) && (!args.is_empty()) { + return Err(ParsingError::ArgNumber(1, args.len())); + } + + // The optional sync id is the first (and only) argument + let sid = if let Some(arg) = args.first() { + match arg { + Arg::Sync(id) => *id, + Arg::Imm(ImmId::Cst(cst)) => SyncId(u32::from(*cst)), + _ => { + return Err(ParsingError::ArgType( + "Arg::Sync".to_string(), + arg.clone(), + )) + } + } + } else { + SyncId(0) + }; + + Ok(Self { + opcode: Opcode::from(opcode), + sid, + }) + } +} + +impl ToAsm for PeSyncInsn { + fn dst(&self) -> Vec { + vec![] + } + fn src(&self) -> Vec { + vec![Arg::Sync(self.sid)] + } +} + +impl ToFlush for field::PePbsInsn { + fn to_flush(&self) -> Self { + PePbsInsn { + opcode: self.opcode.to_flush(), + ..*self + } + } +} +impl ToFlush for field::PeSyncInsn {} +impl ToFlush for field::PeArithInsn {} +impl ToFlush for field::PeArithMsgInsn {} +impl ToFlush for field::PeMemInsn {} diff --git a/backends/tfhe-hpu-backend/src/asm/dop/dop_macro.rs b/backends/tfhe-hpu-backend/src/asm/dop/dop_macro.rs new file mode 100644 index 000000000..e447ea404 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/dop_macro.rs 
@@ -0,0 +1,388 @@ +//! DOp definition is repetitive +//! +//! Indeed except the behavior DOp shared a small set of format. +//! And for a given format all the parsing logic is the same +//! A macro rules is used to help with DOp definition + +#[macro_export] +macro_rules! impl_dop_parser { + ( + $asm: literal, + $opcode: expr, + $field: ty, + $fmt: ty + $(,)? + ) => { + ::paste::paste! { + impl [] { + fn from_args(args: &[arg::Arg]) -> Result { + let fmt_op = $field::from_args($opcode.into(), args)?; + Ok(DOp::[< $asm:upper >](Self(fmt_op))) + } + + fn from_hex(hex: DOpRepr) -> DOp { + DOp::[< $asm:upper >](Self($field::from(&$fmt::from_bits(hex)))) + } + + pub fn opcode() -> u8 { + $opcode.into() + } + } + + impl ToAsm for []{ + fn name(&self) -> &'static str { + $asm + } + fn args(&self) -> Vec { + self.0.args() + } + fn dst(&self) -> Vec { + self.0.dst() + } + fn src(&self) -> Vec { + self.0.src() + } + } + + impl ToHex for [] { + fn to_hex(&self) -> DOpRepr { + $fmt::from(&self.0).into_bits() + } + } + } + }; +} + +#[macro_export] +macro_rules! impl_dop { + // Arith operations --------------------------------------------------------------------------- + ( + $asm: literal, + $opcode: expr, + PeArithInsn + $(,)? + ) => { + ::paste::paste! { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeArithInsn); + + impl [] { + pub fn new(dst: RegId, src0: RegId, src1: RegId) -> Self { + Self(PeArithInsn { + opcode: $opcode, + mul_factor: MulFactor(0), + src1_rid: src1, + src0_rid: src0, + dst_rid: dst, + }) + } + } + + impl IsFlush for []{} + impl_dop_parser!($asm, $opcode, PeArithInsn, PeArithHex); + } + }; + // Arith operations with mult_factor ---------------------------------------------------------- + ( + $asm: literal, + $opcode: expr, + PeArithInsn_mul_factor + $(,)? + ) => { + ::paste::paste! 
{ + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeArithInsn); + + impl [] { + pub fn new(dst_rid: RegId, src0_rid: RegId, src1_rid: RegId, mul_factor: MulFactor) -> Self { + Self(PeArithInsn { + opcode: $opcode, + mul_factor, + src1_rid, + src0_rid, + dst_rid, + }) + } + } + + impl IsFlush for [] {} + impl_dop_parser!($asm, $opcode, PeArithInsn, PeArithHex); + } + }; + // ArithMsg operations ------------------------------------------------------------------------ + ( + $asm: literal, + $opcode: expr, + PeArithMsgInsn + $(,)? + ) => { + ::paste::paste! { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeArithMsgInsn); + + impl [] { + pub fn new(dst_rid: RegId, src_rid: RegId, msg_cst: ImmId) -> Self { + Self(PeArithMsgInsn { + opcode: $opcode, + msg_cst, + src_rid, + dst_rid, + }) + } + /// Access inner imm for template patching + pub fn msg_mut(&mut self) -> &mut ImmId { + &mut self.0.msg_cst + } + } + + impl IsFlush for []{} + impl_dop_parser!($asm, $opcode, PeArithMsgInsn, PeArithMsgHex); + } + }; + + // Mem operations ------------------------------------------------------------------------ + // Load flavor + ( + $asm: literal, + $opcode: expr, + PeMemInsn_ld + $(,)? + ) => { + ::paste::paste! 
{ + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeMemInsn); + + impl [] { + pub fn new(rid: RegId, mid: MemId) -> Self { + Self(PeMemInsn { + opcode: $opcode, + slot: mid, + rid, + }) + } + /// Access inner rid + pub fn rid(&self) -> &RegId { + &self.0.rid + } + /// Access inner memory slot + pub fn slot(&self) -> &MemId { + &self.0.slot + } + /// Access inner memory for template patching + pub fn slot_mut(&mut self) -> &mut MemId { + &mut self.0.slot + } + } + + impl IsFlush for []{} + impl_dop_parser!($asm, $opcode, PeMemInsn, PeMemHex); + } + }; + + // Store flavor + ( + $asm: literal, + $opcode: expr, + PeMemInsn_st + $(,)? + ) => { + ::paste::paste! { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeMemInsn); + + impl [] { + pub fn new( mid: MemId, rid: RegId) -> Self { + Self(PeMemInsn { + opcode: $opcode, + slot: mid, + rid, + }) + } + /// Access inner rid + pub fn rid(&self) -> &RegId { + &self.0.rid + } + /// Access inner memory slot + pub fn slot(&self) -> &MemId { + &self.0.slot + } + /// Access inner memory for template patching + pub fn slot_mut(&mut self) -> &mut MemId { + &mut self.0.slot + } + } + + impl IsFlush for []{} + impl_dop_parser!($asm, $opcode, PeMemInsn, PeMemHex); + } + }; + + // Pbs operations ------------------------------------------------------------------------ + ( + $asm: literal, + $opcode: expr, + PePbsInsn + $(,)? + ) => { + ::paste::paste! 
{ + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PePbsInsn); + + impl [] { + pub fn new(dst_rid: RegId, src_rid: RegId, gid: PbsGid) -> Self { + Self(PePbsInsn { + opcode: $opcode, + gid, + src_rid, + dst_rid, + }) + } + } + + impl IsFlush for [] { + fn is_flush(&self) -> bool { + $opcode.is_flush() + } + } + impl_dop_parser!($asm, $opcode, PePbsInsn, PePbsHex); + } + }; + + // Sync operations ------------------------------------------------------------------------ + ( + $asm: literal, + $opcode: expr, + PeSyncInsn + $(,)? + ) => { + ::paste::paste! { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + pub struct [](pub PeSyncInsn); + + impl [] { + pub fn new(sid: Option) -> Self { + Self(PeSyncInsn { + opcode: $opcode, + sid: sid.unwrap_or(SyncId(0)) + }) + } + + } + + impl IsFlush for []{} + impl_dop_parser!($asm, $opcode, PeSyncInsn, PeSyncHex); + } + }; +} + +#[macro_export] +macro_rules! dop { + ( + $([$asm: literal, $opcode: expr, $type: ty $({$fmt: tt})? $(,$flush: literal)?] $(,)?)* + ) => { + ::paste::paste! { + type AsmCallback = fn(&[arg::Arg]) -> Result; + type HexCallback = fn(DOpRepr) -> DOp; + + $( + impl_dop!($asm, $opcode, [< $type $(_ $fmt)? >]); + )* + + /// Aggregate DOp concrete type in one enumeration + // #[derive(Debug, Clone)] + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[enum_dispatch(ToAsm, ToHex, IsFlush)] + #[allow(non_camel_case_types)] + pub enum DOp{ + // $([< $asm:upper >]($type),)* + $([< $asm:upper >]([< DOp $asm:camel>]),)* + } + + impl ToFlush for DOp { + fn to_flush(&self) -> Self { + match self { + $( + DOp::[< $asm:upper >](inner) => DOp::[< $asm:upper $($flush)?>] + ([< DOp $asm:camel $($flush:camel)? 
>](inner.0.to_flush())), + )* + } + } + } + + impl DOp { + pub fn from_args(name: &str, args: &[arg::Arg]) -> Result { + if let Some(cb) = DOP_LUT.asm.get(name) { + cb(args) + } else { + Err(ParsingError::Unmatch(format!("{name} unknown"))) + } + } + /// Construct DOp from hex word + pub fn from_hex(hex: DOpRepr) -> Result { + let raw = DOpRawHex::from_bits(hex); + if let Some(cb) = DOP_LUT.hex.get(&raw.opcode()) { + Ok(cb(hex)) + } else { + Err(ParsingError::Unmatch(format!("DOp {:x?} unknown [hex {:x}]", raw.opcode(), hex))) + } + } + } + + impl std::fmt::Display for DOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{: Result { + + // Split asm string in a vector of arguments + let arg_str = asm.split_whitespace().collect::>(); + if !arg_str.is_empty() { + let name = arg_str[0]; + let args = arg_str[1..] + .iter() + .map(|s| { + arg::Arg::from_str(s) + }) + .collect::, _>>()?; + + Self::from_args(name, args.as_slice()) + }else { + Err(ParsingError::Empty) + } + } + } + + /// Parser utilities + /// Hashmap for Name -> to fromArg impl + struct DOpFromArg{ + asm: HashMap, + hex: HashMap, + } + lazy_static! { + static ref DOP_LUT: DOpFromArg = { + + let mut dop_from_arg = DOpFromArg{ + asm: HashMap::new(), + hex: HashMap::new(), + }; + + $( + dop_from_arg.asm.insert(stringify!([< $asm:upper >]).to_string(), []::from_args); + dop_from_arg.hex.insert(u8::from($opcode), []::from_hex); + )* + dop_from_arg + }; + } + } + }; +} diff --git a/backends/tfhe-hpu-backend/src/asm/dop/field.rs b/backends/tfhe-hpu-backend/src/asm/dop/field.rs new file mode 100644 index 000000000..af45d80c4 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/field.rs @@ -0,0 +1,168 @@ +//! List of DOp field +//! Mainly thin wrapper over basic type to enforce correct used of asm fields + +// Retrieved CtId definition +// This definition is on the boundaries between IOp and DOp and thus define in the top. 
+use super::opcode::Opcode; +use crate::asm::CtId; + +/// Register argument +/// Direct mapping of value to register Id +/// 7bits wide -> 128 registers +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, Default, +)] +pub struct RegId(pub u8); + +impl std::ops::Add for RegId { + type Output = RegId; + fn add(self, rhs: usize) -> Self::Output { + RegId(self.0 + (rhs as u8)) // NB: rhs is truncated to u8 before the addition + } +} + +impl std::fmt::Display for RegId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "R{}", self.0) + } +} + +/// MulFactor argument (plain multiplication factor, displayed without the register `R` prefix) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct MulFactor(pub u8); + +impl std::fmt::Display for MulFactor { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Memory arguments +/// Have multiple modes for proper support of template addressing +/// Templates enable runtime replacement of MemId with associated Top-level arguments +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub enum MemId { + Addr(CtId), + Heap { bid: u16 }, + Src { tid: u8, bid: u8 }, + Dst { tid: u8, bid: u8 }, +} + +impl MemId { + pub fn new_heap(bid: u16) -> Self { + Self::Heap { bid } + } + pub fn new_dst(tid: u8, bid: u8) -> Self { + Self::Dst { tid, bid } + } + pub fn new_src(tid: u8, bid: u8) -> Self { + Self::Src { tid, bid } + } +} + +impl std::fmt::Display for MemId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + MemId::Addr(addr) => write!(f, "@0x{:x}", addr.0), + MemId::Heap { bid } => write!(f, "TH.{bid}"), + MemId::Src { tid, bid } => write!(f, "TS[{tid}].{bid}"), + MemId::Dst { tid, bid } => write!(f, "TD[{tid}].{bid}"), + } + } +} + +/// Immediate arguments +/// Either a constant value or a template reference +/// Templates enable runtime replacement of ImmId with associated Top-level arguments 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum ImmId { + Cst(u16), + Var { tid: u8, bid: u8 }, +} + +impl ImmId { + /// Create a new immediate template (`ImmId::Var`) + pub fn new_var(tid: u8, bid: u8) -> Self { + Self::Var { tid, bid } + } +} + +impl std::fmt::Display for ImmId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + ImmId::Cst(val) => write!(f, "{val}"), + ImmId::Var { tid, bid } => write!(f, "TI[{tid}].{bid}"), + } + } +} + +/// Pbs argument +/// Direct mapping to PBS Gid +/// 12bits wide -> 4096 lut entries +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct PbsGid(pub u16); + +impl std::fmt::Display for PbsGid { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Pbs{}", self.0) + } +} + +/// Sync argument +/// Currently unused +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct SyncId(pub u32); + +impl std::fmt::Display for SyncId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// PeArith instructions +/// Arithmetic operation that uses one destination register and two source registers +/// Also has an extra mul_factor field for MAC insn +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct PeArithInsn { + pub dst_rid: RegId, + pub src0_rid: RegId, + pub src1_rid: RegId, + pub mul_factor: MulFactor, + pub opcode: Opcode, +} + +/// PeArithMsg instructions +/// Arithmetic operation that uses one destination register, one source register and an immediate +/// value +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct PeArithMsgInsn { + pub dst_rid: RegId, + pub src_rid: RegId, + pub msg_cst: ImmId, + pub opcode: Opcode, +} + +/// PeMem instructions +/// LD/ST operation with one register and one memory slot +#[derive(Clone, Copy, 
Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct PeMemInsn { + pub rid: RegId, + pub slot: MemId, + pub opcode: Opcode, +} + +/// PePbs instructions +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct PePbsInsn { + pub dst_rid: RegId, + pub src_rid: RegId, + pub gid: PbsGid, + pub opcode: Opcode, +} + +/// PeSync instructions +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct PeSyncInsn { + pub sid: SyncId, + pub opcode: Opcode, +} diff --git a/backends/tfhe-hpu-backend/src/asm/dop/fmt.rs b/backends/tfhe-hpu-backend/src/asm/dop/fmt.rs new file mode 100644 index 000000000..48e68c623 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/fmt.rs @@ -0,0 +1,245 @@ +//! +//! Define binary format encoding of instructions +//! Relies on the `bitfield_struct` crate to define bit-accurate insn formats and enable serde to +//! byte-stream +//! +//! Provide conversion implementation between raw bitfield and DOp types +use bitfield_struct::bitfield; + +use super::*; + +// List of DOp formats with their associated encoding +// NB: typedef couldn't be used in bitfield_struct macro. Thus the macros rely on u32 instead of +// DOpRepr... 
+// ------------------------------------------------------------------------------------------------ +/// Raw type used for encoding +pub type DOpRepr = u32; + +#[enum_dispatch] +pub trait ToHex { + fn to_hex(&self) -> DOpRepr; +} + +/// DOp raw encoding used for Opcode extraction +#[bitfield(u32)] +pub struct DOpRawHex { + #[bits(26)] + _reserved: u32, + #[bits(6)] + pub opcode: u8, +} + +/// PeArith instructions +/// Arithmetic operation that use one destination register and two sources register +/// Have also an extra mul_factor field for MAC insn +#[bitfield(u32)] +pub struct PeArithHex { + #[bits(7)] + dst_rid: u8, + #[bits(7)] + src0_rid: u8, + #[bits(7)] + src1_rid: u8, + #[bits(5)] + mul_factor: u8, + #[bits(6)] + opcode: u8, +} + +impl From<&PeArithInsn> for PeArithHex { + fn from(value: &PeArithInsn) -> Self { + Self::new() + .with_dst_rid(value.dst_rid.0) + .with_src0_rid(value.src0_rid.0) + .with_src1_rid(value.src1_rid.0) + .with_mul_factor(value.mul_factor.0) + .with_opcode(value.opcode.into()) + } +} +impl From<&PeArithHex> for PeArithInsn { + fn from(value: &PeArithHex) -> Self { + Self { + dst_rid: RegId(value.dst_rid()), + src0_rid: RegId(value.src0_rid()), + src1_rid: RegId(value.src1_rid()), + mul_factor: MulFactor(value.mul_factor()), + opcode: Opcode::from(value.opcode()), + } + } +} + +/// PeaMsg instructions +/// Arithmetic operation that use one destination register, one source register and an immediat +/// value +#[bitfield(u32)] +pub struct PeArithMsgHex { + #[bits(7)] + dst_rid: u8, + #[bits(7)] + src_rid: u8, + #[bits(1)] + msg_mode: bool, + #[bits(11)] + msg_cst: u16, + #[bits(6)] + opcode: u8, +} +// Define encoding for msg_mode +const IMM_CST: bool = false; +const IMM_VAR: bool = true; + +impl From<&PeArithMsgInsn> for PeArithMsgHex { + fn from(value: &PeArithMsgInsn) -> Self { + let (mode, cst) = match value.msg_cst { + ImmId::Cst(cst) => (IMM_CST, cst), + ImmId::Var { tid, bid } => (IMM_VAR, (((tid as u16) << 8) + bid as u16)), + 
}; + + Self::new() + .with_dst_rid(value.dst_rid.0) + .with_src_rid(value.src_rid.0) + .with_msg_mode(mode) + .with_msg_cst(cst) + .with_opcode(value.opcode.into()) + } +} + +impl From<&PeArithMsgHex> for PeArithMsgInsn { + fn from(value: &PeArithMsgHex) -> Self { + let msg_cst = match value.msg_mode() { + IMM_CST => ImmId::Cst(value.msg_cst()), + IMM_VAR => ImmId::new_var((value.msg_cst() >> 8) as u8, (value.msg_cst() & 0xff) as u8), + }; + + Self { + dst_rid: RegId(value.dst_rid()), + src_rid: RegId(value.src_rid()), + msg_cst, + opcode: Opcode::from(value.opcode()), + } + } +} + +/// PeMem instructions +/// LD/St operation with one register and one memory slot +#[bitfield(u32)] +pub struct PeMemHex { + #[bits(7)] + rid: u8, + #[bits(1)] + _pad: u8, + #[bits(2)] + mode: u8, + #[bits(16)] + slot: u16, + #[bits(6)] + opcode: u8, +} + +// Define encoding for mem_mode +const MEM_ADDR: u8 = 0x0; +const MEM_HEAP: u8 = 0x1; +const MEM_SRC: u8 = 0x2; +const MEM_DST: u8 = 0x3; + +impl From<&PeMemInsn> for PeMemHex { + fn from(value: &PeMemInsn) -> Self { + let (mode, slot) = match value.slot { + MemId::Addr(ct_id) => (MEM_ADDR, ct_id.0), + MemId::Heap { bid } => (MEM_HEAP, bid), + MemId::Src { tid, bid } => (MEM_SRC, ((tid as u16) << 8) + bid as u16), + MemId::Dst { tid, bid } => (MEM_DST, ((tid as u16) << 8) + bid as u16), + }; + + Self::new() + .with_rid(value.rid.0) + .with_mode(mode) + .with_slot(slot) + .with_opcode(value.opcode.into()) + } +} + +impl From<&PeMemHex> for PeMemInsn { + fn from(value: &PeMemHex) -> Self { + let slot = if MEM_ADDR == value.mode() { + MemId::Addr(crate::asm::CtId(value.slot())) + } else if MEM_HEAP == value.mode() { + MemId::Heap { bid: value.slot() } + } else if MEM_SRC == value.mode() { + MemId::Src { + tid: (value.slot() >> 8) as u8, + bid: (value.slot() & 0xff) as u8, + } + } else if MEM_DST == value.mode() { + MemId::Dst { + tid: (value.slot() >> 8) as u8, + bid: (value.slot() & 0xff) as u8, + } + } else { + panic!("Unsupported 
memory mode") + }; + + Self { + rid: RegId(value.rid()), + slot, + opcode: Opcode::from(value.opcode()), + } + } +} + +/// PePbs instructions +#[bitfield(u32)] +pub struct PePbsHex { + #[bits(7)] + dst_rid: u8, + #[bits(7)] + src_rid: u8, + #[bits(12)] + gid: u16, + #[bits(6)] + opcode: u8, +} + +impl From<&PePbsInsn> for PePbsHex { + fn from(value: &PePbsInsn) -> Self { + Self::new() + .with_dst_rid(value.dst_rid.0) + .with_src_rid(value.src_rid.0) + .with_gid(value.gid.0) + .with_opcode(value.opcode.into()) + } +} +impl From<&PePbsHex> for PePbsInsn { + fn from(value: &PePbsHex) -> Self { + Self { + dst_rid: RegId(value.dst_rid()), + src_rid: RegId(value.src_rid()), + gid: PbsGid(value.gid()), + opcode: Opcode::from(value.opcode()), + } + } +} + +/// PeSync instructions +#[bitfield(u32)] +pub struct PeSyncHex { + #[bits(26)] + sid: u32, + #[bits(6)] + opcode: u8, +} +impl From<&PeSyncInsn> for PeSyncHex { + fn from(value: &PeSyncInsn) -> Self { + Self::new() + .with_sid(value.sid.0) + .with_opcode(value.opcode.into()) + } +} +impl From<&PeSyncHex> for PeSyncInsn { + fn from(value: &PeSyncHex) -> Self { + Self { + sid: SyncId(value.sid()), + opcode: Opcode::from(value.opcode()), + } + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/dop/mod.rs b/backends/tfhe-hpu-backend/src/asm/dop/mod.rs new file mode 100644 index 000000000..f90284551 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/mod.rs @@ -0,0 +1,531 @@ +pub mod arg; +mod dop_macro; +pub mod field; +pub mod fmt; +mod opcode; +pub mod pbs_macro; + +use lazy_static::lazy_static; +use std::collections::HashMap; + +use crate::{dop, impl_dop, impl_dop_parser}; +pub use arg::{FromAsm, IsFlush, ParsingError, ToAsm, ToFlush}; +pub use field::{ + ImmId, MemId, MulFactor, PbsGid, PeArithInsn, PeArithMsgInsn, PeMemInsn, PePbsInsn, PeSyncInsn, + RegId, SyncId, +}; +pub use fmt::{ + DOpRawHex, DOpRepr, PeArithHex, PeArithMsgHex, PeMemHex, PePbsHex, PeSyncHex, ToHex, +}; +pub use opcode::{DOpType, Opcode}; + 
+dop!( + // Arith operation + ["ADD", opcode::Opcode::ADD(), PeArithInsn], + ["SUB", opcode::Opcode::SUB(), PeArithInsn], + ["MAC", opcode::Opcode::MAC(), PeArithInsn{mul_factor}], + + // ArithMsg operation + ["ADDS", opcode::Opcode::ADDS(), PeArithMsgInsn], + ["SUBS", opcode::Opcode::SUBS(), PeArithMsgInsn], + ["SSUB", opcode::Opcode::SSUB(), PeArithMsgInsn], + ["MULS", opcode::Opcode::MULS(), PeArithMsgInsn], + + // Ld/st operation + ["LD", opcode::Opcode::LD(), PeMemInsn{ld}], + ["ST", opcode::Opcode::ST(), PeMemInsn{st}] + + // Pbs operation + ["PBS", opcode::Opcode::PBS(1), PePbsInsn, "_F"], + ["PBS_ML2", opcode::Opcode::PBS(2), PePbsInsn, "_F"], + ["PBS_ML4", opcode::Opcode::PBS(4), PePbsInsn, "_F"], + ["PBS_ML8", opcode::Opcode::PBS(8), PePbsInsn, "_F"], + + // Pbs flush operation + ["PBS_F", opcode::Opcode::PBS_F(1), PePbsInsn], + ["PBS_ML2_F", opcode::Opcode::PBS_F(2), PePbsInsn], + ["PBS_ML4_F", opcode::Opcode::PBS_F(4), PePbsInsn], + ["PBS_ML8_F", opcode::Opcode::PBS_F(8), PePbsInsn], + + // Sync operation + ["SYNC", opcode::Opcode::SYNC(), PeSyncInsn], +); + +#[derive(Debug, Clone, Copy)] +pub struct DigitParameters { + pub msg_w: usize, + pub carry_w: usize, +} + +impl DigitParameters { + /// Msg field only + pub fn msg_mask(&self) -> usize { + (1 << self.msg_w) - 1 + } + /// Carry field only + pub fn carry_mask(&self) -> usize { + ((1 << (self.carry_w)) - 1) << self.msg_w + } + /// Padding bit only + pub fn padding_mask(&self) -> usize { + 1 << (self.carry_w + self.msg_w) + } + + /// carry + msg fields only + pub fn data_mask(&self) -> usize { + self.carry_mask() | self.msg_mask() + } + /// Padding + carry + msg fields + pub fn raw_mask(&self) -> usize { + self.padding_mask() | self.data_mask() + } + + /// Message range (used for neg operation) + pub fn msg_range(&self) -> usize { + 1 << self.msg_w + } + + /// Compute available linear operation based on carry_w/msg_w + // TODO: Find a proper way to have nu < carry_w (i.e ManyLutPbs case) + pub fn 
nu(&self) -> usize { + (self.carry_mask() + self.msg_mask()) / self.msg_mask() + } + + pub fn total_width(&self) -> usize { + self.msg_w + self.carry_w + } +} + +/// Base trait to depict an Pbs function +/// Provides a set of method to reason about pbs +#[enum_dispatch] +pub trait PbsLut { + fn name(&self) -> &'static str; + fn gid(&self) -> PbsGid; + fn lut_nb(&self) -> u8; + fn lut_lg(&self) -> u8; + fn fn_at(&self, pos: usize, params: &DigitParameters, val: usize) -> usize; + fn deg_at(&self, pos: usize, params: &DigitParameters, deg: usize) -> usize; + // Blanket implementation + fn lut_msk(&self) -> usize { + usize::MAX << self.lut_lg() + } +} + +use crate::{impl_pbs, pbs}; +use enum_dispatch::enum_dispatch; +use pbs_macro::{CMP_EQUAL, CMP_INFERIOR, CMP_SUPERIOR}; + +pbs!( +["None" => 0 [ + @0 =>{ + |_params: &DigitParameters, val | val; + |_params: &DigitParameters, deg| deg; + } +]], +["MsgOnly" => 1 [ + @0 =>{ + |params: &DigitParameters, val | val & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["CarryOnly" => 2 [ + @0 =>{ + |params: &DigitParameters, val | val & params.carry_mask(); + |params: &DigitParameters, _deg| params.carry_mask(); + } +]], +["CarryInMsg" => 3 [ + @0 =>{ + |params: &DigitParameters, val | (val & params.carry_mask()) >> params.msg_w; + |params: &DigitParameters, _deg| params.msg_mask(); + } +]] +["MultCarryMsg" => 4 [ + @0 =>{ + |params: &DigitParameters, val | (((val & params.carry_mask()) >> params.msg_w) * (val & params.msg_mask())) & params.data_mask(); + |params: &DigitParameters, _deg| params.data_mask(); + } +]], +["MultCarryMsgLsb" => 5 [ + @0 =>{ + |params: &DigitParameters, val | (((val & params.carry_mask()) >> params.msg_w) * (val & params.msg_mask())) & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + }, +]], +["MultCarryMsgMsb" => 6 [ + @0 =>{ + |params: &DigitParameters, val | ((((val & params.carry_mask()) >> params.msg_w) * (val & params.msg_mask())) >> 
params.msg_w) & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["BwAnd" => 7 [ + @0 =>{ + |params: &DigitParameters, val | (((val & params.carry_mask()) >> params.msg_w) & (val & params.msg_mask())) & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["BwOr" => 8 [ + @0 =>{ + |params: &DigitParameters, val | (((val & params.carry_mask()) >> params.msg_w) | (val & params.msg_mask())) & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["BwXor" => 9 [ + @0 =>{ + |params: &DigitParameters, val | (((val & params.carry_mask()) >> params.msg_w) ^ (val & params.msg_mask())) & params.msg_mask(); + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], + +["CmpSign" => 10 [ + @0 =>{ + |_params: &DigitParameters, val | { + // Signed comparison with 0. Based on behavior of negacyclic function. + // Example for Padding| 4bit digits (i.e 2msg2Carry) + // 1|xxxx -> SignLut -> -1 -> 0|1111 + // x|0000 -> SignLut -> 0 -> 0|0000 + // 0|xxxx -> SignLut -> 1 -> 0|0001 + if val != 0 {1} else {0} + }; + // WARN: in practice return value with padding that could encode -1, 0, 1 + // But should always be follow by an add to reach back range 0, 1, 2 + // To ease degree handling considered an output degree of 1 to obtain + // degree 2 after add + // Not a perfect solution but the easiest to prevent degree error + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpReduce" => 11 [ + @0 =>{ + |params: &DigitParameters, val | { + // Carry contain MSB cmp result, msg LSB cmp result + // Reduction is made from lsb to msb as follow + // MSB | LSB | Out + // Inferior | x | Inferior + // Equal | x | x + // Superior | x | Superior + let carry_field = (val & params.carry_mask()) >> params.msg_w; + let msg_field = val & params.msg_mask(); + + match (carry_field, msg_field) { + (CMP_EQUAL, lsb_cmp) => lsb_cmp, + _ => carry_field + } + }; + |_params: &DigitParameters, _deg| 2; + } +]] + +["CmpGt" 
=> 12 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_SUPERIOR => 1, + _ => 0, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpGte" => 13 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_SUPERIOR | CMP_EQUAL => 1, + _ => 0, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +// Could be merge with Gt/Gte +["CmpLt" => 14 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_INFERIOR => 1, + _ => 0, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpLte" => 15 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_INFERIOR | CMP_EQUAL => 1, + _ => 0, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpEq" => 16 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_EQUAL => 1, + _ => 0, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpNeq" => 17 [ + @0 =>{ + |params: &DigitParameters, val | match val & params.msg_mask() { + CMP_EQUAL => 0, + _ => 1, + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["ManyGenProp" => 18 [ // Turns carry save into a generate/propagate pair and message with manyLUT + @0 =>{ + |params: &DigitParameters, val| { val & params.msg_mask()}; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @1 =>{ + |params: &DigitParameters, val| { + ((val & params.carry_mask()) >> (params.msg_w)) << 1| // Generate + (((val & params.msg_mask()) == params.msg_mask()) as usize) // Propagate + }; + |_params: &DigitParameters, _deg| 3; + } +]], +["ReduceCarry2" => 19 [ // Reduces a carry propagation add to two bits from an + // input in which the carry is in the second bit. 
+ @0 =>{ + |_params: &DigitParameters, val | { + let carry = val >> 2; + let prop = (val & 3 == 3) as usize; + (carry << 1) | prop + }; + |_params: &DigitParameters, _deg| 3; + } +]], +["ReduceCarry3" => 20 [ // Reduces a carry propagation add to two bits from an + // input in which the carry is in the third bit. + @0 =>{ + |_params: &DigitParameters, val | { + let carry = val >> 3; + let prop = (val & 7 == 7) as usize; + (carry << 1) | prop + }; + |_params: &DigitParameters, _deg| 3; + } +]], +["ReduceCarryPad" => 21 [ // Reduces a carry propagation add to two bits from an + // input in which the carry is in the padding bit. + @0 =>{ + |params: &DigitParameters, val | { + if val == params.data_mask() { + 0 + } else { + params.raw_mask() + } + }; + |params: &DigitParameters, _deg| params.raw_mask(); + } +]], +["GenPropAdd" => 22 [ // Adds a generate/propagate pair with a message modulus message + @0 =>{ + |params: &DigitParameters, val | { + let lhs = val & params.msg_mask(); + let rhs = (val & params.carry_mask()) >> params.msg_w; + let rhs_gen = rhs >> 1; + (lhs + rhs_gen) & params.msg_mask() + }; + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], + +["IfTrueZeroed" => 23 [ // Ct must contain CondCt in Carry and ValueCt in Msg. If condition it's *TRUE*, value ct is forced to 0 + @0 =>{ + |params: &DigitParameters, val | { + let value = val & params.msg_mask(); + let cond = (val & params.carry_mask()) >> params.msg_w; + if cond != 0 {0} else {value} + }; + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["IfFalseZeroed" => 24 [ // Ct must contain CondCt in Carry and ValueCt in Msg. 
If condition it's *FALSE*, value ct is forced to 0 + @0 =>{ + |params: &DigitParameters, val | { + let value = val & params.msg_mask(); + let cond = (val & params.carry_mask()) >> params.msg_w; + if cond != 0 {value} else {0} + }; + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], +["Ripple2GenProp" => 25 [ // Converts from Ripple carry to GenProp + @0 =>{ + |params: &DigitParameters, val | { + (val & params.msg_mask()) * 2 + }; + |params: &DigitParameters, _deg| params.msg_mask(); + } +]], + +// Below Pbs are defined for Test only +["TestMany2" => 128 [ + @0 =>{ + |_params: &DigitParameters, val | val; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @1 =>{ + |_params: &DigitParameters, val | val +1; + |params: &DigitParameters, _deg| params.msg_mask(); + }, +]], +["TestMany4" => 129 [ + @0 =>{ + |_params: &DigitParameters, val | val; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @1 =>{ + |_params: &DigitParameters, val | val +1; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @2 =>{ + |_params: &DigitParameters, val | val +2; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @3 =>{ + |_params: &DigitParameters, val | val +3; + |params: &DigitParameters, _deg| params.msg_mask(); + }, +]], +["TestMany8" => 130 [ + @0 =>{ + |_params: &DigitParameters, val | val; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @1 =>{ + |_params: &DigitParameters, val | val +1; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @2 =>{ + |_params: &DigitParameters, val | val +2; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @3 =>{ + |_params: &DigitParameters, val | val +3; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @4 =>{ + |_params: &DigitParameters, val | val +4; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @5 =>{ + |_params: &DigitParameters, val | val +5; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @6 =>{ + |_params: 
&DigitParameters, val | val +6; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @7 =>{ + |_params: &DigitParameters, val | val +7; + |params: &DigitParameters, _deg| params.msg_mask(); + }, +]], +["ManyCarryMsg" => 26 [ // Turns carry save into carry and message with manyLUT + @0 =>{ + |params: &DigitParameters, val| { val & params.msg_mask()}; + |params: &DigitParameters, _deg| params.msg_mask(); + }, + @1 =>{ + |params: &DigitParameters, val| { val >> params.msg_w }; + |params: &DigitParameters, _deg| ((1 << (params.carry_w - 1)) - 1); + } +]], +["CmpGtMrg" => 27 [ + @0 =>{ + |params: &DigitParameters, val | { + let carry_field = (val & params.carry_mask()) >> params.msg_w; + let msg_field = val & params.msg_mask(); + + match (carry_field, msg_field) { + (CMP_SUPERIOR, _) | + (CMP_EQUAL, CMP_SUPERIOR) => 1, + _ => 0, + } + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpGteMrg" => 28 [ + @0 =>{ + |params: &DigitParameters, val | { + let carry_field = (val & params.carry_mask()) >> params.msg_w; + let msg_field = val & params.msg_mask(); + + match (carry_field, msg_field) { + (CMP_SUPERIOR, _) | + (CMP_EQUAL, CMP_SUPERIOR) | + (CMP_EQUAL, CMP_EQUAL) => 1, + _ => 0, + } + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpLtMrg" => 29 [ + @0 =>{ + |params: &DigitParameters, val | { + let carry_field = (val & params.carry_mask()) >> params.msg_w; + let msg_field = val & params.msg_mask(); + + match (carry_field, msg_field) { + (CMP_INFERIOR, _) | + (CMP_EQUAL, CMP_INFERIOR) => 1, + _ => 0, + } + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpLteMrg" => 30 [ + @0 =>{ + |params: &DigitParameters, val | { + let carry_field = (val & params.carry_mask()) >> params.msg_w; + let msg_field = val & params.msg_mask(); + + match (carry_field, msg_field) { + (CMP_INFERIOR, _) | + (CMP_EQUAL, CMP_INFERIOR) | + (CMP_EQUAL, CMP_EQUAL) => 1, + _ => 0, + } + }; + |_params: &DigitParameters, _deg| 1; + } +]], +["CmpEqMrg" => 31 [ + @0 =>{ + 
|params: &DigitParameters, val | {
        let carry_field = (val & params.carry_mask()) >> params.msg_w;
        let msg_field = val & params.msg_mask();

        match (carry_field, msg_field) {
            (CMP_EQUAL, CMP_EQUAL) => 1,
            _ => 0,
        }
    };
    |_params: &DigitParameters, _deg| 1;
  }
]],
["CmpNeqMrg" => 32 [
  @0 =>{
    |params: &DigitParameters, val | {
        let carry_field = (val & params.carry_mask()) >> params.msg_w;
        let msg_field = val & params.msg_mask();

        match (carry_field, msg_field) {
            (CMP_EQUAL, CMP_EQUAL) => 0,
            _ => 1,
        }
    };
    |_params: &DigitParameters, _deg| 1;
  }
]],
);

/// Ceiling of the base-2 logarithm of `value`.
///
/// Returns the smallest `n` such that `2^n >= value`.
///
/// # Panics
/// Panics when `value` is 0, because `u8::ilog2` is undefined for zero.
pub(crate) fn ceil_ilog2(value: &u8) -> u8 {
    // Floor of the logarithm; exact when `value` is a power of two.
    let floor_log = value.ilog2();
    // Any other value needs one extra bit.
    let round_up = u32::from(!value.is_power_of_two());
    (floor_log + round_up) as u8
}
diff --git a/backends/tfhe-hpu-backend/src/asm/dop/opcode.rs b/backends/tfhe-hpu-backend/src/asm/dop/opcode.rs
new file mode 100644
index 000000000..e85d95387
--- /dev/null
+++ b/backends/tfhe-hpu-backend/src/asm/dop/opcode.rs
@@ -0,0 +1,166 @@
//!
//! Define hex encoding for a subset of known DOp
//! 
DOp are defined with two section: {Type, subtype} + +/// Opcode structure +/// Gather DOp type and subtype +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub struct Opcode { + optype: DOpType, + subtype: u8, +} + +/// Define Instruction type as C-like enumeration +/// Types are encoded with 2bits +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum DOpType { + ARITH = 0b00, + SYNC = 0b01, + MEM = 0b10, + PBS = 0b11, +} + +/// Define raw type conversion +/// Opcode is on 6bits +impl From for u8 { + fn from(value: Opcode) -> Self { + (((value.optype as u8) & 0x3) << 4) + value.subtype + } +} +impl From for Opcode { + fn from(value: u8) -> Self { + let subtype = value & 0xf; + let optype_raw = (value >> 4) & 0x3; + let optype = match optype_raw { + x if x == DOpType::ARITH as u8 => DOpType::ARITH, + x if x == DOpType::SYNC as u8 => DOpType::SYNC, + x if x == DOpType::MEM as u8 => DOpType::MEM, + x if x == DOpType::PBS as u8 => DOpType::PBS, + _ => panic!("Invalid DOpType"), + }; + + Self { optype, subtype } + } +} + +/// Implement helper function to create Arith DOp +impl Opcode { + #[allow(non_snake_case)] + pub fn ADD() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b0001, + } + } + #[allow(non_snake_case)] + pub fn SUB() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b0010, + } + } + #[allow(non_snake_case)] + pub fn MAC() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b0101, + } + } +} + +/// Implement helper function to create ArithMsg DOp +impl Opcode { + #[allow(non_snake_case)] + pub fn ADDS() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b1001, + } + } + #[allow(non_snake_case)] + pub fn SUBS() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b1010, + } + } + #[allow(non_snake_case)] + pub fn SSUB() -> Self { + Self { + optype: DOpType::ARITH, + subtype: 0b1011, + } + } + #[allow(non_snake_case)] + pub fn MULS() -> Self { + Self { + 
optype: DOpType::ARITH, + subtype: 0b1100, + } + } +} + +/// Implement helper function to create Sync DOp +impl Opcode { + #[allow(non_snake_case)] + pub fn SYNC() -> Self { + Self { + optype: DOpType::SYNC, + subtype: 0b0000, + } + } +} + +/// Implement helper function to create Memory DOp +impl Opcode { + #[allow(non_snake_case)] + pub fn LD() -> Self { + Self { + optype: DOpType::MEM, + subtype: 0b0000, + } + } + #[allow(non_snake_case)] + pub fn ST() -> Self { + Self { + optype: DOpType::MEM, + subtype: 0b0001, + } + } +} + +/// Implement helper function to create Memory DOp +pub const PBS_HAS_FLUSH: u8 = 0b1000; +impl Opcode { + #[allow(non_snake_case)] + pub fn PBS(lut_nb: u8) -> Self { + let lut_lg = super::ceil_ilog2(&lut_nb); + let subtype = lut_lg & 0x3; + Self { + optype: DOpType::PBS, + subtype, + } + } + + #[allow(non_snake_case)] + pub fn PBS_F(lut_nb: u8) -> Self { + let lut_lg = super::ceil_ilog2(&lut_nb); + let subtype = PBS_HAS_FLUSH + (lut_lg & 0x3); + Self { + optype: DOpType::PBS, + subtype, + } + } +} + +impl Opcode { + pub fn is_flush(&self) -> bool { + (self.optype == DOpType::PBS) && (self.subtype & PBS_HAS_FLUSH) != 0 + } + pub fn to_flush(&self) -> Self { + Self { + subtype: self.subtype | PBS_HAS_FLUSH, + ..*self + } + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/dop/pbs_macro.rs b/backends/tfhe-hpu-backend/src/asm/dop/pbs_macro.rs new file mode 100644 index 000000000..1415df02c --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/dop/pbs_macro.rs @@ -0,0 +1,152 @@ +//! Pbs definition is repetitive +//! +//! A macro rules is used to help with Pbs definition + +pub const CMP_INFERIOR: usize = 0; +pub const CMP_EQUAL: usize = 1; +pub const CMP_SUPERIOR: usize = 2; + +#[macro_export] +macro_rules! impl_pbs { + ( + $pbs: literal => $gid: literal [ + $(@$id:literal => { + $func: expr; + $deg: expr$(;)? + }$(,)?)+ + ] + ) => { + ::paste::paste! 
{ + #[derive(Debug, PartialEq, Eq, Clone)] + pub struct [](); + + impl Default for []{ + fn default() -> Self { + Self () + } + } + + impl PbsLut for [< Pbs $pbs:camel >] { + fn name(&self) -> &'static str { + $pbs + } + fn gid(&self) -> PbsGid { + PbsGid($gid) + } + fn lut_nb(&self) -> u8 { + if let Some(max) = [$($id,)*].iter().max() { + max +1} else {0} + } + fn lut_lg(&self) -> u8 { + ceil_ilog2(&self.lut_nb()) + } + + fn fn_at(&self, pos: usize, params: &DigitParameters, val: usize ) -> usize { + match pos { + $( + $id => ($func)(params, val), + )* + _ => { + // Unspecified -> Default to identity + val + }, + } + } + + fn deg_at(&self, pos: usize, params: &DigitParameters, deg: usize ) -> usize { + match pos { + $( + $id => ($deg)(params, deg), + )* + _ => { + // Unspecified -> Default to identity + deg + }, + } + } + } + } + }; +} + +#[macro_export] +macro_rules! pbs { + ( + $([$pbs: literal => $gid: literal [ + $(@$id:literal => { + $func: expr; + $deg: expr$(;)? + }$(,)?)+] + ] $(,)?)* + ) => { + ::paste::paste! 
{ + $( + impl_pbs!($pbs => $gid [ $(@$id => {$func; $deg;},)*]); + )* + + /// Aggregate Pbs concrete type in one enumeration + #[derive(Debug, Clone, PartialEq, Eq)] + #[enum_dispatch(PbsLut)] + pub enum Pbs{ + $([< $pbs:camel >]([< Pbs $pbs:camel >]),)* + } + + impl std::fmt::Display for Pbs { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "Pbs{}", self.name()) + } + } + + impl std::str::FromStr for Pbs { + type Err = ParsingError; + + fn from_str(name: &str) -> Result { + if let Some(lut) = PBS_LUT.asm.get(name) { + Ok(lut.clone()) + } else { + Err(ParsingError::Unmatch(format!("Pbs{name} unknown"))) + } + + } + } + + impl Pbs { + pub fn from_hex(gid: PbsGid) -> Result { + if let Some(pbs) = PBS_LUT.hex.get(&gid) { + Ok(pbs.clone()) + } else { + Err(ParsingError::Unmatch(format!("Pbs {gid:?} unknown"))) + } + } + + pub fn list_all() -> Vec { + PBS_LUT.hex.values().map(|pbs| pbs.clone()).collect::>() + } + } + + /// Parser utilities + /// Hashmap for Name -> to fromArg impl + struct PbsFromArg{ + asm: HashMap, + hex: HashMap, + } + + lazy_static! { + static ref PBS_LUT: PbsFromArg = { + + let mut pbs_from_arg = PbsFromArg{ + asm: HashMap::new(), + hex: HashMap::new(), + }; + + $( + let pbs = Pbs::[< $pbs:camel >]([< Pbs $pbs >]::default()); + pbs_from_arg.asm.insert(stringify!([< $pbs:camel >]).to_string(), pbs.clone()); + pbs_from_arg.hex.insert(pbs.gid(), pbs); + )* + pbs_from_arg + }; +} + } + }; +} diff --git a/backends/tfhe-hpu-backend/src/asm/iop/arg.rs b/backends/tfhe-hpu-backend/src/asm/iop/arg.rs new file mode 100644 index 000000000..59a589de4 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/arg.rs @@ -0,0 +1,545 @@ +//! +//! Gather IOp argument in a common type +//! 
Provides a FromStr implementation for parsing + +use super::*; +use field::{ + FwMode, IOpHeader, IOpcode, ImmBundle, Immediat, Operand, OperandBlock, OperandBundle, +}; +use lazy_static::lazy_static; + +pub const ASM_OPCODE_WIDTH: usize = 8; + +/// Parsing error +#[derive(thiserror::Error, Debug, Clone)] +pub enum ParsingError { + #[error("Opcode {0} is in in reserved range")] + Opcode(u8), + #[error("Unknown IOp alias {0}")] + Opalias(String), + #[error("Unmatch Asm Operation: {0}")] + Unmatch(String), + #[error("Invalid arguments number: expect {0}, get {1}")] + ArgNumber(usize, usize), + #[error("Invalid arguments type: expect {0}, get {1}")] + ArgType(String, Arg), + #[error("Invalid arguments: {0}")] + InvalidArg(String), + #[error("Empty line")] + Empty, +} + +// Asm arguments are slightly different that hex word +// Thus we can't directly mapped ASM arg to fmt structure +// Below, we define a set of arguments for parsing purpose + +/// Define fixed inner IOp (opposed as user available IOP) +/// Those IOp are generated by the Fw and have a fixed number of arguments +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct IOpFormat { + pub name: String, + pub opcode: IOpcode, + pub proto: IOpProto, +} + +/// Opcode asm parsing utility +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct AsmIOpcode { + pub(crate) opcode: IOpcode, + pub(crate) format: Option, +} + +impl AsmIOpcode { + pub fn from_opcode(opcode: IOpcode) -> Self { + if let Some(alias) = IOP_LUT.hex.get(&opcode) { + Self { + opcode, + format: Some(alias.clone()), + } + } else { + Self { + opcode, + format: None, + } + } + } + pub fn opcode(&self) -> IOpcode { + self.opcode + } + + pub fn format(&self) -> Option<&IOpFormat> { + self.format.as_ref() + } + pub fn has_imm(&self) -> bool { + if let Some(alias) = self.format.as_ref() { + alias.proto.imm != 0 + } else { + false + } + } +} + +impl std::fmt::Display for AsmIOpcode { + fn fmt(&self, f: &mut 
std::fmt::Formatter) -> std::fmt::Result { + let name = if let Some(alias) = &self.format { + &alias.name + } else { + &format!("IOP[0x{:x}]", self.opcode.0) + }; + write!(f, "{name: for AsmIOpcode { + fn from(opcode: IOpcode) -> Self { + if let Some(alias) = IOP_LUT.hex.get(&opcode) { + Self { + opcode, + format: Some(alias.clone()), + } + } else { + Self { + opcode, + format: None, + } + } + } +} + +/// Extract AsmOpcode from IOp +/// This is used from proper rendering in asm +impl From<&IOp> for AsmIOpcode { + fn from(iop: &IOp) -> Self { + Self::from(iop.header.opcode) + } +} + +impl std::str::FromStr for AsmIOpcode { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + lazy_static! { + static ref OPCODE_ARG_RE: regex::Regex = regex::Regex::new( + r"(?^IOP\[((?0x[0-9a-fA-F]+)|(?[0-9]+))\])|^(?\w+)" + ) + .expect("Invalid regex"); + } + + if let Some(caps) = OPCODE_ARG_RE.captures(s) { + if let Some(_raw) = caps.name("raw") { + let value = if let Some(raw_val) = caps.name("val") { + raw_val + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + let raw_hex_val = caps.name("hex_val").unwrap(); + u8::from_str_radix(&raw_hex_val.as_str()[2..], 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? 
+ }; + if (opcode::USER_RANGE_LB..=opcode::USER_RANGE_UB).contains(&value) { + Ok(AsmIOpcode { + opcode: IOpcode(value), + format: None, + }) + } else { + Err(ParsingError::Opcode(value)) + } + } else if let Some(alias) = caps.name("alias") { + if let Some(alias) = IOP_LUT.asm.get(alias.as_str()) { + Ok(AsmIOpcode { + opcode: alias.opcode, + format: Some(alias.clone()), + }) + } else { + Err(ParsingError::Opalias(alias.as_str().to_string())) + } + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } +} + +impl From<&AsmIOpcode> for IOpcode { + fn from(asm: &AsmIOpcode) -> Self { + asm.opcode + } +} + +/// Properties asm parsing utility +#[derive(Debug, Clone)] +pub struct Properties { + fw_mode: FwMode, + dst_align: OperandBlock, + src_align: OperandBlock, +} + +impl std::fmt::Display for Properties { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let mode = match self.fw_mode { + FwMode::Static => "", + FwMode::Dynamic => "dyn ", + }; + write!( + f, + "{}I{} I{}", + mode, + (self.dst_align.0 + 1) * MSG_WIDTH, + (self.src_align.0 + 1) * MSG_WIDTH, + ) + } +} + +/// Extract properties from IOpHeader +impl From<&IOpHeader> for Properties { + fn from(value: &IOpHeader) -> Self { + Self { + fw_mode: value.fw_mode, + dst_align: value.dst_align, + src_align: value.src_align, + } + } +} + +impl std::str::FromStr for Properties { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + lazy_static! 
{ + static ref PROPERTIES_ARG_RE: regex::Regex = + regex::Regex::new(r"(?dyn)?\s*I(?\d+)\s*I(?\d+)") + .expect("Invalid regex"); + } + + if let Some(caps) = PROPERTIES_ARG_RE.captures(s) { + let fw_mode = if caps.name("fw").is_some() { + FwMode::Dynamic + } else { + FwMode::Static + }; + let src_width = caps["src"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + let src_align = OperandBlock::new((src_width / MSG_WIDTH as u16) as u8); + let dst_width = caps["dst"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + let dst_align = OperandBlock::new((dst_width / MSG_WIDTH as u16) as u8); + Ok(Properties { + fw_mode, + dst_align, + src_align, + }) + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } +} + +impl std::fmt::Display for Operand { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // Block/vec_size are zeroed indexed value + // -> Transform them in one indexed for human readability + let block = self.block.0 + 1; + let vec_size = self.vec_size.0 + 1; + if vec_size != 1 { + write!( + f, + "I{}[{}]@0x{:0>2x}", + block * MSG_WIDTH, + vec_size, + self.base_cid.0, + ) + } else { + write!(f, "I{}@0x{:0>2x}", block * MSG_WIDTH, self.base_cid.0,) + } + } +} + +// OperandBundle +// Addr are packed in <> in the ASM format and thus we only parse them by bundle +impl std::fmt::Display for OperandBundle { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{}", + self.iter() + .fold(" ".to_string(), |acc, x| format!("{acc}{x} ")) + .trim() + ) + } +} + +impl std::str::FromStr for OperandBundle { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + lazy_static! 
{ + static ref ADDR_ARG_RE: regex::Regex = + regex::Regex::new(r"I(?\d+)((?\[\s*(?\d+)\s*\]@(0x(?[0-9a-fA-F]+)|(?\d+))\s*)|(?@(0x(?[0-9a-fA-F]+)|(?\d+))\s*))") + .expect("Invalid regex"); + } + let mut operands = ADDR_ARG_RE + .captures_iter(s) + .map(|caps| { + let width = caps["width"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + let block = (width / MSG_WIDTH as u16) as u8; + + if let Some(_vec) = caps.name("vec") { + let base_cid = if let Some(raw_cid) = caps.name("vec_cid") { + raw_cid + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + let raw_hex_cid = caps.name("vec_hex_cid").unwrap(); + u16::from_str_radix(raw_hex_cid.as_str(), 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + }; + let len = caps["vec_len"] + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))?; + + Ok(Operand::new(block, base_cid, len, None)) + } else if let Some(_single) = caps.name("single") { + let base_cid = if let Some(raw_cid) = caps.name("cid") { + raw_cid + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + u16::from_str_radix(&caps["hex_cid"], 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? 
+ }; + Ok(Operand::new(block, base_cid, 1, None)) + } else { + return Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))); + } + }) + .collect::, ParsingError>>()?; + + // Empty OperandBundle is considered as parsing error + if operands.is_empty() { + Err(ParsingError::Unmatch(format!( + "Invalid argument: Empty OperandBundle {s}" + ))) + } else { + // Update is_last token + operands.last_mut().unwrap().is_last = true; + Ok(operands.into()) + } + } +} + +// ImmBundle +// Imm are packed in <> in the ASM format and thus we only parse them by bundle +impl std::fmt::Display for ImmBundle { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "{}", + self.iter() + .fold(" ".to_string(), |acc, x| format!( + "{acc}0x{:x} ", + x.cst_value() + )) + .trim() + ) + } +} + +impl std::str::FromStr for ImmBundle { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + lazy_static! { + static ref IMM_ARG_RE: regex::Regex = + regex::Regex::new(r"(0x(?[0-9a-fA-F]+))|(?\d+)") + .expect("Invalid regex"); + } + let mut imms = IMM_ARG_RE + .captures_iter(s) + .map(|caps| { + let imm = if let Some(raw_imm) = caps.name("imm") { + raw_imm + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? + } else { + // One of them must match, otherwise error will be arose before + let raw_hex_imm = caps.name("hex_imm").unwrap(); + u128::from_str_radix(raw_hex_imm.as_str(), 16) + .map_err(|err| ParsingError::InvalidArg(err.to_string()))? 
+ }; + + Ok(Immediat::from_cst(imm)) + }) + .collect::, ParsingError>>()?; + + // Empty ImmBundle is considered as parsing error + if imms.is_empty() { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } else { + // Update is_last token + imms.last_mut().unwrap().is_last = true; + Ok(imms.into()) + } + } +} + +/// Generic arguments +/// Used to pack argument under the same type +#[derive(Debug, Clone)] +pub enum Arg { + Opcode(AsmIOpcode), + Properties(Properties), + Operand(OperandBundle), + Imm(ImmBundle), +} + +/// Use Display trait to convert into asm human readable file +/// Simply defer to inner type display impl while forcing the display width +impl std::fmt::Display for Arg { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Arg::Opcode(inner) => write!(f, "{inner}"), + Arg::Properties(inner) => write!(f, "{inner}"), + Arg::Operand(inner) => write!(f, "{inner}"), + Arg::Imm(inner) => write!(f, "{inner}"), + } + } +} + +/// Use FromStr trait to decode from asm file +impl std::str::FromStr for Arg { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + match ( + OperandBundle::from_str(s), + AsmIOpcode::from_str(s), + Properties::from_str(s), + ImmBundle::from_str(s), + ) { + (Ok(operand), ..) => Ok(Self::Operand(operand)), + (Err(_), Ok(opcode), ..) => Ok(Self::Opcode(opcode)), + (Err(_), Err(_), Ok(props), ..) 
=> Ok(Self::Properties(props)), + (Err(_), Err(_), Err(_), Ok(imm)) => Ok(Self::Imm(imm)), + (Err(addr), Err(opcode), Err(props), Err(imm)) => Err(ParsingError::Unmatch(format!( + "{s}: + Addr failed with{addr} + Opcode failed with{opcode} + Props failed with{props} + Imm failed with{imm} + " + ))), + } + } +} + +impl std::fmt::Display for IOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + let opcode = AsmIOpcode::from(self); + write!(f, "{opcode}")?; + + let props = Properties::from(&self.header); + write!(f, " <{props}>")?; + + // Destination operands list + write!(f, " <{}>", self.dst)?; + + // Source operands list + write!(f, " <{}>", self.src)?; + + // Immediat operands list [Optional] + if self.header.has_imm { + write!(f, " <{}>", self.imm)?; + } + + Ok(()) + } +} + +/// Use FromStr trait to decode from asm file +impl std::str::FromStr for IOp { + type Err = ParsingError; + + #[tracing::instrument(level = "trace", ret)] + fn from_str(s: &str) -> Result { + lazy_static! { + static ref IOP_RE: regex::Regex = regex::Regex::new( + r"^(?\S+)\s*(?<.*?>)\s*(?<.*?>)\s*(?<.*?>)\s*(?<.*?>)?" 
+ ) + .expect("Invalid regex"); + } + + if let Some(caps) = IOP_RE.captures(s) { + let opcode = AsmIOpcode::from_str(caps["opcode"].trim_matches(['<', '>', ' ']))?; + let props = Properties::from_str(caps["props"].trim_matches(['<', '>', ' ']))?; + let dst = { + let mut bundle = + OperandBundle::from_str(caps["dst"].trim_matches(['<', '>', ' ']))?; + bundle.set_kind(OperandKind::Dst); + bundle + }; + let src = { + let mut bundle = + OperandBundle::from_str(caps["src"].trim_matches(['<', '>', ' ']))?; + bundle.set_kind(OperandKind::Src); + bundle + }; + let (imm, has_imm) = if let Some(imm) = caps.name("imm") { + ( + ImmBundle::from_str(imm.as_str().trim_matches(['<', '>', ' ']))?, + true, + ) + } else { + (ImmBundle::from(vec![]), false) + }; + + // Aggregate some fields together to build real IOp + let header = IOpHeader { + fw_mode: props.fw_mode, + has_imm, + opcode: opcode.opcode, + dst_align: props.dst_align, + src_align: props.src_align, + }; + + Ok(IOp { + header, + dst, + src, + imm, + }) + } else { + Err(ParsingError::Unmatch(format!( + "Invalid argument format {s}" + ))) + } + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/iop/field.rs b/backends/tfhe-hpu-backend/src/asm/iop/field.rs new file mode 100644 index 000000000..7f9d6f418 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/field.rs @@ -0,0 +1,540 @@ +//! List of IOp field +//! 
Mainly thin wrapper over basic type to enforce correct used of asm fields
use super::*;
use crate::asm::CtId;

use thiserror::Error;

/// Parsing error
#[derive(Error, Debug, Clone)]
pub enum HexParsingError {
    #[error("Invalid header")]
    Header,
    #[error("Invalid Operand Kind: {0}")]
    Kind(String),
    #[error("Invalid operand blocks")]
    Block,
    #[error("Incomplete stream")]
    EmptyStream,
}

// Vectorized ciphertext operands
// ------------------------------------------------------------------------------------------------
/// Type of the operands
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum OperandKind {
    Src = 0x0,
    Dst = 0x1,
    Imm = 0x2,
    Unknown = 0x3,
}

/// VectorSize
/// => Number of operands defined in the operands block
/// Stored zero-indexed (i.e. 0 encodes a vector of one operand)
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct VectorSize(pub u8);
impl VectorSize {
    /// Create vector size with the correct encoding
    /// Panics if `len` is 0
    pub fn new(len: u8) -> Self {
        assert!(len != 0, "Empty vector couldn't be encoded");
        Self(len - 1)
    }
}

/// OperandSize
/// => Number of valid digit in each operand block
/// Stored zero-indexed (i.e. 0 encodes one digit)
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct OperandBlock(pub u8);
impl OperandBlock {
    /// Create operand block with the correct encoding
    /// Panics if `width` is 0
    pub fn new(width: u8) -> Self {
        assert!(width != 0, "Empty block couldn't be encoded");
        Self(width - 1)
    }
}

/// Ciphertext vectorized operands with extra parsing flags
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Operand {
    pub base_cid: CtId,
    pub block: OperandBlock,
    pub vec_size: VectorSize,
    pub is_last: bool,
    pub kind: OperandKind,
}

impl Operand {
    /// Build an operand from one-indexed `block` and `vec_size`
    /// `kind` defaults to `OperandKind::Unknown`, `is_last` is always cleared
    /// Panics if `block` or `vec_size` is 0
    pub(crate) fn new(block: u8, base_cid: u16, vec_size: u8, kind: Option<OperandKind>) -> Self {
        Self {
            kind: kind.unwrap_or(OperandKind::Unknown),
            is_last: false,
            vec_size: VectorSize::new(vec_size),
            block: OperandBlock::new(block),
            base_cid: CtId(base_cid),
        }
    }
}

/// Create a dedicated type for a collection of Operand
/// This is to enable trait implementation on it (c.f arg)
#[derive(Debug, Clone)]
pub struct OperandBundle(Vec<Operand>);

impl OperandBundle {
    /// Tag every operand of the bundle with `kind`
    /// Panics if `kind` is `OperandKind::Imm`
    pub(crate) fn set_kind(&mut self, kind: OperandKind) {
        assert!(
            kind != OperandKind::Imm,
            "OperandBundle couldn't be tagged as Imm"
        );
        self.0.iter_mut().for_each(|op| op.kind = kind);
    }
}

impl From<Vec<Operand>> for OperandBundle {
    fn from(inner: Vec<Operand>) -> Self {
        let mut inner = inner;
        // Enforce correct is_last handling
        inner.iter_mut().for_each(|op| op.is_last = false);
        if let Some(last) = inner.last_mut() {
            last.is_last = true;
        }
        Self(inner)
    }
}

impl std::ops::Deref for OperandBundle {
    type Target = Vec<Operand>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl OperandBundle {
    /// Parse operands from `stream` until one flagged as last is found
    /// Returns the bundle and the number of consumed words
    #[tracing::instrument(level = "trace", ret)]
    pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> {
        // Keep track of the current peak index
        let mut peak_words = 0;

        let mut op_list = Vec::new();
        loop {
            let op = if let Some(op_word) = stream.get(peak_words) {
                peak_words += 1;
                Operand::from(&fmt::OperandHex::from_bits(*op_word))
            } else {
                // Stream ended before the last operand
                return Err(HexParsingError::EmptyStream);
            };
            op_list.push(op);
            if op.is_last {
                break;
            }
        }
        Ok((Self(op_list), peak_words))
    }
    /// Serialize the bundle, one word per operand
    #[tracing::instrument(level = "trace", ret)]
    pub fn to_words(&self) -> Vec<IOpWordRepr> {
        self.0
            .iter()
            .map(|op| fmt::OperandHex::from(op).into_bits())
            .collect::<Vec<_>>()
    }
}

// Immediate operands
// ------------------------------------------------------------------------------------------------
/// Immediat Size
/// => Number of valid digit in following immediat
/// To obtain the number of valid bits, user should multiply by the msg_width
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ImmBlock(pub u16);

/// Immediat header
/// Use to implement top-level parser manually
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct ImmediatHeader {
    pub(super) lsb_msg: u16,
    pub(super) block: ImmBlock,
    pub(super) is_last: bool,
    pub(super) kind: OperandKind,
}

/// Full Immediat representation (i.e. header + data)
/// `msg` stores the value as little-endian 16bits words
#[derive(Debug, Clone, PartialEq)]
pub struct Immediat {
    pub(super) kind: OperandKind,
    pub(super) is_last: bool,
    pub(super) block: ImmBlock,
    pub(super) msg: Vec<u16>,
}

impl Immediat {
    /// Access imm msg for template patching
    /// Extract the correct block (i.e. MSG_WIDTH chunk)
    /// Blocks outside of the stored words read as 0
    pub fn msg_block(&self, bid: u8) -> u16 {
        let word_id = bid as u32 / (u16::BITS / MSG_WIDTH as u32);
        let block_id = bid as u32 % (u16::BITS / MSG_WIDTH as u32);
        if let Some(word) = self.msg.get(word_id as usize) {
            (word >> (block_id * MSG_WIDTH as u32)) & ((1 << MSG_WIDTH) - 1)
        } else {
            0
        }
    }

    /// Build an immediat from a constant
    /// Only the words required to hold the significant blocks are kept
    pub fn from_cst(cst: u128) -> Self {
        let mut u16_cst = cst
            .to_le_bytes()
            .chunks(2)
            .map(|x| u16::from_le_bytes(x.try_into().unwrap()))
            .collect::<Vec<_>>();

        // Count the number of MSG_WIDTH blocks required to hold the constant
        let block = {
            let mut remaining = cst;
            let mut block = 0;
            while remaining != 0 {
                block += 1;
                remaining >>= MSG_WIDTH;
            }
            ImmBlock(block)
        };

        // Shrink to fit
        let msg_word = usize::div_ceil(block.0 as usize * MSG_WIDTH as usize, u16::BITS as usize);
        u16_cst.resize(msg_word, 0);

        Self {
            kind: OperandKind::Imm,
            is_last: false,
            block,
            msg: u16_cst,
        }
    }
    /// Rebuild the constant from the little-endian 16bits words
    pub fn cst_value(&self) -> u128 {
        self.msg
            .iter()
            .enumerate()
            .map(|(pos, val)| (*val as u128) << (8 * std::mem::size_of::<u16>() * pos))
            .sum::<u128>()
    }
}

impl Immediat {
    /// Parse one immediat (header + data words) from `stream`
    /// Returns the immediat and the number of consumed words
    #[tracing::instrument(level = "trace", ret)]
    pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> {
        // Keep track of the current peak index
        let mut peak_words = 0;

        // 1. Parse header
        let header = if let Some(header_word) = stream.get(peak_words) {
            peak_words += 1;
            ImmediatHeader::from(&fmt::ImmediatHeaderHex::from_bits(*header_word))
        } else {
            return Err(HexParsingError::EmptyStream);
        };

        // Check flags
        if header.kind != OperandKind::Imm {
            return Err(HexParsingError::Kind(format!(
                "Get {:?} instead of {:?}",
                header.kind,
                OperandKind::Imm
            )));
        }

        // Get associated value:
        let mut le_msg = vec![header.lsb_msg];

        // Number of 16bits words required to hold the valid digits
        let data_word = usize::div_ceil(
            header.block.0 as usize * MSG_WIDTH as usize,
            8 * (std::mem::size_of::<IOpWordRepr>() / std::mem::size_of::<u16>()),
        );

        // NB: First imm word is encoded in the header
        for _w in 0..(data_word / 2) {
            if let Some(word) = stream.get(peak_words) {
                peak_words += 1;
                let u16_words = word
                    .to_le_bytes()
                    .chunks(2)
                    .map(|x| u16::from_le_bytes(x.try_into().unwrap()))
                    .collect::<Vec<_>>();
                le_msg.extend_from_slice(u16_words.as_slice());
            } else {
                return Err(HexParsingError::EmptyStream);
            }
        }

        // Drop the padding half-word of the last stream word (and the header word of
        // an empty immediat) so that msg matches what `from_cst` generates.
        // Without this, a full width immediat has one word too many and `cst_value`
        // shifts out of the u128 range.
        le_msg.truncate(data_word);

        Ok((
            Self {
                kind: header.kind,
                is_last: header.is_last,
                block: header.block,
                msg: le_msg,
            },
            peak_words,
        ))
    }

    /// Serialize as one header word (holding the first 16bits of msg)
    /// followed by the remaining msg packed two per word
    pub fn to_words(&self) -> Vec<IOpWordRepr> {
        let mut words = Vec::new();
        let header = ImmediatHeader {
            lsb_msg: *self.msg.first().unwrap_or(&0),
            block: self.block,
            is_last: self.is_last,
            kind: self.kind,
        };
        words.push(fmt::ImmediatHeaderHex::from(&header).into_bits());

        if self.msg.len() > 1 {
            for imm in self.msg[1..]
                .chunks(std::mem::size_of::<IOpWordRepr>() / std::mem::size_of_val(&self.msg[0]))
            {
                let imm_word = match imm.len() {
                    1 => IOpWordRepr::from(imm[0]),
                    2 => IOpWordRepr::from(
                        imm[0] as IOpWordRepr + ((imm[1] as IOpWordRepr) << u16::BITS),
                    ),
                    _ => panic!("Unsupported chunks, IOpWordRepr has been changed"),
                };
                words.push(imm_word);
            }
        }
        words
    }
}

/// Create a dedicated type for a collection of Immediat
/// This is to enable trait implementation on it (c.f arg)
#[derive(Debug, Clone)]
pub struct ImmBundle(Vec<Immediat>);

impl ImmBundle {
    /// Parse immediats from `stream` until one flagged as last is found
    /// Returns the bundle and the number of consumed words
    #[tracing::instrument(level = "trace", ret)]
    pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> {
        // Keep track of the current peak index
        let mut peak_words = 0;

        let mut imm_list = Vec::new();
        loop {
            let (imm, peaked) = Immediat::from_words(&stream[peak_words..])?;
            peak_words += peaked;

            let is_last = imm.is_last;
            imm_list.push(imm);
            if is_last {
                break;
            }
        }
        Ok((Self(imm_list), peak_words))
    }
    /// Serialize every immediat back to back
    #[tracing::instrument(level = "trace", ret)]
    pub fn to_words(&self) -> Vec<IOpWordRepr> {
        self.0
            .iter()
            .flat_map(|imm| imm.to_words())
            .collect::<Vec<_>>()
    }
}

impl From<Vec<Immediat>> for ImmBundle {
    #[tracing::instrument(level = "trace", ret)]
    fn from(inner: Vec<Immediat>) -> Self {
        let mut inner = inner;
        // Enforce correct is_last handling
        inner.iter_mut().for_each(|op| op.is_last = false);
        if let Some(last) = inner.last_mut() {
            last.is_last = true;
        }
        Self(inner)
    }
}

impl std::ops::Deref for ImmBundle {
    type Target = Vec<Immediat>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

// IOp header
// ------------------------------------------------------------------------------------------------
/// Opcode
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub struct IOpcode(pub u8);

/// Firmware mode of the IOp (static or dynamic)
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum FwMode {
    Static = 0x0,
    Dynamic = 0x1,
}

/// IOpHeader 
to enable trait implementation on it (c.f arg) +#[derive(Debug, Clone)] +pub struct OperandBundle(Vec); + +impl OperandBundle { + pub(crate) fn set_kind(&mut self, kind: OperandKind) { + assert!( + kind != OperandKind::Imm, + "OperandBundle couldn't be tagged as Imm" + ); + self.0.iter_mut().for_each(|op| op.kind = kind); + } +} + +impl From> for OperandBundle { + fn from(inner: Vec) -> Self { + let mut inner = inner; + // Enforce correct is_last handling + inner.iter_mut().for_each(|op| op.is_last = false); + if let Some(last) = inner.last_mut() { + last.is_last = true; + } + Self(inner) + } +} + +impl std::ops::Deref for OperandBundle { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl OperandBundle { + #[tracing::instrument(level = "trace", ret)] + pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> { + // Keep track of the current peak index + let mut peak_words = 0; + + let mut op_list = Vec::new(); + loop { + let op = if let Some(op_word) = stream.get(peak_words) { + peak_words += 1; + Operand::from(&fmt::OperandHex::from_bits(*op_word)) + } else { + return Err(HexParsingError::EmptyStream); + }; + op_list.push(op); + if op.is_last { + break; + } + } + Ok((Self(op_list), peak_words)) + } + #[tracing::instrument(level = "trace", ret)] + pub fn to_words(&self) -> Vec { + self.0 + .iter() + .map(|op| fmt::OperandHex::from(op).into_bits()) + .collect::>() + } +} + +// Immediate operands +// ------------------------------------------------------------------------------------------------ +/// Immediat Size +/// => Number of valid digit in following immediat +/// To obtain the number of valid bits, user should multiply by the msg_width +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct ImmBlock(pub u16); + +/// Immediat header +/// Use to implement top-level parser manually +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct ImmediatHeader { + pub(super) lsb_msg: u16, + pub(super) block: 
ImmBlock, + pub(super) is_last: bool, + pub(super) kind: OperandKind, +} + +/// Full Immediat representation (i.e. header + data) +#[derive(Debug, Clone, PartialEq)] +pub struct Immediat { + pub(super) kind: OperandKind, + pub(super) is_last: bool, + pub(super) block: ImmBlock, + pub(super) msg: Vec, +} + +impl Immediat { + /// Access imm msg for template patching + /// Extract the correct block (i.e. MSG_WIDTH chunk) + pub fn msg_block(&self, bid: u8) -> u16 { + let word_id = bid as u32 / (u16::BITS / MSG_WIDTH as u32); + let block_id = bid as u32 % (u16::BITS / MSG_WIDTH as u32); + if let Some(word) = self.msg.get(word_id as usize) { + (word >> (block_id * MSG_WIDTH as u32)) & ((1 << MSG_WIDTH) - 1) + } else { + 0 + } + } + + pub fn from_cst(cst: u128) -> Self { + let mut u16_cst = cst + .to_le_bytes() + .chunks(2) + .map(|x| u16::from_le_bytes(x.try_into().unwrap())) + .collect::>(); + + let mut cst = cst; + let block = { + let mut block = 0; + while cst != 0 { + block += 1; + cst >>= 2; + } + ImmBlock(block) + }; + + // Shrink to fit + let msg_word = usize::div_ceil(block.0 as usize * MSG_WIDTH as usize, u16::BITS as usize); + u16_cst.resize(msg_word, 0); + + Self { + kind: OperandKind::Imm, + is_last: false, + block, + msg: u16_cst, + } + } + pub fn cst_value(&self) -> u128 { + self.msg + .iter() + .enumerate() + .map(|(pos, val)| (*val as u128) << (8 * std::mem::size_of::() * pos)) + .sum::() + } +} + +impl Immediat { + #[tracing::instrument(level = "trace", ret)] + pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> { + // Keep track of the current peak index + let mut peak_words = 0; + + // 1. 
Parse header + let header = if let Some(header_word) = stream.get(peak_words) { + peak_words += 1; + ImmediatHeader::from(&fmt::ImmediatHeaderHex::from_bits(*header_word)) + } else { + return Err(HexParsingError::EmptyStream); + }; + + // Check flags + if header.kind != OperandKind::Imm { + return Err(HexParsingError::Kind(format!( + "Get {:?} instead of {:?}", + header.kind, + OperandKind::Imm + ))); + } + + // Get associated value: + let mut le_msg = vec![header.lsb_msg]; + + let data_word = usize::div_ceil( + header.block.0 as usize * MSG_WIDTH as usize, + 8 * (std::mem::size_of::() / std::mem::size_of::()), + ); + + // NB: First imm word is encoded in the header + for _w in 0..(data_word / 2) { + if let Some(word) = stream.get(peak_words) { + peak_words += 1; + let u16_words = word + .to_le_bytes() + .chunks(2) + .map(|x| u16::from_le_bytes(x.try_into().unwrap())) + .collect::>(); + le_msg.extend_from_slice(u16_words.as_slice()); + } else { + return Err(HexParsingError::EmptyStream); + } + } + + Ok(( + Self { + kind: header.kind, + is_last: header.is_last, + block: header.block, + msg: le_msg, + }, + peak_words, + )) + } + + pub fn to_words(&self) -> Vec { + let mut words = Vec::new(); + let header = ImmediatHeader { + lsb_msg: *self.msg.first().unwrap_or(&0), + block: self.block, + is_last: self.is_last, + kind: self.kind, + }; + words.push(fmt::ImmediatHeaderHex::from(&header).into_bits()); + + if self.msg.len() > 1 { + for imm in self.msg[1..] 
+ .chunks(std::mem::size_of::() / std::mem::size_of_val(&self.msg[0])) + { + let imm_word = match imm.len() { + 1 => IOpWordRepr::from(imm[0]), + 2 => IOpWordRepr::from( + imm[0] as IOpWordRepr + ((imm[1] as IOpWordRepr) << u16::BITS), + ), + _ => panic!("Unsupported chunks, IOpWordRepr has been changed"), + }; + words.push(imm_word); + } + } + words + } +} + +/// Create a dedicated type for a collection of Immediat +/// This is to enable trait implementation on it (c.f arg) +#[derive(Debug, Clone)] +pub struct ImmBundle(Vec); + +impl ImmBundle { + #[tracing::instrument(level = "trace", ret)] + pub fn from_words(stream: &[IOpWordRepr]) -> Result<(Self, usize), HexParsingError> { + // Keep track of the current peak index + let mut peak_words = 0; + + let mut imm_list = Vec::new(); + loop { + let (imm, peaked) = Immediat::from_words(&stream[peak_words..])?; + peak_words += peaked; + + let is_last = imm.is_last; + imm_list.push(imm); + if is_last { + break; + } + } + Ok((Self(imm_list), peak_words)) + } + #[tracing::instrument(level = "trace", ret)] + pub fn to_words(&self) -> Vec { + self.0 + .iter() + .flat_map(|imm| imm.to_words()) + .collect::>() + } +} + +impl From> for ImmBundle { + #[tracing::instrument(level = "trace", ret)] + fn from(inner: Vec) -> Self { + let mut inner = inner; + // Enforce correct is_last handling + inner.iter_mut().for_each(|op| op.is_last = false); + if let Some(last) = inner.last_mut() { + last.is_last = true; + } + Self(inner) + } +} + +impl std::ops::Deref for ImmBundle { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +// IOp header +// ------------------------------------------------------------------------------------------------ +/// Opcode +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct IOpcode(pub u8); + +/// Type of the operands +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum FwMode { + Static = 0x0, + Dynamic = 0x1, +} + +/// IOpHeader 
+#[derive(Debug, Clone, Copy, PartialEq)] +pub struct IOpHeader { + pub(super) src_align: OperandBlock, + pub(super) dst_align: OperandBlock, + pub(super) opcode: IOpcode, + pub(super) has_imm: bool, + pub(super) fw_mode: FwMode, +} + +/// Gather all subparts together +#[derive(Debug, Clone)] +pub struct IOp { + pub(super) header: IOpHeader, + pub(super) dst: OperandBundle, + pub(super) src: OperandBundle, + pub(super) imm: ImmBundle, +} +use std::collections::VecDeque; + +/// Implement construction +/// Used to construct IOp from Backend HpuVar +impl IOp { + pub fn new(opcode: IOpcode, dst: Vec, src: Vec, imm: Vec) -> Self { + let dst_align = dst.iter().map(|x| x.block).max().unwrap(); + let src_align = src.iter().map(|x| x.block).max().unwrap(); + let has_imm = !imm.is_empty(); + + let header = IOpHeader { + src_align, + dst_align, + opcode, + has_imm, + fw_mode: FwMode::Static, + }; + Self { + header, + dst: dst.into(), + src: src.into(), + imm: imm.into(), + } + } + + pub fn opcode(&self) -> IOpcode { + self.header.opcode + } + pub fn asm_opcode(&self) -> AsmIOpcode { + self.header.opcode.into() + } + + // Compute associated fw block size + // Used to compute fw_entry offset and fw translation validity + pub fn fw_blk_width(&self) -> usize { + std::cmp::max(self.header.dst_align.0, self.header.src_align.0) as usize + } + + // Compute fw table entry + pub fn fw_entry(&self) -> usize { + self.fw_blk_width() * 0x100 + self.header.opcode.0 as usize + } + pub fn dst(&self) -> &OperandBundle { + &self.dst + } + pub fn src(&self) -> &OperandBundle { + &self.src + } + pub fn imm(&self) -> &ImmBundle { + &self.imm + } +} +/// Implement parsing logic from stream of word +/// Only consume the VecDeque on Success +impl IOp { + #[tracing::instrument(level = "trace", ret)] + pub fn from_words(stream: &mut VecDeque) -> Result { + // Keep track of the current peak index + let mut peak_words = 0; + + // Enforce contiguous for ease of addressing in the queue + 
stream.make_contiguous(); + + // 1. Parse header + let header = if let Some(header_word) = stream.get(peak_words) { + peak_words += 1; + IOpHeader::from(&fmt::IOpHeaderHex::from(*header_word)) + } else { + return Err(HexParsingError::EmptyStream); + }; + + // 2. Parse Destination operands + let dst = { + let (dst, peaked) = OperandBundle::from_words(&stream.as_slices().0[peak_words..])?; + for op in dst.iter() { + // Check flags + if op.kind != OperandKind::Dst { + return Err(HexParsingError::Kind(format!( + "Get {:?} instead of {:?}", + op.kind, + OperandKind::Dst + ))); + } + if op.block > header.dst_align { + return Err(HexParsingError::Kind(format!( + "Get {:?} > {:?}", + op.block, header.dst_align + ))); + } + } + peak_words += peaked; + dst + }; + + // 3. Parse Source operands + let src = { + let (src, peaked) = OperandBundle::from_words(&stream.as_slices().0[peak_words..])?; + for op in src.iter() { + // Check flags + if op.kind != OperandKind::Src { + return Err(HexParsingError::Kind(format!( + "Get {:?} instead of {:?}", + op.kind, + OperandKind::Src + ))); + } + if op.block > header.src_align { + return Err(HexParsingError::Kind(format!( + "Get {:?} > {:?}", + op.block, header.src_align + ))); + } + } + peak_words += peaked; + src + }; + + // 4. Parse Immediat [Optional] + let (imm, peaked) = if header.has_imm { + ImmBundle::from_words(&stream.as_slices().0[peak_words..])? + } else { + (ImmBundle(Vec::new()), 0) + }; + peak_words += peaked; + + // Successful extraction from the dequeue + // Consume the associated words + stream.drain(0..peak_words); + + Ok(Self { + header, + dst, + src, + imm, + }) + } + + #[tracing::instrument(level = "trace", ret)] + pub fn to_words(&self) -> Vec { + let mut words = Vec::new(); + // 1. Header + words.push(fmt::IOpHeaderHex::from(&self.header).into_bits()); + // 2. Destination + words.extend(self.dst.to_words()); + // 3. Sources + words.extend(self.src.to_words()); + // 4. 
Immediat + words.extend(self.imm.to_words()); + words + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/iop/fmt.rs b/backends/tfhe-hpu-backend/src/asm/iop/fmt.rs new file mode 100644 index 000000000..fe3baf7e6 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/fmt.rs @@ -0,0 +1,154 @@ +//! +//! Define binary format encoding of IOp instructions +//! Rely on `bitfield_struct` crate to define bit-accurate insn format +//! and some manual From/To implementation to move to internal type +use crate::asm::CtId; +use bitfield_struct::bitfield; + +use super::*; + +// Define type alias for underlying native type. +// NB: Currently bitfield don't support type alias and thus we use native type instead +pub type IOpWordRepr = u32; +pub type IOpRepr = Vec; + +#[bitfield(u32)] +pub struct OperandHex { + #[bits(16)] + base_cid: u16, + #[bits(8)] + block: u8, + #[bits(5)] + vec_size: u8, + #[bits(1)] + is_last: bool, + #[bits(2)] + kind: u8, +} + +impl From<&OperandHex> for field::Operand { + fn from(value: &OperandHex) -> Self { + let kind = if value.kind() == OperandKind::Src as u8 { + OperandKind::Src + } else if value.kind() == OperandKind::Dst as u8 { + OperandKind::Dst + } else if value.kind() == OperandKind::Imm as u8 { + OperandKind::Imm + } else { + OperandKind::Unknown + }; + + Self { + base_cid: CtId(value.base_cid()), + block: field::OperandBlock(value.block()), + vec_size: field::VectorSize(value.vec_size()), + is_last: value.is_last(), + kind, + } + } +} + +impl From<&Operand> for OperandHex { + fn from(value: &Operand) -> Self { + Self::new() + .with_base_cid(value.base_cid.0) + .with_block(value.block.0) + .with_vec_size(value.vec_size.0) + .with_is_last(value.is_last) + .with_kind(value.kind as u8) + } +} + +#[bitfield(u32)] +pub struct ImmediatHeaderHex { + #[bits(16)] + lsb_msg: u16, + #[bits(12)] + block: u16, + #[bits(1)] + is_last: bool, + #[bits(1)] + _reserved: u8, + #[bits(2)] + kind: u8, +} + +impl From<&ImmediatHeaderHex> for 
field::ImmediatHeader { + fn from(value: &ImmediatHeaderHex) -> Self { + let kind = if value.kind() == OperandKind::Src as u8 { + OperandKind::Src + } else if value.kind() == OperandKind::Dst as u8 { + OperandKind::Dst + } else if value.kind() == OperandKind::Imm as u8 { + OperandKind::Imm + } else { + OperandKind::Unknown + }; + + Self { + lsb_msg: value.lsb_msg(), + block: field::ImmBlock(value.block()), + is_last: value.is_last(), + kind, + } + } +} + +impl From<&field::ImmediatHeader> for ImmediatHeaderHex { + fn from(value: &field::ImmediatHeader) -> Self { + Self::new() + .with_lsb_msg(value.lsb_msg) + .with_block(value.block.0) + .with_is_last(value.is_last) + .with_kind(value.kind as u8) + } +} + +#[bitfield(u32)] +pub struct IOpHeaderHex { + #[bits(8)] + src_align: u8, + #[bits(8)] + dst_align: u8, + #[bits(8)] + opcode: u8, + #[bits(1)] + has_imm: bool, + #[bits(1)] + fw_mode: bool, + #[bits(6)] + _reserved: u8, +} + +impl From<&IOpHeaderHex> for field::IOpHeader { + fn from(value: &IOpHeaderHex) -> Self { + let fw_mode = match value.fw_mode() { + true => field::FwMode::Dynamic, + false => field::FwMode::Static, + }; + + Self { + src_align: field::OperandBlock(value.src_align()), + dst_align: field::OperandBlock(value.dst_align()), + opcode: field::IOpcode(value.opcode()), + has_imm: value.has_imm(), + fw_mode, + } + } +} + +impl From<&field::IOpHeader> for IOpHeaderHex { + fn from(value: &field::IOpHeader) -> Self { + let fw_mode = match value.fw_mode { + field::FwMode::Dynamic => true, + field::FwMode::Static => false, + }; + + Self::new() + .with_src_align(value.src_align.0) + .with_dst_align(value.dst_align.0) + .with_opcode(value.opcode.0) + .with_has_imm(value.has_imm) + .with_fw_mode(fw_mode) + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/iop/iop_macro.rs b/backends/tfhe-hpu-backend/src/asm/iop/iop_macro.rs new file mode 100644 index 000000000..6a19d4dcf --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/iop_macro.rs @@ -0,0 +1,61 @@ 
+//! IOp mapping +//! +//! IOp currently share one format. +//! Some of them (upper 128) are handled by the fw and are named, the other one is for custom user +//! entries. + +#[macro_export] +macro_rules! iop { + ( + $([ $proto: ident -> $asm: literal, $opcode: expr] $(,)?)* + ) => { + ::paste::paste! { + /// Parser utilities + /// Hashmap for Name -> to (Opcode, (src, imm, dst)) + pub(crate) struct IOpFromArg { + pub(crate) asm: HashMap, + pub(crate) hex: HashMap, + } + lazy_static! { + pub(crate) static ref IOP_LUT: IOpFromArg = { + + let mut iop_from_arg = IOpFromArg{ + asm: HashMap::new(), + hex: HashMap::new(), + }; + + $( + let iop_format = IOpFormat{ + name: stringify!([< $asm:upper >]).to_string(), + opcode: IOpcode($opcode), + proto: $proto.clone().into() + }; + iop_from_arg.asm.insert(stringify!([< $asm:upper >]).to_string(), iop_format.clone()); + iop_from_arg.hex.insert(IOpcode($opcode), iop_format); + )* + iop_from_arg + }; + } + // Export each AsmIOpCode as constant + $( + lazy_static! { + pub static ref [< IOP_ $asm:upper >]: AsmIOpcode = { + AsmIOpcode{opcode: IOpcode($opcode), format: Some(IOpFormat{ + name: stringify!([< $asm:upper >]).to_string(), + opcode: IOpcode($opcode), + proto: $proto.clone().into() + })} + }; + } + )* + + lazy_static! { + pub static ref IOP_LIST: Vec = vec![ $(AsmIOpcode{opcode: IOpcode($opcode), format: Some(IOpFormat{ + name: stringify!([< $asm:upper >]).to_string(), + opcode: IOpcode($opcode), + proto: $proto.clone().into() + })},)*]; + } + } + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/iop/mod.rs b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs new file mode 100644 index 000000000..753778a84 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/mod.rs @@ -0,0 +1,184 @@ +//! +//! 
IOp definition + +mod field; +pub use field::{HexParsingError, IOp, IOpcode, Immediat, Operand, OperandKind}; +mod fmt; +pub use fmt::{IOpRepr, IOpWordRepr}; +mod iop_macro; +pub mod opcode; + +mod arg; +pub use arg::{AsmIOpcode, ParsingError}; + +// TODO find a proper way to let this runtime properties +pub const MSG_WIDTH: u8 = 2; +pub const CARRY_WIDTH: u8 = 2; + +/// Enum used to define a variable size relative to current integer width +#[derive(Debug, Eq, PartialEq, Clone, Copy, serde::Serialize, serde::Deserialize)] +pub enum VarMode { + Native, + Half, + Bool, +} + +/// Implement FromString trait to enable parsing from CLI +impl std::str::FromStr for VarMode { + type Err = ParsingError; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "n" | "nat" | "native" => Ok(VarMode::Native), + "h" | "half" => Ok(VarMode::Half), + "b" | "bool" => Ok(VarMode::Bool), + _ => Err(ParsingError::InvalidArg(format!("Invalid VarMode: {s}"))), + } + } +} + +/// Struct used to depict IOp prototype with clarity +#[derive(Debug, Clone)] +pub struct ConstIOpProto { + pub dst: [VarMode; D], + pub src: [VarMode; S], + pub imm: usize, +} + +/// Dynamic type to erase const template +// TODO moved from runtime check to compile time one +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct IOpProto { + pub dst: Vec, + pub src: Vec, + pub imm: usize, +} + +impl From> for IOpProto { + fn from(const_val: ConstIOpProto) -> Self { + Self { + dst: const_val.dst.into(), + src: const_val.src.into(), + imm: const_val.imm, + } + } +} + +/// Implement FromString trait to enable parsing from CLI +impl std::str::FromStr for IOpProto { + type Err = ParsingError; + + fn from_str(s: &str) -> Result { + lazy_static! 
{ + static ref PROTO_ARG_RE: regex::Regex = + regex::Regex::new(r"<(?[\w\s,]+)>::<(?[\w\s,]*)><(?\d+)>") + .expect("Invalid regex"); + } + if let Some(caps) = PROTO_ARG_RE.captures(s) { + let dst = if let Some(dst_raw) = caps.name("dst") { + dst_raw + .as_str() + .split(',') + .map(|x| x.trim().parse()) + .collect::, ParsingError>>() + } else { + Err(ParsingError::Unmatch( + "Invalid IOpProto: Missing dst field (e.g. ".to_string(), + )) + }?; + + let src = if let Some(src_raw) = caps.name("src") { + src_raw + .as_str() + .split(',') + .map(|x| x.trim().parse()) + .collect::, ParsingError>>() + } else { + Err(ParsingError::Unmatch( + "Invalid IOpProto: Missing src field (e.g. " + .to_string(), + )) + }?; + let imm = if let Some(imm_raw) = caps.name("imm") { + imm_raw + .as_str() + .parse::() + .map_err(|err| ParsingError::InvalidArg(err.to_string())) + } else { + Err(ParsingError::Unmatch( + "Invalid IOpProto: Missing imm field (e.g. <2>".to_string(), + )) + }?; + + Ok(IOpProto { dst, src, imm }) + } else { + Err(ParsingError::Unmatch(format!( + "Invalid IOpProto format {s}" + ))) + } + } +} + +// Define some common iop format +pub const IOP_CT_F_CT: ConstIOpProto<1, 1> = ConstIOpProto { + dst: [VarMode::Native; 1], + src: [VarMode::Native; 1], + imm: 0, +}; +pub const IOP_CT_F_2CT: ConstIOpProto<1, 2> = ConstIOpProto { + dst: [VarMode::Native; 1], + src: [VarMode::Native; 2], + imm: 0, +}; +pub const IOP_CT_F_2CT_BOOL: ConstIOpProto<1, 3> = ConstIOpProto { + dst: [VarMode::Native; 1], + src: [VarMode::Native, VarMode::Native, VarMode::Bool], + imm: 0, +}; +pub const IOP_CT_F_CT_BOOL: ConstIOpProto<1, 2> = ConstIOpProto { + dst: [VarMode::Native; 1], + src: [VarMode::Native, VarMode::Bool], + imm: 0, +}; +pub const IOP_CT_F_CT_SCALAR: ConstIOpProto<1, 1> = ConstIOpProto { + dst: [VarMode::Native; 1], + src: [VarMode::Native; 1], + imm: 1, +}; +pub const IOP_CMP: ConstIOpProto<1, 2> = ConstIOpProto { + dst: [VarMode::Bool; 1], + src: [VarMode::Native; 2], + imm: 0, 
+}; +pub const IOP_2CT_F_3CT: ConstIOpProto<2, 3> = ConstIOpProto { + dst: [VarMode::Native; 2], + src: [VarMode::Native; 3], + imm: 0, +}; + +use crate::iop; +use arg::IOpFormat; +use lazy_static::lazy_static; +use std::collections::HashMap; +iop!( + [IOP_CT_F_CT_SCALAR -> "ADDS", opcode::ADDS], + [IOP_CT_F_CT_SCALAR -> "SUBS", opcode::SUBS], + [IOP_CT_F_CT_SCALAR -> "SSUB", opcode::SSUB], + [IOP_CT_F_CT_SCALAR -> "MULS", opcode::MULS], + [IOP_CT_F_2CT -> "ADD", opcode::ADD], + [IOP_CT_F_2CT -> "SUB", opcode::SUB], + [IOP_CT_F_2CT -> "MUL", opcode::MUL], + [IOP_CT_F_2CT -> "BW_AND", opcode::BW_AND], + [IOP_CT_F_2CT -> "BW_OR", opcode::BW_OR], + [IOP_CT_F_2CT -> "BW_XOR", opcode::BW_XOR], + [IOP_CMP -> "CMP_GT", opcode::CMP_GT], + [IOP_CMP -> "CMP_GTE", opcode::CMP_GTE], + [IOP_CMP -> "CMP_LT", opcode::CMP_LT], + [IOP_CMP -> "CMP_LTE", opcode::CMP_LTE], + [IOP_CMP -> "CMP_EQ", opcode::CMP_EQ], + [IOP_CMP -> "CMP_NEQ", opcode::CMP_NEQ], + [IOP_CT_F_CT_BOOL -> "IF_THEN_ZERO", opcode::IF_THEN_ZERO], + [IOP_CT_F_2CT_BOOL -> "IF_THEN_ELSE", opcode::IF_THEN_ELSE], + [IOP_2CT_F_3CT -> "ERC_20", opcode::ERC_20], + [IOP_CT_F_CT -> "MEMCPY", opcode::MEMCPY], +); diff --git a/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs new file mode 100644 index 000000000..501cbe0db --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/iop/opcode.rs @@ -0,0 +1,61 @@ +//! +//! Define hex encoding for a subset of known IOp +//! NB: Start from highest IOpcode to reduce the likelihood to clash with user custom operation on +//! extensions +//! +//! Current Opcode space could be viewed as follow: +//! | Range | Categories | +//! | ---------- | ------------------------- | +//! | 0x00.. 0x7f| User custom operations | +//! | 0x80.. 0xff| Fw generated operations | +//! | 0b1xyz_0000| x: Ct x Ct Operation | +//! | | !x: Ct x Imm Operation | +//! | | y!z: ARITH operations | +//! | | !yz: BW operations | +//! | | !y!z: CMP operations | +//! 
| ---------- | ------------------------- | + +pub const USER_RANGE_LB: u8 = 0x0; +pub const USER_RANGE_UB: u8 = 0x7f; + +// Ct x Imm ------------------------------------------------------------------- +pub const ADDS: u8 = 0xA0; +pub const SUBS: u8 = 0xA1; +pub const SSUB: u8 = 0xA2; +pub const MULS: u8 = 0xA3; + +// Ct x Ct ------------------------------------------------------------------- +// Arith operations +pub const ADD: u8 = 0xE0; +pub const SUB: u8 = 0xE2; +pub const MUL: u8 = 0xE4; + +// BW operations +pub const BW_AND: u8 = 0xD0; +pub const BW_OR: u8 = 0xD1; +pub const BW_XOR: u8 = 0xD2; + +// Cmp operations +pub const CMP_GT: u8 = 0xC0; +pub const CMP_GTE: u8 = 0xC1; +pub const CMP_LT: u8 = 0xC2; +pub const CMP_LTE: u8 = 0xC3; +pub const CMP_EQ: u8 = 0xC4; +pub const CMP_NEQ: u8 = 0xC5; + +// Ternary operations +// IfThenZero -> Select or force to 0 +// Take 1Ct and a Boolean Ct as input +pub const IF_THEN_ZERO: u8 = 0xCA; +// IfThenElse -> Select operation +// Take 2Ct and a Boolean Ct as input +pub const IF_THEN_ELSE: u8 = 0xCB; + +// Custom algorithm +// ERC20 -> Found xfer algorithm +// 2Ct <- func(3Ct) +pub const ERC_20: u8 = 0x80; + +// Utility operations +// Used to handle real clone of ciphertext already uploaded in the Hpu memory +pub const MEMCPY: u8 = 0xFF; diff --git a/backends/tfhe-hpu-backend/src/asm/mod.rs b/backends/tfhe-hpu-backend/src/asm/mod.rs new file mode 100644 index 000000000..db5f1898d --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/mod.rs @@ -0,0 +1,334 @@ +pub mod dop; +pub use dop::arg::Arg as DOpArg; +pub use dop::{DOp, DigitParameters, ImmId, MemId, Pbs, PbsGid, PbsLut, RegId, ToHex}; +pub mod iop; +pub use iop::{AsmIOpcode, IOp, IOpProto, IOpcode, OperandKind}; + +use std::collections::VecDeque; +use std::io::{BufRead, Write}; + +pub const ASM_COMMENT_PREFIX: [char; 2] = [';', '#']; + +// Common type used in both DOp/IOp definition -------------------------------- +/// Ciphertext Id +/// On-board memory is viewed as 
an array of ciphertext, +/// Thus, instead of using bytes address, ct id is used +/// => Id of the first ciphertext of the vector +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)] +pub struct CtId(pub u16); + +// --------------------------------------------------------------------------- + +/// Simple test for Asm parsing +#[cfg(test)] +mod tests; + +/// Type to aggregate Op and header +/// Aim is to kept correct interleaving while parsing +#[derive(Debug, Clone)] +pub enum AsmOp { + Comment(String), + Stmt(Op), +} + +impl AsmOp { + pub fn to_flush(&mut self) { + if let AsmOp::Stmt(op) = self { + *op = op.to_flush(); + } + } +} + +impl std::fmt::Display for AsmOp { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + match self { + Self::Comment(c) => write!(f, "{}{c}", ASM_COMMENT_PREFIX[0]), + Self::Stmt(op) => write!(f, "{op}"), + } + } +} + +/// Generic struct to represent sequence of operations +/// Used to extract OP from ASM file +/// Work on any kind of Op that implement FromStr +#[derive(Debug, Clone)] +pub struct Program(Vec>); + +impl Default for Program { + fn default() -> Self { + Self(Vec::new()) + } +} + +impl Program { + pub fn new(ops: Vec>) -> Self { + Self(ops) + } + /// Push a new statement in the program + pub fn push_stmt(&mut self, op: Op) { + self.0.push(AsmOp::Stmt(op)) + } + /// Push a new statement in the program + /// Returns the position in which the statement was inserted + pub fn push_stmt_pos(&mut self, op: Op) -> usize { + let ret = self.0.len(); + self.0.push(AsmOp::Stmt(op)); + ret + } + /// Push a new comment in the program + pub fn push_comment(&mut self, comment: String) { + self.0.push(AsmOp::Comment(comment)) + } + + pub fn get_stmt_mut(&mut self, i: usize) -> &mut AsmOp { + &mut self.0[i] + } +} + +impl std::ops::Deref for Program { + type Target = Vec>; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::fmt::Display for Program { + fn fmt(&self, f: 
&mut std::fmt::Formatter) -> std::fmt::Result { + for op in self.0.iter() { + writeln!(f, "{op}")?; + } + Ok(()) + } +} + +impl Program +where + Op: std::str::FromStr, + Err: std::error::Error, +{ + /// Generic function to extract OP from ASM file + /// Work on any kind of Op that implement FromStr + pub fn read_asm(file: &str) -> Result { + // Open file + let rd_f = std::io::BufReader::new( + std::fs::OpenOptions::new() + .create(false) + .read(true) + .open(file)?, + ); + + let mut asm_ops = Vec::new(); + for (line, val) in rd_f.lines().map_while(Result::ok).enumerate() { + if let Some(comment) = val.trim().strip_prefix(ASM_COMMENT_PREFIX) { + asm_ops.push(AsmOp::Comment(comment.to_string())) + } else if !val.is_empty() { + match Op::from_str(&val) { + Ok(op) => asm_ops.push(AsmOp::Stmt(op)), + Err(err) => { + tracing::warn!("ReadAsm failed @{file}:{}", line + 1); + anyhow::bail!( + "ReadAsm failed @{file}:{} with {}", + line + 1, + err.to_string() + ); + } + } + } + } + Ok(Self(asm_ops)) + } +} + +impl Program +where + Op: std::fmt::Display, +{ + /// Generic function to write Op in ASM file + /// Work on any kind of Op that implement Display + pub fn write_asm(&self, file: &str) -> Result<(), anyhow::Error> { + // Create path + let path = std::path::Path::new(file); + if let Some(dir_p) = path.parent() { + std::fs::create_dir_all(dir_p).unwrap(); + } + + // Open file + let mut wr_f = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path)?; + + writeln!(wr_f, "{self}").map_err(anyhow::Error::new) + } +} + +// Implement dedicated hex parser/dumper for DOp +impl Program { + /// Generic function to extract OP from hex file + /// Work on any kind of Op that implement FromStr + pub fn read_hex(file: &str) -> Result { + // Open file + let rd_f = std::io::BufReader::new( + std::fs::OpenOptions::new() + .create(false) + .read(true) + .open(file) + .unwrap_or_else(|_| panic!("Invalid HEX file {file}")), + ); + + let mut prog = 
Self::default(); + for (line, val) in rd_f.lines().map_while(Result::ok).enumerate() { + if let Some(comment) = val.trim().strip_prefix(ASM_COMMENT_PREFIX) { + prog.push_comment(comment.to_string()); + } else { + let val_u32 = + dop::DOpRepr::from_str_radix(std::str::from_utf8(val.as_bytes()).unwrap(), 16)?; + match dop::DOp::from_hex(val_u32) { + Ok(op) => prog.push_stmt(op), + Err(err) => { + tracing::warn!("DOp::ReadHex failed @{file}:{}", line + 1); + return Err(err.into()); + } + } + } + } + Ok(prog) + } + + /// Generic function to write Op in Hex file + pub fn write_hex(&self, file: &str) -> Result<(), anyhow::Error> { + // Create path + let path = std::path::Path::new(file); + if let Some(dir_p) = path.parent() { + std::fs::create_dir_all(dir_p).unwrap(); + } + + // Open file + let mut wr_f = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path)?; + + for op in self.0.iter() { + match op { + AsmOp::Comment(comment) => writeln!(wr_f, "{}{}", ASM_COMMENT_PREFIX[0], comment)?, + AsmOp::Stmt(op) => writeln!(wr_f, "{:x}", op.to_hex())?, + } + } + Ok(()) + } +} + +impl Program { + /// Convert a program of Dops in translation table + pub fn tr_table(&self) -> Vec { + let ops_stream = self + .iter() + .filter_map(|op| match op { + AsmOp::Comment(_) => None, + AsmOp::Stmt(op) => Some(op), + }) + .collect::>(); + + let mut words_stream = Vec::with_capacity(ops_stream.len() + 1); + // First word of the stream is length in DOp + words_stream.push(ops_stream.len() as u32); + + ops_stream.iter().for_each(|op| { + words_stream.push(op.to_hex()); + }); + words_stream + } +} + +// Implement dedicated hex parser/dumper for IOp +impl Program { + /// Generic function to extract OP from hex file + pub fn read_hex(file: &str) -> Result { + // Open file + let rd_f = std::io::BufReader::new( + std::fs::OpenOptions::new() + .create(false) + .read(true) + .open(file) + .unwrap_or_else(|_| panic!("Invalid HEX file {file}")), + ); + + let mut prog = 
Self::default(); + // Buffer word stream. + // When comment token occurred, convert the word stream into IOp + // -> No comment could be inserted in a middle of IOp word stream + let mut word_stream = VecDeque::new(); + let mut file_len = 0; + + for val in rd_f.lines().map_while(Result::ok) { + file_len += 1; + if let Some(comment) = val.trim().strip_prefix(ASM_COMMENT_PREFIX) { + while !word_stream.is_empty() { + match iop::IOp::from_words(&mut word_stream) { + Ok(op) => prog.push_stmt(op), + Err(err) => { + tracing::warn!( + "IOp::ReadHex failed @{file}:{}", + file_len - word_stream.len() + ); + return Err(err.into()); + } + } + } + prog.push_comment(comment.to_string()); + } else { + let word = iop::IOpWordRepr::from_str_radix( + std::str::from_utf8(val.as_bytes()).unwrap(), + 16, + )?; + word_stream.push_back(word); + } + } + // Flush word stream + while !word_stream.is_empty() { + match iop::IOp::from_words(&mut word_stream) { + Ok(op) => prog.push_stmt(op), + Err(err) => { + tracing::warn!( + "IOp::ReadHex failed @{file}:{}", + file_len - word_stream.len() + ); + return Err(err.into()); + } + } + } + Ok(prog) + } + + /// Generic function to write Op in Hex file + pub fn write_hex(&self, file: &str) -> Result<(), anyhow::Error> { + // Create path + let path = std::path::Path::new(file); + if let Some(dir_p) = path.parent() { + std::fs::create_dir_all(dir_p).unwrap(); + } + + // Open file + let mut wr_f = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path)?; + + for op in self.0.iter() { + match op { + AsmOp::Comment(comment) => writeln!(wr_f, "{}{}", ASM_COMMENT_PREFIX[0], comment)?, + AsmOp::Stmt(op) => { + op.to_words() + .into_iter() + .try_for_each(|word| writeln!(wr_f, "{word:0>8x}"))?; + } + } + } + Ok(()) + } +} diff --git a/backends/tfhe-hpu-backend/src/asm/tests/dop.asm b/backends/tfhe-hpu-backend/src/asm/tests/dop.asm new file mode 100644 index 000000000..4346a5ba1 --- /dev/null +++ 
b/backends/tfhe-hpu-backend/src/asm/tests/dop.asm @@ -0,0 +1,31 @@ +; DOp Asm snippets sample that depicts all available format +; with the != available arguments modes +; Test LD with various template format +LD R1 @0x400 +LD R2 @386 +LD R3 TS[8].4 +LD R3 TD[8].4 +LD R4 TH.60 + +; Test ST with various template format +ST @0x400 R1 +ST @386 R2 +ST TS[8].4 R3 +ST TD[4].0 R4 +ST TH.60 R4 + +; Test Arith operation +ADD R2 R1 R3 +SUB R2 R1 R3 +MUL R2 R1 R3 +MAC R2 R1 R3 4 + +; Test ArithMsg operation with various immediat template format +ADDS R2 R1 10 +SUBS R2 R1 TI[4].0 +SSUB R2 R1 TI[2].4 +SUBS R2 R1 TI[4].0 + +; Test Pbs operation +PBS R2 R1 PbsNone +PBS_F R2 R1 PbsCarryInMsg diff --git a/backends/tfhe-hpu-backend/src/asm/tests/iop.asm b/backends/tfhe-hpu-backend/src/asm/tests/iop.asm new file mode 100644 index 000000000..4a87e003b --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/tests/iop.asm @@ -0,0 +1,31 @@ +; IOp Asm snippets sample that depicts all available format +; with the != available arguments modes + +; Simple Mode: +; 1 destination, 2 sources, no immediat +; With raw opcode -> 0x35 +IOP[0x35] +; With raw opcode -> 40 and dynamic Fw generation +; IOP[0x35] I8> +IOP[0x35] +; With opcode alias -> MUL +MUL + +; Simple Mode with immediat +; Source operands are defined through vector mode +MULS <0xaf> + +; Vectorized mode with opcode alias +; ADDV +; Nb: not implemented yet, use raw format instead +IOP[0x20] + +; Two destination w.o. opcode alias +; I.e. could be a div euclide which output divider and remainder +IOP[0x60] +; Previous operation could be defined with vector format. +IOP[0x40] + +; With multiple immediat +; Example this operation could compute D <- A*4 + B*8 +IOP[0x0] <0xdeadc0de> diff --git a/backends/tfhe-hpu-backend/src/asm/tests/mod.rs b/backends/tfhe-hpu-backend/src/asm/tests/mod.rs new file mode 100644 index 000000000..c1aae335f --- /dev/null +++ b/backends/tfhe-hpu-backend/src/asm/tests/mod.rs @@ -0,0 +1,49 @@ +//! +//! 
Test for DOp/IOp format + +use crate::asm::{dop, iop, Program}; + +#[test] +fn dop_asm_test() -> Result<(), anyhow::Error> { + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(true) + // Display source code line numbers + .with_line_number(true) + .without_time() + // Build & register the subscriber + .init(); + + let input_file = "src/tests/dop.asm"; + let dop_prg = Program::::read_asm(input_file)?; + println!("Parsing results:\n {dop_prg}"); + + Ok(()) +} + +#[test] +fn iop_asm_test() -> Result<(), anyhow::Error> { + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(true) + // Display source code line numbers + .with_line_number(true) + .without_time() + // Build & register the subscriber + .init(); + + let input_file = "src/tests/iop.asm"; + + let iop_prg = Program::::read_asm(input_file)?; + println!("Parsing results:\n {iop_prg}"); + + Ok(()) +} diff --git a/backends/tfhe-hpu-backend/src/entities/glwe_ciphertext.rs b/backends/tfhe-hpu-backend/src/entities/glwe_ciphertext.rs new file mode 100644 index 000000000..b561bb74b --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/glwe_ciphertext.rs @@ -0,0 +1,108 @@ +//! Module containing the definition of the HpuGlweCiphertext. +//! Raw typed container without any logic +//! Conversion from/into tfhers entities should be implemented inside tfhers to prevent dependency +//! loop + +use super::parameters::*; +use super::traits::container::*; + +/// A [`Hpu GLWE ciphertext`](`HpuGlweCiphertext`). 
+#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuGlweCiphertext { + data: C, + params: HpuParameters, +} + +impl AsMut<[C::Element]> for HpuGlweCiphertext { + fn as_mut(&mut self) -> &mut [C::Element] { + self.data.as_mut() + } +} + +impl AsRef<[C::Element]> for HpuGlweCiphertext { + fn as_ref(&self) -> &[C::Element] { + self.data.as_ref() + } +} + +pub fn hpu_glwe_ciphertext_size(params: &HpuParameters) -> usize { + (params.pbs_params.glwe_dimension + 1) * params.pbs_params.polynomial_size +} + +impl HpuGlweCiphertext { + /// Create a [`HpuGlweCiphertext`] from an existing container. + pub fn from_container(container: C, params: HpuParameters) -> Self { + assert!( + container.container_len() > 0, + "Got an empty container to create a HpuGlweCiphertext" + ); + assert!( + container.container_len() == hpu_glwe_ciphertext_size(¶ms), + "The provided container length is not valid. \ + It needs to match with parameters. \ + Got container length: {} and based on parameters value expect: {}.", + container.container_len(), + hpu_glwe_ciphertext_size(¶ms), + ); + Self { + data: container, + params, + } + } + + /// Consume the entity and return its underlying container. + /// + /// See [`HpuGlweCiphertext::from_container`] for usage. + pub fn into_container(self) -> C { + self.data + } +} + +impl HpuGlweCiphertext { + /// Return the [`Parameters`] of the [`HpuGlweCiphertext`]. + /// + /// See [`HpuGlweCiphertext::from_container`] for usage. + pub fn params(&self) -> &HpuParameters { + &self.params + } + + /// Return a view of the [`HpuGlweCiphertext`]. This is useful if an algorithm takes a view by + /// value. + pub fn as_view(&self) -> HpuGlweCiphertext<&'_ [C::Element]> { + HpuGlweCiphertext { + data: self.data.as_ref(), + params: self.params.clone(), + } + } +} + +impl HpuGlweCiphertext { + /// Mutable variant of [`HpuGlweCiphertext::as_view`]. 
+ pub fn as_mut_view(&mut self) -> HpuGlweCiphertext<&'_ mut [C::Element]> { + HpuGlweCiphertext { + data: self.data.as_mut(), + params: self.params.clone(), + } + } +} + +/// A [`HpuGlweCiphertext`] owning the memory for its own storage. +pub type HpuGlweCiphertextOwned = HpuGlweCiphertext>; +/// A [`HpuGlweCiphertext`] immutably borrowing memory for its own storage. +pub type HpuGlweCiphertextView<'data, Scalar> = HpuGlweCiphertext<&'data [Scalar]>; +/// A [`HpuGlweCiphertext`] mutably borrowing memory for its own storage. +pub type HpuGlweCiphertextMutView<'data, Scalar> = HpuGlweCiphertext<&'data mut [Scalar]>; + +impl HpuGlweCiphertextOwned { + /// Allocate memory and create a new owned [`HpuGlweCiphertext`]. + /// + /// # Note + /// + /// This function allocates a vector of the appropriate size and wraps it in the appropriate + /// type. + /// + /// See [`HpuGlweCiphertext::from_container`] for usage. + pub fn new(fill_with: Scalar, params: HpuParameters) -> Self { + Self::from_container(vec![fill_with; hpu_glwe_ciphertext_size(¶ms)], params) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/glwe_lookuptable.rs b/backends/tfhe-hpu-backend/src/entities/glwe_lookuptable.rs new file mode 100644 index 000000000..258d9366a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/glwe_lookuptable.rs @@ -0,0 +1,104 @@ +//! Module containing the definition of the HpuGlweLookuptable. +//! -> Mainly a Glwe body +//! Raw typed container without any logic +//! Conversion from/into tfhers entities should be implemented inside tfhers to prevent dependency +//! loop + +use super::parameters::*; +use super::traits::container::*; + +/// A [`Hpu GLWE lookuptable`](`HpuGlweLookuptable`). 
+#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuGlweLookuptable { + data: C, + params: HpuParameters, +} + +impl AsMut<[C::Element]> for HpuGlweLookuptable { + fn as_mut(&mut self) -> &mut [C::Element] { + self.data.as_mut() + } +} + +impl AsRef<[C::Element]> for HpuGlweLookuptable { + fn as_ref(&self) -> &[C::Element] { + self.data.as_ref() + } +} + +pub fn hpu_glwe_lookuptable_size(params: &HpuParameters) -> usize { + params.pbs_params.polynomial_size +} + +impl HpuGlweLookuptable { + /// Create a [`HpuGlweLookuptable`] from an existing container. + pub fn from_container(container: C, params: HpuParameters) -> Self { + assert!( + container.container_len() > 0, + "Got an empty container to create a HpuGlweLookuptable" + ); + assert!( + container.container_len() == hpu_glwe_lookuptable_size(¶ms), + "The provided container length is not valid. \ + It needs to match with parameters. \ + Got container length: {} and based on parameters value expect: {}.", + container.container_len(), + hpu_glwe_lookuptable_size(¶ms), + ); + Self { + data: container, + params, + } + } + + /// Consume the entity and return its underlying container. + /// + /// See [`HpuGlweLookuptable::from_container`] for usage. + pub fn into_container(self) -> C { + self.data + } +} + +impl HpuGlweLookuptable { + /// Return the [`Parameters`] of the [`HpuGlweLookuptable`]. + /// + /// See [`HpuGlweLookuptable::from_container`] for usage. + pub fn params(&self) -> &HpuParameters { + &self.params + } + + /// Return a view of the [`HpuGlweLookuptable`]. This is useful if an algorithm takes a view by + /// value. + pub fn as_view(&self) -> HpuGlweLookuptable<&'_ [C::Element]> { + HpuGlweLookuptable { + data: self.data.as_ref(), + params: self.params.clone(), + } + } +} + +impl HpuGlweLookuptable { + /// Mutable variant of [`HpuGlweLookuptable::as_view`]. 
+ pub fn as_mut_view(&mut self) -> HpuGlweLookuptable<&'_ mut [C::Element]> { + HpuGlweLookuptable { + data: self.data.as_mut(), + params: self.params.clone(), + } + } +} + +/// A [`HpuGlweLookuptable`] owning the memory for its own storage. +pub type HpuGlweLookuptableOwned = HpuGlweLookuptable>; +/// A [`HpuGlweLookuptable`] immutably borrowing memory for its own storage. +pub type HpuGlweLookuptableView<'data, Scalar> = HpuGlweLookuptable<&'data [Scalar]>; +/// A [`HpuGlweLookuptable`] mutably borrowing memory for its own storage. +pub type HpuGlweLookuptableMutView<'data, Scalar> = HpuGlweLookuptable<&'data mut [Scalar]>; + +impl HpuGlweLookuptableOwned { + /// Allocate memory and create a new owned [`HpuGlweLookuptable`]. + /// + /// See [`HpuGlweLookuptable::from_container`] for usage. + pub fn new(fill_with: Scalar, params: HpuParameters) -> Self { + Self::from_container(vec![fill_with; hpu_glwe_lookuptable_size(¶ms)], params) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/lwe_bootstrap_key.rs b/backends/tfhe-hpu-backend/src/entities/lwe_bootstrap_key.rs new file mode 100644 index 000000000..506602470 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/lwe_bootstrap_key.rs @@ -0,0 +1,161 @@ +//! Module containing the definition of the HpuLweBootstrapKey. +//! Raw typed container without any logic +//! Conversion from/into tfhers entities should be implemented inside tfhers to prevent dependency +//! loop + +use super::parameters::*; +use super::traits::container::*; + +/// A [`Hpu lwe bootstrapping key`](`HpuLweBootstrapKey`). 
+/// Inner container is split in pc chunks to ease copy from/to hardware +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuLweBootstrapKey { + pc_data: Vec, + params: HpuParameters, +} + +/// Index inside the container abstracting away the inner pc split +impl std::ops::Index for HpuLweBootstrapKey { + type Output = C::Element; + + fn index(&self, index: usize) -> &Self::Output { + let ntt_p = &self.params.ntt_params; + let bsk_pc = self.params.pc_params.bsk_pc; + let chunk_size = (ntt_p.radix * ntt_p.psi) / bsk_pc; + let (pc, ofst) = ( + (index / chunk_size) % bsk_pc, + (((index / chunk_size) / bsk_pc) * chunk_size) + (index % chunk_size), + ); + &self.pc_data[pc].as_ref()[ofst] + } +} + +/// IndexMut inside the container abstracting away the inner pc split +impl std::ops::IndexMut for HpuLweBootstrapKey { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + let ntt_p = &self.params.ntt_params; + let bsk_pc = self.params.pc_params.bsk_pc; + let chunk_size = (ntt_p.radix * ntt_p.psi) / bsk_pc; + let (pc, ofst) = ( + (index / chunk_size) % bsk_pc, + (((index / chunk_size) / bsk_pc) * chunk_size) + (index % chunk_size), + ); + &mut self.pc_data[pc].as_mut()[ofst] + } +} + +pub fn hpu_lwe_bootstrap_key_size(params: &HpuParameters) -> usize { + let pbs_p = ¶ms.pbs_params; + pbs_p.lwe_dimension // ciphertext_count + * pbs_p.pbs_level // ggsw_ciphertext_size + * ((pbs_p.glwe_dimension +1) // ggsw_level_matrix_size + * (pbs_p.glwe_dimension +1) + * pbs_p.polynomial_size) +} + +impl HpuLweBootstrapKey { + /// Create a [`HpuLweBootstrapKey`] from an existing container. 
+ pub fn from_container(container: Vec, params: HpuParameters) -> Self { + debug_assert_eq!( + (params.ntt_params.radix * params.ntt_params.psi) % params.pc_params.bsk_pc, + 0, + "Error: Incompatible (R*PSI: {}, BSK_PC: {})", + params.ntt_params.radix * params.ntt_params.psi, + params.pc_params.bsk_pc + ); + + assert_eq!( + container.len(), + params.pc_params.bsk_pc, + "Container chunk mismatch with bsk_pc number" + ); + assert!( + container.iter().map(|x| x.container_len()).sum::() > 0, + "Got an empty container to create a HpuLweBootstrapKey" + ); + assert_eq!( + container.iter().map(|x| x.container_len()).sum::(), + hpu_lwe_bootstrap_key_size(¶ms), + "The provided container length is not valid. \ + It needs to match with parameters. \ + Got container length: {} and based on parameters value expect: {}.", + container.iter().map(|x| x.container_len()).sum::(), + hpu_lwe_bootstrap_key_size(¶ms) + ); + + Self { + pc_data: container, + params, + } + } + + /// Consume the entity and return its underlying container. + /// + /// See [`HpuLweBootstrapKey::from_container`] for usage. + pub fn into_container(self) -> Vec { + self.pc_data + } +} + +impl HpuLweBootstrapKey { + /// Return the [`Parameters`] of the [`HpuLweBootstrapKey`]. + /// + /// See [`HpuLweBootstrapKey::from_container`] for usage. + pub fn params(&self) -> &HpuParameters { + &self.params + } + + /// Return the length of the [`HpuLweBootstrapKey`] underlying containers. + pub fn len(&self) -> usize { + self.pc_data.iter().map(|c| c.container_len()).sum() + } + + pub fn is_empty(&self) -> bool { + !self.pc_data.iter().any(|c| c.container_len() != 0) + } + + /// Return a view of the [`HpuLweBootstrapKey`]. This is useful if an algorithm takes a view by + /// value. 
+ pub fn as_view(&self) -> HpuLweBootstrapKey<&'_ [C::Element]> { + HpuLweBootstrapKey { + pc_data: self.pc_data.iter().map(|x| x.as_ref()).collect::>(), + params: self.params.clone(), + } + } +} + +impl HpuLweBootstrapKey { + /// Mutable variant of [`HpuLweBootstrapKey::as_view`]. + pub fn as_mut_view(&mut self) -> HpuLweBootstrapKey<&'_ mut [C::Element]> { + HpuLweBootstrapKey { + pc_data: self + .pc_data + .iter_mut() + .map(|x| x.as_mut()) + .collect::>(), + params: self.params.clone(), + } + } +} + +/// A [`HpuLweBootstrapKey`] owning the memory for its own storage. +pub type HpuLweBootstrapKeyOwned = HpuLweBootstrapKey>; +/// A [`HpuLweBootstrapKey`] immutably borrowing memory for its own storage. +pub type HpuLweBootstrapKeyView<'data, Scalar> = HpuLweBootstrapKey<&'data [Scalar]>; +/// A [`HpuLweBootstrapKey`] mutably borrowing memory for its own storage. +pub type HpuLweBootstrapKeyMutView<'data, Scalar> = HpuLweBootstrapKey<&'data mut [Scalar]>; + +impl HpuLweBootstrapKeyOwned { + /// Allocate memory and create a new owned [`HpuLweBootstrapKey`]. + /// + /// + /// See [`HpuLweBootstrapKey::from_container`] for usage. + pub fn new(fill_with: Scalar, params: HpuParameters) -> Self { + let chunk_size = hpu_lwe_bootstrap_key_size(¶ms).div_euclid(params.pc_params.bsk_pc); + let pc_data = (0..params.pc_params.bsk_pc) + .map(|_| vec![fill_with.clone(); chunk_size]) + .collect::>(); + + Self::from_container(pc_data, params) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/lwe_ciphertext.rs b/backends/tfhe-hpu-backend/src/entities/lwe_ciphertext.rs new file mode 100644 index 000000000..4bd7ff3e3 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/lwe_ciphertext.rs @@ -0,0 +1,164 @@ +//! Module containing the definition of the HpuLweCiphertext. +//! Raw typed container without any logic +//! Conversion from/into tfhers entities should be implemented inside tfhers to prevent dependency +//! 
loop + +use super::parameters::*; +use super::traits::container::*; + +/// A [`Hpu LWE ciphertext`](`HpuLweCiphertext`). +/// Inner container is split in pc chunks to ease copy from/to hardware +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuLweCiphertext { + pc_data: Vec, + params: HpuParameters, +} + +/// Index inside the container abstracting away the inner pc split +impl std::ops::Index for HpuLweCiphertext { + type Output = C::Element; + + fn index(&self, index: usize) -> &Self::Output { + let pem_pc = self.params.pc_params.pem_pc; + let chunk_size = self.params.regf_params.coef_nb / pem_pc; + let (pc, ofst) = ( + (index / chunk_size) % pem_pc, + (((index / chunk_size) / pem_pc) * chunk_size) + (index % chunk_size), + ); + &self.pc_data[pc].as_ref()[ofst] + } +} + +/// IndexMut inside the container abstracting away the inner pc split +impl std::ops::IndexMut for HpuLweCiphertext { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + let pem_pc = self.params.pc_params.pem_pc; + let chunk_size = self.params.regf_params.coef_nb / pem_pc; + let (pc, ofst) = ( + (index / chunk_size) % pem_pc, + (((index / chunk_size) / pem_pc) * chunk_size) + (index % chunk_size), + ); + &mut self.pc_data[pc].as_mut()[ofst] + } +} + +#[allow(unused)] +// NB: HPU only handle Big Lwe over it's boundaries +// Indeed only the Big encryption key-choice is supported and the small lwe stay inside the chip +// (never reach the host) +pub fn hpu_small_lwe_ciphertext_size(params: &HpuParameters) -> usize { + params.pbs_params.lwe_dimension + 1 +} + +pub fn hpu_big_lwe_ciphertext_size(params: &HpuParameters) -> usize { + (params.pbs_params.glwe_dimension * params.pbs_params.polynomial_size) + 1 +} + +impl HpuLweCiphertext { + /// Create a [`HpuLweCiphertext`] from an existing container. 
+ pub fn from_container(container: Vec, params: HpuParameters) -> Self { + assert_eq!( + container.len(), + params.pc_params.pem_pc, + "Container chunk mismatch with pem_pc number" + ); + assert!( + container.iter().map(|x| x.container_len()).sum::() > 0, + "Got an empty container to create a HpuLweCiphertext" + ); + assert_eq!( + container.iter().map(|x| x.container_len()).sum::(), + hpu_big_lwe_ciphertext_size(¶ms), + "The provided container length is not valid. \ + It needs to match with parameters. \ + Got container length: {} and based on parameters value expect: {}.", + container.iter().map(|x| x.container_len()).sum::(), + hpu_big_lwe_ciphertext_size(¶ms), + ); + Self { + pc_data: container, + params, + } + } + + /// Consume the entity and return its underlying container. + /// + /// See [`HpuLweCiphertext::from_container`] for usage. + pub fn into_container(self) -> Vec { + self.pc_data + } +} + +impl HpuLweCiphertext { + /// Return the [`Parameters`] of the [`HpuLweCiphertext`]. + /// + /// See [`HpuLweCiphertext::from_container`] for usage. + pub fn params(&self) -> &HpuParameters { + &self.params + } + + /// Return the length of the [`HpuLweCiphertext`] underlying containers. + pub fn len(&self) -> usize { + self.pc_data.iter().map(|c| c.container_len()).sum() + } + + pub fn is_empty(&self) -> bool { + !self.pc_data.iter().any(|c| c.container_len() != 0) + } + + /// Return a view of the [`HpuLweCiphertext`]. This is useful if an algorithm takes a view by + /// value. + pub fn as_view(&self) -> HpuLweCiphertext<&'_ [C::Element]> { + HpuLweCiphertext { + pc_data: self.pc_data.iter().map(|x| x.as_ref()).collect::>(), + params: self.params.clone(), + } + } +} + +impl HpuLweCiphertext { + /// Mutable variant of [`HpuLweCiphertext::as_view`]. 
+ pub fn as_mut_view(&mut self) -> HpuLweCiphertext<&'_ mut [C::Element]> { + HpuLweCiphertext { + pc_data: self + .pc_data + .iter_mut() + .map(|x| x.as_mut()) + .collect::>(), + params: self.params.clone(), + } + } +} + +/// A [`HpuLweCiphertext`] owning the memory for its own storage. +pub type HpuLweCiphertextOwned = HpuLweCiphertext>; +/// A [`HpuLweCiphertext`] immutably borrowing memory for its own storage. +pub type HpuLweCiphertextView<'data, Scalar> = HpuLweCiphertext<&'data [Scalar]>; +/// A [`HpuLweCiphertext`] mutably borrowing memory for its own storage. +pub type HpuLweCiphertextMutView<'data, Scalar> = HpuLweCiphertext<&'data mut [Scalar]>; + +impl HpuLweCiphertextOwned { + /// Allocate memory and create a new owned [`HpuLweCiphertext`]. + /// + /// # Note + /// + /// This function allocates a vector of the appropriate size and wraps it in the appropriate + /// type. + /// + /// See [`HpuLweCiphertext::from_container`] for usage. + pub fn new(fill_with: Scalar, params: HpuParameters) -> Self { + // Mask is equally split in pc chunks. + // Body is then added to first chunk + let chunk_size = hpu_big_lwe_ciphertext_size(¶ms).div_euclid(params.pc_params.pem_pc); + let pc_data = (0..params.pc_params.pem_pc) + .map(|id| { + if (id == 0) && (params.pc_params.pem_pc != 1) { + vec![fill_with.clone(); chunk_size + 1] + } else { + vec![fill_with.clone(); chunk_size] + } + }) + .collect::>(); + Self::from_container(pc_data, params) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/lwe_keyswitch_key.rs b/backends/tfhe-hpu-backend/src/entities/lwe_keyswitch_key.rs new file mode 100644 index 000000000..c7e3a7dc8 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/lwe_keyswitch_key.rs @@ -0,0 +1,149 @@ +//! Module containing the definition of the HpuLweKeyswitchKey. +//! Raw typed container without any logic +//! Conversion from/into tfhers entities should be implemented inside tfhers to prevent dependency +//! 
loop + +use super::parameters::*; +use super::traits::container::*; + +/// A [`Hpu Lwe Keyswitch key`](`HpuLweKeyswitchKey`). +/// Inner container is split in pc chunks to ease copy from/to hardware +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuLweKeyswitchKey { + pc_data: Vec, + params: HpuParameters, +} + +/// Index inside the container abstracting away the inner pc split +impl std::ops::Index for HpuLweKeyswitchKey { + type Output = C::Element; + + fn index(&self, index: usize) -> &Self::Output { + let (pc, ofst) = self.get_pc_offset_from_index(index); + &self.pc_data[pc].as_ref()[ofst] + } +} + +/// IndexMut inside the container abstracting away the inner pc split +impl std::ops::IndexMut for HpuLweKeyswitchKey { + fn index_mut(&mut self, index: usize) -> &mut Self::Output { + let (pc, ofst) = self.get_pc_offset_from_index(index); + &mut self.pc_data[pc].as_mut()[ofst] + } +} + +pub fn hpu_lwe_keyswitch_key_size(params: &HpuParameters) -> usize { + // HwkeyswitchKey is a polyhedron padded with 0 to be multiple of lbx,lby,lbz + let ks_p = ¶ms.ks_params; + let pbs_p = ¶ms.pbs_params; + let hw_ksk_x = ks_p.lbx * (pbs_p.lwe_dimension + 1).div_ceil(ks_p.lbx); + let hw_ksk_y = ks_p.lby * (pbs_p.glwe_dimension * pbs_p.polynomial_size).div_ceil(ks_p.lby); + // coefs over z are packed in u64 + let hw_ksk_z = pbs_p.ks_level.div_ceil(ks_p.lbz); + + hw_ksk_x * hw_ksk_y * hw_ksk_z +} + +impl HpuLweKeyswitchKey { + /// Create a [`HpuLweKeyswitchKey`] from an existing container. + pub fn from_container(container: Vec, params: HpuParameters) -> Self { + assert_eq!( + container.len(), + params.pc_params.ksk_pc, + "Container chunk mismatch with ksk_pc number" + ); + assert_eq!( + container.iter().map(|x| x.container_len()).sum::(), + hpu_lwe_keyswitch_key_size(¶ms), + "The provided container length is not valid. \ + It needs to match with parameters. 
\ + Got container length: {} and based on parameters value expect: {}.", + container.iter().map(|x| x.container_len()).sum::(), + hpu_lwe_keyswitch_key_size(¶ms) + ); + Self { + pc_data: container, + params, + } + } + + /// Consume the entity and return its underlying container. + /// + /// See [`HpuLweKeyswitchKey::from_container`] for usage. + pub fn into_container(self) -> Vec { + self.pc_data + } +} + +impl HpuLweKeyswitchKey { + /// Return the [`Parameters`] of the [`HpuLweKeyswitchKey`]. + /// + /// See [`HpuLweKeyswitchKey::from_container`] for usage. + pub fn params(&self) -> &HpuParameters { + &self.params + } + + /// Return the length of the [`HpuLweKeyswitchKey`] underlying containers. + pub fn len(&self) -> usize { + self.pc_data.iter().map(|c| c.container_len()).sum() + } + + pub fn is_empty(&self) -> bool { + !self.pc_data.iter().any(|c| c.container_len() != 0) + } + + /// Return a view of the [`HpuLweKeyswitchKey`]. This is useful if an algorithm takes a view by + /// value. + pub fn as_view(&self) -> HpuLweKeyswitchKey<&'_ [C::Element]> { + HpuLweKeyswitchKey { + pc_data: self.pc_data.iter().map(|x| x.as_ref()).collect::>(), + params: self.params.clone(), + } + } + + /// Utility function to retrieved pc/offset from a global index in the key + /// Use by the Index/IndexMut trait implementation + fn get_pc_offset_from_index(&self, index: usize) -> (usize, usize) { + let ksk_pc = self.params.pc_params.ksk_pc; + let chunk_size = self.params.ks_params.lby / ksk_pc; + ( + (index / chunk_size) % ksk_pc, + (((index / chunk_size) / ksk_pc) * chunk_size) + (index % chunk_size), + ) + } +} + +impl HpuLweKeyswitchKey { + /// Mutable variant of [`HpuLweKeyswitchKey::as_view`]. 
+ pub fn as_mut_view(&mut self) -> HpuLweKeyswitchKey<&'_ mut [C::Element]> { + HpuLweKeyswitchKey { + pc_data: self + .pc_data + .iter_mut() + .map(|x| x.as_mut()) + .collect::>(), + params: self.params.clone(), + } + } +} + +/// A [`HpuLweKeyswitchKey`] owning the memory for its own storage. +pub type HpuLweKeyswitchKeyOwned = HpuLweKeyswitchKey>; +/// A [`HpuLweKeyswitchKey`] immutably borrowing memory for its own storage. +pub type HpuLweKeyswitchKeyView<'data, Scalar> = HpuLweKeyswitchKey<&'data [Scalar]>; +/// A [`HpuLweKeyswitchKey`] mutably borrowing memory for its own storage. +pub type HpuLweKeyswitchKeyMutView<'data, Scalar> = HpuLweKeyswitchKey<&'data mut [Scalar]>; + +impl HpuLweKeyswitchKeyOwned { + /// Allocate memory and create a new owned [`HpuLweKeyswitchKey`]. + /// + /// + /// See [`HpuLweKeyswitchKey::from_container`] for usage. + pub fn new(fill_with: Scalar, params: HpuParameters) -> Self { + let chunk_size = hpu_lwe_keyswitch_key_size(¶ms) / params.pc_params.ksk_pc; + let pc_data = (0..params.pc_params.ksk_pc) + .map(|_| vec![fill_with.clone(); chunk_size]) + .collect::>(); + Self::from_container(pc_data, params) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/mod.rs b/backends/tfhe-hpu-backend/src/entities/mod.rs new file mode 100644 index 000000000..5a6367245 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/mod.rs @@ -0,0 +1,37 @@ +pub(crate) mod traits; + +pub mod parameters; +pub use parameters::{ + HpuIscParameters, HpuKeyswitchParameters, HpuNoiseDistributionInput, HpuNttCoreArch, + HpuNttParameters, HpuPBSParameters, HpuParameters, HpuPcParameters, HpuRegfileParameters, +}; + +pub mod glwe_ciphertext; +pub use glwe_ciphertext::{ + hpu_glwe_ciphertext_size, HpuGlweCiphertextMutView, HpuGlweCiphertextOwned, + HpuGlweCiphertextView, +}; + +pub mod glwe_lookuptable; +pub use glwe_lookuptable::{ + hpu_glwe_lookuptable_size, HpuGlweLookuptableMutView, HpuGlweLookuptableOwned, + HpuGlweLookuptableView, +}; + +pub mod 
lwe_bootstrap_key; +pub use lwe_bootstrap_key::{ + hpu_lwe_bootstrap_key_size, HpuLweBootstrapKeyMutView, HpuLweBootstrapKeyOwned, + HpuLweBootstrapKeyView, +}; + +pub mod lwe_ciphertext; +pub use lwe_ciphertext::{ + hpu_big_lwe_ciphertext_size, HpuLweCiphertextMutView, HpuLweCiphertextOwned, + HpuLweCiphertextView, +}; + +pub mod lwe_keyswitch_key; +pub use lwe_keyswitch_key::{ + hpu_lwe_keyswitch_key_size, HpuLweKeyswitchKeyMutView, HpuLweKeyswitchKeyOwned, + HpuLweKeyswitchKeyView, +}; diff --git a/backends/tfhe-hpu-backend/src/entities/parameters.rs b/backends/tfhe-hpu-backend/src/entities/parameters.rs new file mode 100644 index 000000000..14d3e31c3 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/parameters.rs @@ -0,0 +1,197 @@ +//! Contains definition of Hpu architecture related parameters +//! Those parameters are architecture dependents and have direct impact over memory order +//! They are required to correctly arrange entities data in an Hpu usable order. + +#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub enum HpuNoiseDistributionInput { + GaussianStdDev(f64), + TUniformBound(u32), +} + +/// Parameters related to Tfhe scheme computation +/// Couldn't rely on ClassicPBSParameters to prevent dependency loop +#[derive(Clone, Copy, Debug, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct HpuPBSParameters { + pub lwe_dimension: usize, + pub glwe_dimension: usize, + pub polynomial_size: usize, + pub lwe_noise_distribution: HpuNoiseDistributionInput, + pub glwe_noise_distribution: HpuNoiseDistributionInput, + pub pbs_base_log: usize, + pub pbs_level: usize, + pub ks_base_log: usize, + pub ks_level: usize, + pub message_width: usize, + pub carry_width: usize, + pub ciphertext_width: usize, +} +// Manual implementation of Eq trait +// Indeed, we can handle strict comparison of f64 +impl std::cmp::Eq for HpuPBSParameters {} + +impl HpuPBSParameters { + /// Compute associated encoding delta. 
+ /// Used for scalar encoding + pub fn delta(&self) -> u64 { + 1_u64 + << (self.ciphertext_width + - (self.message_width + self.carry_width + /* padding_bit */ 1)) + } +} + +/// Parameters related to Keyswitch computation +/// Related to architectural implementation of Ks in Hpu +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuKeyswitchParameters { + /// Bit width + pub width: usize, + /// Parallelism over X + pub lbx: usize, + /// Parallelism over Y + pub lby: usize, + /// Parallelism over Z + pub lbz: usize, +} + +/// Parameters related to NTT computation +/// Related to architectural implementation of NTT/INTT in Hpu +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuNttParameters { + /// Core architecture + pub core_arch: HpuNttCoreArch, + /// Minimum #PBS in the batch for full throughput + pub min_pbs_nb: Option, + /// #PBS in Ntt Pipe + pub batch_pbs_nb: usize, + /// Maximum #PBS store in Pep + pub total_pbs_nb: usize, + + /// Bit width of ciphertext modulus (pow2 modulus) + pub ct_width: u32, + + /// Radix value. Must be a power of 2 + pub radix: usize, + /// Stages number -> Total number of stages. Note that R^S = N the number of coefficients of + /// the NTT. 
+ pub stg_nb: usize, + // Prime used during computation + pub prime_modulus: HpuNttPrime, + + /// Psi value -> Number of radix blocks that work in parallel + pub psi: usize, + /// Delta value -> Number of stages before pcg network + pub delta: usize, +} + +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum HpuNttCoreArch { + WmmCompactPcg, + WmmUnfoldPcg, + GF64(Vec), +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum HpuNttPrime { + GF64 = 0, + Solinas3_32_17_13 = 1, + Solinas2_44_14 = 2, +} +impl From<&HpuNttPrime> for u64 { + fn from(prime: &HpuNttPrime) -> Self { + match prime { + HpuNttPrime::GF64 => { + /* Goldilocks64 */ + ((1_u128 << 64) - (1_u128 << 32) + 1_u128) as u64 + } + HpuNttPrime::Solinas3_32_17_13 => { + /* Solinas3_32_17_13 */ + ((1_u128 << 32) - (1_u128 << 17) - (1_u128 << 13) + 1) as u64 + } + HpuNttPrime::Solinas2_44_14 => { + /* Solinas2_44_14 */ + ((1_u128 << 44) - (1_u128 << 14) + 1) as u64 + } + } + } +} + +impl HpuNttParameters { + pub fn stg_iter(&self, poly_n: usize) -> usize { + poly_n / (self.radix * self.psi) + } + + pub fn ls_delta(&self) -> usize { + if 0 == (self.stg_nb % self.delta) { + self.delta - 1 + } else { + (self.stg_nb % self.delta) - 1 + } + } +} + +/// Parameters related to Hbm PC +/// Related to memory connection and allocated channel +/// Only specify the number of Pc allocated to each interface. +/// The concrete mapping of pc on memory channel is defined by the user in the top-level +/// configuration file. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuPcParameters { + pub ksk_pc: usize, + pub ksk_bytes_w: usize, + pub bsk_pc: usize, + pub bsk_bytes_w: usize, + pub pem_pc: usize, + pub pem_bytes_w: usize, + // pub glwe_pc: usize, // Currently hardcoded to 1 + pub glwe_bytes_w: usize, +} + +/// Parameters related to regfile +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuRegfileParameters { + /// Number of registers + pub reg_nb: usize, + /// Number of coefs in parallel (`//`) at the regfile boundary + pub coef_nb: usize, +} + +/// Parameters related to Instruction Scheduler +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuIscParameters { + /// HPU lookahead buffer depth + /// Number of instructions that are considered in advance + pub depth: usize, + + /// Minimum Number of DOps per IOp + pub min_iop_size: usize, +} + +/// HpuArchitecturesParameters +/// Describe Architecture constants that have a direct impact on memory shuffling and slicing +#[derive(Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct HpuParameters { + pub pbs_params: HpuPBSParameters, + pub ntt_params: HpuNttParameters, + pub ks_params: HpuKeyswitchParameters, + pub pc_params: HpuPcParameters, + pub regf_params: HpuRegfileParameters, + pub isc_params: HpuIscParameters, +} + +/// Deserialization helpers for [`HpuParameters`] +impl HpuParameters { + /// Load the parameters from the TOML file `file`. Panics if it cannot be read or parsed. + pub fn from_toml(file: &str) -> Self { + let file_str = match std::fs::read_to_string(file) { + Ok(str) => str, + Err(err) => { + panic!("Error: `{file}`:: {err}"); + } + }; + + match toml::from_str(&file_str) { + Ok(cfg) => cfg, + Err(err) => panic!("Toml error in `{file}`: {err}"), + } + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/traits/container.rs b/backends/tfhe-hpu-backend/src/entities/traits/container.rs new file mode 100644 index 
000000000..9d813e5e0 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/traits/container.rs @@ -0,0 +1,104 @@ +//! Module with traits pertaining to container manipulation. +//! +//! NB: This module is a duplication of tfhers/core_crypto/commons/traits. +//! Duplicated to keep genericity in Hpu backend code without circular dependency. +//! Warning: rustc sees those traits as distinct from the tfhers ones + +/// A trait to manipulate various immutable container types transparently. +pub trait Container: AsRef<[Self::Element]> { + type Element; + + fn container_len(&self) -> usize { + self.as_ref().len() + } +} + +/// A trait to manipulate various mutable container types transparently. +pub trait ContainerMut: Container + AsMut<[::Element]> {} + +impl Container for [T] { + type Element = T; +} + +impl ContainerMut for [T] {} + +impl Container for Vec { + type Element = T; +} + +impl ContainerMut for Vec {} + +impl Container for &[T] { + type Element = T; +} + +impl Container for &mut [T] { + type Element = T; +} + +impl ContainerMut for &mut [T] {} + +impl Container for Box<[T]> { + type Element = T; +} + +impl ContainerMut for Box<[T]> {} + +pub trait Split: Sized { + type Chunks: DoubleEndedIterator + ExactSizeIterator; + + #[allow(unused)] + fn into_chunks(self, chunk_size: usize) -> Self::Chunks; + #[allow(unused)] + fn split_into(self, chunk_count: usize) -> Self::Chunks; + #[allow(unused)] + fn split_at(self, mid: usize) -> (Self, Self); +} + +impl<'a, T> Split for &'a [T] { + type Chunks = core::slice::ChunksExact<'a, T>; + + #[inline] + fn into_chunks(self, chunk_size: usize) -> Self::Chunks { + debug_assert_eq!(self.len() % chunk_size, 0); + self.chunks_exact(chunk_size) + } + #[inline] + fn split_into(self, chunk_count: usize) -> Self::Chunks { + if chunk_count == 0 { + debug_assert_eq!(self.len(), 0); + self.chunks_exact(1) + } else { + debug_assert_eq!(self.len() % chunk_count, 0); + self.chunks_exact(self.len() / chunk_count) + } + } + #[inline] + fn 
split_at(self, mid: usize) -> (Self, Self) { + self.split_at(mid) + } +} + +impl<'a, T> Split for &'a mut [T] { + type Chunks = core::slice::ChunksExactMut<'a, T>; + + #[inline] + fn into_chunks(self, chunk_size: usize) -> Self::Chunks { + debug_assert_eq!(self.len() % chunk_size, 0); + self.chunks_exact_mut(chunk_size) + } + #[inline] + fn split_into(self, chunk_count: usize) -> Self::Chunks { + if chunk_count == 0 { + debug_assert_eq!(self.len(), 0); + self.chunks_exact_mut(1) + } else { + debug_assert_eq!(self.len() % chunk_count, 0); + self.chunks_exact_mut(self.len() / chunk_count) + } + } + #[inline] + fn split_at(self, mid: usize) -> (Self, Self) { + self.split_at_mut(mid) + } +} diff --git a/backends/tfhe-hpu-backend/src/entities/traits/mod.rs b/backends/tfhe-hpu-backend/src/entities/traits/mod.rs new file mode 100644 index 000000000..5ab42603f --- /dev/null +++ b/backends/tfhe-hpu-backend/src/entities/traits/mod.rs @@ -0,0 +1 @@ +pub(crate) mod container; diff --git a/backends/tfhe-hpu-backend/src/ffi/mod.rs b/backends/tfhe-hpu-backend/src/ffi/mod.rs new file mode 100644 index 000000000..327d23775 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/mod.rs @@ -0,0 +1,328 @@ +//! This ffi layer implement a wrapper around multiple ffi implementation +//! The aim is to completely hide underlying specificities and enable compile-time +//! swapping. +//! +//! 
Mainly replacing Xrt(u55c)/V80 by a simulation interface to ease CI + +use crate::interface::FFIMode; + +/// Enumeration to define the synchronisation of data between Host and Device +#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] +pub enum SyncMode { + Host2Device, + Device2Host, +} + +/// Specify kind of the target memory +/// Used for target that has DDR and HBM +/// Hbm is targeted based on attach PC number, the DDR otherwise is targeted based on offset +/// For the sake of simplicity and prevent issue with large xfer, memory is always viewed as a chunk +/// of 16MiB This is inherited from XRT allocator limitation... +#[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] +pub enum MemKind { + Ddr { offset: usize }, + Hbm { pc: usize }, +} + +/// Define memory zone properties +#[derive(Debug, Clone, Copy)] +pub struct MemZoneProperties { + pub mem_kind: MemKind, + pub size_b: usize, +} + +pub struct HpuHw( + #[cfg(feature = "hw-xrt")] cxx::UniquePtr, + #[cfg(feature = "hw-v80")] v80::HpuHw, + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] sim::HpuHw, +); + +impl HpuHw { + /// Read Hw register through ffi + #[inline(always)] + pub fn read_reg(&self, addr: u64) -> u32 { + #[cfg(feature = "hw-xrt")] + { + self.0.read_reg(addr) + } + + #[cfg(feature = "hw-v80")] + { + self.0.ami.read_reg(addr) + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.read_reg(addr) + } + } + + /// Write Hw register through ffi + #[inline(always)] + pub fn write_reg(&mut self, addr: u64, value: u32) { + #[cfg(feature = "hw-xrt")] + { + self.0.pin_mut().write_reg(addr, value) + } + + #[cfg(feature = "hw-v80")] + { + self.0.ami.write_reg(addr, value) + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.write_reg(addr, value) + } + } + + /// Handle on-board memory init through ffi + #[inline(always)] + #[allow(unused_variables)] + pub fn init_mem( + &mut self, + config: 
&crate::interface::HpuConfig, + params: &crate::entities::HpuParameters, + ) { + // NB: Currently only v80 backend required explicit memory init + #[cfg(feature = "hw-v80")] + { + self.0.init_mem(config, params); + } + } + /// Handle on-board memory allocation through ffi + #[inline(always)] + pub fn alloc(&mut self, props: MemZoneProperties) -> MemZone { + #[cfg(feature = "hw-xrt")] + { + let xrt_mz = self.0.pin_mut().alloc(props.into()); + MemZone(xrt_mz) + } + + #[cfg(feature = "hw-v80")] + { + MemZone(self.0.alloc(props)) + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + MemZone(self.0.alloc(props)) + } + } + + /// Handle on-board memory deallocation through ffi + #[inline(always)] + #[allow(unused_variables)] + pub fn release(&mut self, zone: &mut MemZone) { + // #[cfg(feature = "hw-xrt")] + // { + // todo!("Handle memory release"); + // } + + #[cfg(feature = "hw-v80")] + { + self.0.release(&mut zone.0); + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.release(&mut zone.0); + } + } + + /// Handle ffi instantiation + #[inline(always)] + pub fn new_hpu_hw(mode: &FFIMode, #[allow(unused)] retry_rate: std::time::Duration) -> HpuHw { + #[cfg(feature = "hw-xrt")] + { + use tracing::{enabled, Level}; + // Check config + match mode { + FFIMode::Xrt { id, kernel, xclbin } => { + // Extract trace verbosity and convert it in cxx understandable value + let verbosity = { + if enabled!(target: "cxx", Level::TRACE) { + xrt::VerbosityCxx::Trace + } else if enabled!(target: "cxx", Level::DEBUG) { + xrt::VerbosityCxx::Debug + } else if enabled!(target: "cxx", Level::INFO) { + xrt::VerbosityCxx::Info + } else if enabled!(target: "cxx", Level::WARN) { + xrt::VerbosityCxx::Warning + } else { + xrt::VerbosityCxx::Error + } + }; + Self(xrt::new_hpu_hw( + *id, + kernel.expand(), + xclbin.expand(), + verbosity, + )) + } + _ => panic!("Unsupported config type with ffi::xrt"), + } + } + + #[cfg(feature = "hw-v80")] + { + match mode { + 
FFIMode::V80 { + ami_id, + qdma_h2c, + qdma_c2h, + } => Self(v80::HpuHw::new_hpu_hw( + *ami_id, + retry_rate, + &qdma_h2c.expand(), + &qdma_c2h.expand(), + )), + _ => panic!("Unsupported config type with ffi::v80"), + } + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + match mode { + FFIMode::Sim { ipc_name } => Self(sim::HpuHw::new_hpu_hw(&ipc_name.expand())), + _ => panic!("Unsupported config type with ffi::sim"), + } + } + } + + /// Custom register command to retrieved custom parameters set from mockup. + /// Only available with mockup FFI + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + pub fn get_pbs_parameters(&mut self) -> crate::entities::HpuPBSParameters { + self.0.get_pbs_parameters() + } + + /// Custom command only supported on V80 to push work + #[cfg(feature = "hw-v80")] + pub fn iop_push(&mut self, stream: &[u32]) { + self.0.ami.iop_push(stream) + } + + /// Custom command only supported on V80 to push work + #[cfg(feature = "hw-v80")] + pub fn dop_push(&mut self, stream: &[u32]) { + self.0.ami.dop_push(stream) + } + + /// Custom command only supported on V80 to rd_ack + #[cfg(feature = "hw-v80")] + pub fn iop_ack_rd(&mut self) -> u32 { + self.0.ami.iop_ackq_rd() + } +} + +pub struct MemZone( + #[cfg(feature = "hw-xrt")] cxx::UniquePtr, + #[cfg(feature = "hw-v80")] v80::MemZone, + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] sim::MemZone, +); + +// With Xrt backend, Opaque Cxx Object prevent compiler to auto impl Send+Sync +// However, it's safe to implement them +#[cfg(feature = "hw-xrt")] +unsafe impl Send for MemZone {} +#[cfg(feature = "hw-xrt")] +unsafe impl Sync for MemZone {} + +impl MemZone { + /// Read a bytes slice in the associated MemZone + #[inline(always)] + pub fn read_bytes(&self, ofst: usize, bytes: &mut [u8]) { + self.0.read_bytes(ofst, bytes); + } + + /// Get physical MemZone address + #[inline(always)] + pub fn paddr(&self) -> u64 { + self.0.paddr() + } + + /// Get MemZone size in byte + 
#[inline(always)] + #[allow(unused)] + pub fn size(&self) -> usize { + self.0.size() + } + + /// Get write byte slice in MemZone at a given offset + #[inline(always)] + pub fn write_bytes(&mut self, ofst: usize, bytes: &[u8]) { + #[cfg(feature = "hw-xrt")] + { + self.0.pin_mut().write_bytes(ofst, bytes) + } + + #[cfg(feature = "hw-v80")] + { + self.0.write_bytes(ofst, bytes) + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.write_bytes(ofst, bytes) + } + } + + /// Map MemZone in userspace + #[inline(always)] + #[allow(unused)] + pub fn mmap(&mut self) -> &mut [u64] { + #[cfg(feature = "hw-xrt")] + { + self.0.pin_mut().mmap() + } + + #[cfg(feature = "hw-v80")] + { + panic!("V80 ffi rely on QDMA and couldn't implement mmap") + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.mmap() + } + } + + /// Handle MemZone synchronisation with the hw target + #[inline(always)] + #[allow(unused)] + pub fn sync(&mut self, mode: SyncMode) { + #[cfg(feature = "hw-xrt")] + { + self.0.pin_mut().sync(mode.into()) + } + + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + self.0.sync(mode) + } + } +} + +/// Generic function to easily handle multiple word size +impl MemZone { + pub fn read(&self, ofst: usize, data: &mut [T]) { + let data_bytes = bytemuck::cast_slice_mut::(data); + let ofst_bytes = ofst * std::mem::size_of::(); + self.read_bytes(ofst_bytes, data_bytes); + } + + pub fn write(&mut self, ofst: usize, data: &[T]) { + let data_bytes = bytemuck::cast_slice::(data); + let ofst_bytes = ofst * std::mem::size_of::(); + self.write_bytes(ofst_bytes, data_bytes); + } +} + +#[cfg(feature = "hw-v80")] +mod v80; +#[cfg(feature = "hw-xrt")] +mod xrt; + +#[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] +pub(crate) mod sim; diff --git a/backends/tfhe-hpu-backend/src/ffi/sim/ipc.rs b/backends/tfhe-hpu-backend/src/ffi/sim/ipc.rs new file mode 100644 index 000000000..cf0706265 --- /dev/null +++ 
b/backends/tfhe-hpu-backend/src/ffi/sim/ipc.rs @@ -0,0 +1,215 @@ +//! IPC interface and associated Commands + +use ipc_channel::ipc::{self, IpcOneShotServer, IpcReceiver, IpcSender}; +use std::fs::OpenOptions; +use std::io::{BufRead, BufReader, Write}; +use std::path::Path; +use std::sync::{Arc, Mutex}; + +use serde::{Deserialize, Serialize}; + +use crate::entities::HpuPBSParameters; +use crate::ffi::{self, SyncMode}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct IpcError {} + +/// Register request +#[derive(Debug, Serialize, Deserialize)] +pub enum RegisterReq { + Read { addr: u64 }, + Write { addr: u64, value: u32 }, + PbsParams, +} + +/// Register acknowledgment +#[derive(Debug, Serialize, Deserialize)] +pub enum RegisterAck { + Read(u32), + Write, + PbsParams(HpuPBSParameters), +} + +/// FFI side of IPC channel used for Register xfer +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct RegisterFfi { + pub(crate) req: IpcSender, + pub(crate) ack: IpcReceiver, +} +/// Sim side of IPC channel used for Register xfer +#[derive(Debug, Serialize, Deserialize)] +pub struct RegisterSim { + pub req: IpcReceiver, + pub ack: IpcSender, +} + +pub(crate) fn register_channel() -> (RegisterFfi, RegisterSim) { + let (req_tx, req_rx) = ipc::channel().unwrap(); + let (ack_tx, ack_rx) = ipc::channel().unwrap(); + + ( + RegisterFfi { + req: req_tx, + ack: ack_rx, + }, + RegisterSim { + req: req_rx, + ack: ack_tx, + }, + ) +} + +/// Memory request +#[derive(Debug, Serialize, Deserialize)] +pub enum MemoryReq { + Allocate { + mem_kind: ffi::MemKind, + size_b: usize, + }, + Sync { + mem_kind: ffi::MemKind, + addr: u64, + mode: SyncMode, + data: Option, + }, + Release { + mem_kind: ffi::MemKind, + addr: u64, + }, +} + +/// Memory acknowledgment +#[derive(Debug, Serialize, Deserialize)] +pub enum MemoryAck { + Allocate { addr: u64 }, + Sync { data: Option }, + Release, +} + +/// FFI side of IPC channel used for Memory xfer +#[derive(Debug, Serialize, Deserialize)] 
+pub(crate) struct MemoryFfi { + pub(crate) req: IpcSender, + pub(crate) ack: IpcReceiver, +} +/// FFI memory wrapped in an Arc> +/// Indeed, this object must be shared with all MemZone to enable proper sync +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct MemoryFfiWrapped(pub(crate) Arc>); + +impl From for MemoryFfiWrapped { + fn from(value: MemoryFfi) -> Self { + Self(Arc::new(Mutex::new(value))) + } +} + +/// Sim side of IPC channel used for Memory xfer +#[derive(Debug, Serialize, Deserialize)] +pub struct MemorySim { + pub req: IpcReceiver, + pub ack: IpcSender, +} + +pub(crate) fn memory_channel() -> (MemoryFfi, MemorySim) { + let (req_tx, req_rx) = ipc::channel().unwrap(); + let (ack_tx, ack_rx) = ipc::channel().unwrap(); + + ( + MemoryFfi { + req: req_tx, + ack: ack_rx, + }, + MemorySim { + req: req_rx, + ack: ack_tx, + }, + ) +} + +/// FFI side of IPC channel used for Memory xfer +/// Gather Register/Memory interface together to easily exchange them across OneShot server +#[derive(Debug, Serialize, Deserialize)] +pub(crate) struct IpcFfi { + pub(crate) register: RegisterFfi, + pub(crate) memory: MemoryFfiWrapped, +} + +impl IpcFfi { + /// Create IPC binding for Register and Memory interface + /// Use a named file to retrieved the OneShot IPC channel that enable to exchange + /// typed ipc_channels + pub fn new_bind_on(ipc_name: &str) -> IpcFfi { + // Open file + let mut rd_f = BufReader::new( + OpenOptions::new() + .create(false) + .read(true) + .open(ipc_name) + .unwrap(), + ); + // Read name of the targeted oneshot channel + let oneshot_name = { + let mut name = String::new(); + rd_f.read_line(&mut name).unwrap(); + name + }; + tracing::debug!("Will bind through {oneshot_name}"); + + // Connet to the oneshot channel + let bind_tx = IpcSender::connect(oneshot_name).unwrap(); + + // Generate ipc channel and send Sim side through oneshot + let (ffi, sim) = ipc_channel(); + bind_tx.send(sim).unwrap(); + + ffi + } +} + +#[derive(Debug, 
Serialize, Deserialize)] +pub struct IpcSim { + pub register: RegisterSim, + pub memory: MemorySim, +} +impl IpcSim { + /// Create IPC Oneshot server and wait for endpoint + pub fn new_bind_on(ipc_name: &str) -> IpcSim { + // Create one shot channel + let (oneshot_server, oneshot_name) = IpcOneShotServer::new().unwrap(); + // Register it into {ipc_name} file + // Create folder if needed + let path = Path::new(ipc_name); + if let Some(dir_p) = path.parent() { + std::fs::create_dir_all(dir_p).unwrap(); + } + // Open file + let mut wr_f = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(ipc_name) + .unwrap(); + write!(wr_f, "{oneshot_name}").unwrap(); + + tracing::info!("Mockup waiting on IPC `{oneshot_name}`"); + let (_, ipc_sim): (_, IpcSim) = oneshot_server.accept().unwrap(); + + ipc_sim + } +} + +pub(crate) fn ipc_channel() -> (IpcFfi, IpcSim) { + let (register_ffi, register_sim) = register_channel(); + let (memory_ffi, memory_sim) = memory_channel(); + + ( + IpcFfi { + register: register_ffi, + memory: MemoryFfiWrapped::from(memory_ffi), + }, + IpcSim { + register: register_sim, + memory: memory_sim, + }, + ) +} diff --git a/backends/tfhe-hpu-backend/src/ffi/sim/mod.rs b/backends/tfhe-hpu-backend/src/ffi/sim/mod.rs new file mode 100644 index 000000000..d5a979283 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/sim/mod.rs @@ -0,0 +1,266 @@ +//! Implement a simple mockup interface for ffi. +//! It enables to simulate the tfhe-hpu-backend behavior without the real HW +//! +//! 
Simply provide IPC to communicate with an Hpu simulation mockup + +use crate::ffi; + +pub mod ipc; +use ipc::{IpcFfi, MemoryAck, MemoryFfiWrapped, MemoryReq, RegisterAck, RegisterFfi, RegisterReq}; +use ipc_channel::ipc::IpcSharedMemory; + +use super::MemZoneProperties; + +pub struct HpuHw { + ipc: IpcFfi, +} + +impl HpuHw { + /// Handle ffi instantiation + #[inline(always)] + pub fn new_hpu_hw(ipc_name: &str) -> HpuHw { + Self { + ipc: IpcFfi::new_bind_on(ipc_name), + } + } + + /// Handle on-board memory allocation + pub fn alloc(&mut self, props: ffi::MemZoneProperties) -> MemZone { + // Duplicate Memory handle for future memzone and take lock for xfer + let mem_cloned = self.ipc.memory.clone(); + let mem_locked = self.ipc.memory.0.lock().unwrap(); + + // Send request + let cmd = MemoryReq::Allocate { + mem_kind: props.mem_kind, + size_b: props.size_b, + }; + tracing::trace!("Req => {cmd:x?}"); + mem_locked.req.send(cmd).unwrap(); + + // Wait for ack + match mem_locked.ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + MemoryAck::Allocate { addr } => MemZone::new(props, addr, mem_cloned), + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + /// Handle on-board memory de-allocation + pub fn release(&mut self, zone: &mut MemZone) { + // Take memory handle lock for xfer + let mem_locked = self.ipc.memory.0.lock().unwrap(); + + // Send request + let cmd = MemoryReq::Release { + mem_kind: zone.mem_kind, + addr: zone.addr, + }; + tracing::trace!("Req => {cmd:x?}"); + mem_locked.req.send(cmd).unwrap(); + + // Wait for ack + match mem_locked.ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + MemoryAck::Release => {} + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + + /// Handle register read + pub fn read_reg(&self, addr: u64) -> u32 { + let (req, ack) = { + let IpcFfi { register, .. 
} = &self.ipc; + let RegisterFfi { req, ack } = register; + (req, ack) + }; + // Send request + let cmd = RegisterReq::Read { addr }; + tracing::trace!("Req => {cmd:x?}"); + req.send(cmd).unwrap(); + + // Wait for ack + + match ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + RegisterAck::Read(val) => val, + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + + pub fn write_reg(&mut self, addr: u64, value: u32) { + let (req, ack) = { + let IpcFfi { register, .. } = &self.ipc; + let RegisterFfi { req, ack } = register; + (req, ack) + }; + + // Send request + let cmd = RegisterReq::Write { addr, value }; + tracing::trace!("Req => {cmd:x?}"); + req.send(cmd).unwrap(); + + // Wait for ack + match ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + RegisterAck::Write => {} + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + + pub fn get_pbs_parameters(&mut self) -> crate::entities::HpuPBSParameters { + let (req, ack) = { + let IpcFfi { register, .. 
} = &self.ipc; + let RegisterFfi { req, ack } = register; + (req, ack) + }; + + // Send request + let cmd = RegisterReq::PbsParams; + tracing::trace!("Req => {cmd:x?}"); + req.send(cmd).unwrap(); + + // Wait for ack + match ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + RegisterAck::PbsParams(params) => params, + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } +} + +pub struct MemZone { + // Link properties + mem_kind: ffi::MemKind, + addr: u64, + ipc: MemoryFfiWrapped, + + // Host version of the memory + data: Vec, +} + +impl MemZone { + pub fn new(props: MemZoneProperties, addr: u64, ipc: MemoryFfiWrapped) -> Self { + Self { + mem_kind: props.mem_kind, + addr, + ipc, + data: vec![0; props.size_b], + } + } + pub fn read_bytes(&self, ofst: usize, bytes: &mut [u8]) { + let (start, end) = (ofst, ofst + bytes.len()); + bytes.copy_from_slice(&self.data[start..end]) + } + + pub fn paddr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> usize { + self.data.len() + } + + pub fn write_bytes(&mut self, ofst: usize, bytes: &[u8]) { + let (start, end) = (ofst, ofst + bytes.len()); + self.data.as_mut_slice()[start..end].copy_from_slice(bytes) + } + + pub fn mmap(&mut self) -> &mut [u64] { + todo!() + } + + pub fn sync(&mut self, mode: ffi::SyncMode) { + let Self { + mem_kind, + addr, + ipc, + data, + } = self; + + match mode { + ffi::SyncMode::Host2Device => { + // Wrap bytes in Shm and send request + let hw_data = IpcSharedMemory::from_bytes(data.as_slice()); + + // Take ipc lock and do req/ack sequence + let ipc_lock = ipc.0.lock().unwrap(); + + let req = MemoryReq::Sync { + mem_kind: *mem_kind, + addr: *addr, + mode, + data: Some(hw_data), + }; + tracing::trace!("Req => {req:x?}"); + ipc_lock.req.send(req).unwrap(); + + // Wait for ack + match ipc_lock.ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + MemoryAck::Sync { data } => { + 
assert!(data.is_none(), "Received data on Host2Device sync") + } + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + ffi::SyncMode::Device2Host => { + // Take ipc lock and do req/ack sequence + let ipc_lock = ipc.0.lock().unwrap(); + + let req = MemoryReq::Sync { + mem_kind: *mem_kind, + addr: *addr, + mode, + data: None, + }; + tracing::trace!("Req => {req:x?}"); + ipc_lock.req.send(req).unwrap(); + + // Wait for ack + match ipc_lock.ack.recv() { + Ok(ack) => { + tracing::trace!("Ack => {ack:x?}"); + match ack { + MemoryAck::Sync { data } => { + let hw_data = data.expect("No data received on Device2Host sync"); + self.data.copy_from_slice(&hw_data); + } + _ => panic!("Ack mismatch with sent request"), + } + } + Err(err) => panic!("Ipc recv {err:?}"), + } + } + } + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/v80/ami.rs b/backends/tfhe-hpu-backend/src/ffi/v80/ami.rs new file mode 100644 index 000000000..528da569c --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/v80/ami.rs @@ -0,0 +1,328 @@ +//! Abstraction over the AMI driver +//! +//! AMI driver is used to issue gcq command to the RPU +//! 
Those command are used for configuration and register R/W +use lazy_static::lazy_static; +use std::fs::{File, OpenOptions}; +use std::io::Read; +use std::os::fd::AsRawFd; +use std::time::Duration; + +const AMI_VERSION_FILE: &str = "/sys/module/ami/version"; +const AMI_VERSION_PATTERN: &str = r"3\.0\.\d+-zama"; + +const AMI_ID_FILE: &str = "/sys/bus/pci/drivers/ami/devices"; +const AMI_ID_PATTERN: &str = r"(?\d{2}:\d{2}\.\d)\s(?\d+)\s\d+"; + +const HIS_VERSION_FILE: &str = "/sys/bus/pci/devices/0000:${V80_PCIE_DEV}:00.0/amc_version"; +const HIS_VERSION_PATTERN: &str = r".*- zama ucore 2.0"; + +pub struct AmiDriver { + ami_dev: File, + retry_rate: Duration, +} + +impl AmiDriver { + pub fn new(ami_id: usize, retry_rate: Duration) -> Self { + Self::check_version(); + + // Read ami_id_file to get ami device + let ami_path = { + // Extract AMI device path + lazy_static! { + static ref AMI_ID_RE: regex::Regex = + regex::Regex::new(AMI_ID_PATTERN).expect("Invalid regex"); + }; + + // Read ami string-id + let ami_id_f = std::fs::read_to_string(AMI_ID_FILE).expect("Invalid ami_id filepath"); + let id_line = ami_id_f + .lines() + .nth(ami_id) + .unwrap_or_else(|| panic!("Invalid ami id {ami_id}.")); + + let id_str = AMI_ID_RE + .captures(id_line) + .expect("Invalid AMI_ID_FILE content") + .name("dev_id") + .unwrap(); + let dev_id = + usize::from_str_radix(id_str.as_str(), 10).expect("Invalid AMI_DEV_ID encoding"); + format!("/dev/ami{dev_id}") + }; + + // Open ami device file + let ami_dev = OpenOptions::new() + .read(true) + .write(true) + .create(false) + .open(&ami_path) + .unwrap(); + Self { + ami_dev, + retry_rate, + } + } + + /// Check if current ami version is compliant + /// + /// For this purpose we use a regex. + /// it's easy to expressed and understand breaking rules with it + pub fn check_version() { + // Check AMI version + lazy_static! 
{ + static ref AMI_VERSION_RE: regex::Regex = + regex::Regex::new(AMI_VERSION_PATTERN).expect("Invalid regex"); + }; + + // Read ami string-version + let mut ami_ver_f = OpenOptions::new() + .read(true) + .write(false) + .create(false) + .open(AMI_VERSION_FILE) + .unwrap(); + + let ami_version = { + let mut ver = String::new(); + ami_ver_f + .read_to_string(&mut ver) + .expect("Invalid AMI_VERSION string format"); + + ver + }; + + if !AMI_VERSION_RE.is_match(&ami_version) { + panic!( + "Invalid ami version. Get {} expect something matching pattern {}", + ami_version, AMI_VERSION_PATTERN + ) + } + + // Check HIS version + // Known through amc version retrieved by ami driver + lazy_static! { + static ref HIS_VERSION_RE: regex::Regex = + regex::Regex::new(HIS_VERSION_PATTERN).expect("Invalid regex"); + }; + + // Read ami string-version + // NB: Rely on shell interpretation to get PCI device + let his_version_file = crate::prelude::ShellString::new(HIS_VERSION_FILE.to_string()); + let mut his_ver_f = OpenOptions::new() + .read(true) + .write(false) + .create(false) + .open(his_version_file.expand()) + .unwrap(); + + let his_version = { + let mut ver = String::new(); + his_ver_f + .read_to_string(&mut ver) + .expect("Invalid HIS_VERSION string format"); + + ver + }; + + if !HIS_VERSION_RE.is_match(&his_version) { + panic!( + "Invalid his version. 
Get {} expect something matching pattern {}", + his_version, HIS_VERSION_PATTERN + ) + } + } + + /// Issue read register request through AMI driver + pub fn read_reg(&self, addr: u64) -> u32 { + let ami_fd = self.ami_dev.as_raw_fd(); + + // Allocate heap memory for read value + let data = Box::::new(0xdeadc0de); + let data_ptr = Box::into_raw(data); + + // Populate payload + let payload = AmiPeakPokePayload { + data_ptr, + len: 0x1, + offset: addr as u32, + }; + + tracing::trace!("AMI: Read request with following payload {payload:x?}"); + loop { + let ret = unsafe { ami_peak(ami_fd, &payload) }; + match ret { + Err(err) => { + tracing::debug!("AMI: Read failed -> {err:?}"); + std::thread::sleep(self.retry_rate); + } + Ok(val) => { + tracing::trace!("AMI: Read ack received {payload:x?} -> {val:?}"); + break; + } + } + } + unsafe { *Box::from_raw(data_ptr) } + } + + pub fn write_reg(&self, addr: u64, value: u32) { + let ami_fd = self.ami_dev.as_raw_fd(); + + // Allocate heap memory for read value + let data = Box::::new(value); + let data_ptr = Box::into_raw(data); + + // Populate payload + let payload = AmiPeakPokePayload { + data_ptr, + len: 0x1, + offset: addr as u32, + }; + + tracing::trace!("AMI: Write request with following payload {payload:x?}"); + loop { + let ret = unsafe { ami_poke(ami_fd, &payload) }; + match ret { + Err(err) => { + tracing::debug!("AMI: Write failed -> {err:?}"); + std::thread::sleep(self.retry_rate); + } + Ok(val) => { + tracing::trace!("AMI: Write ack received {payload:x?} -> {val:?}"); + break; + } + } + } + } + + /// Push a stream of DOp in the ISC + /// This call bypass the IOp->DOp translation in the ucore + /// NB: There is no automatic SYNC insertion + #[allow(unused)] + pub fn dop_push(&self, stream: &[u32]) { + let ami_fd = self.ami_dev.as_raw_fd(); + + // Allocate heap memory for dop stream + let mut data = Vec::from(stream); + let len = data.len() as u32; + let data_ptr = data.as_mut_ptr(); + + // Populate payload + let mut 
payload = AmiIOpPayload { + data_ptr, + len, + offset: 0x00, // Unused for iop_push + mode: true, // Push a stream of DOp + }; + + tracing::trace!("AMI: DOpPush request with following payload {payload:x?}"); + loop { + let ret = unsafe { ami_iop_push(ami_fd, &payload) }; + match ret { + Err(err) => { + tracing::debug!("AMI: DOpPush failed -> {err:?}"); + std::thread::sleep(self.retry_rate); + } + Ok(val) => { + tracing::trace!("AMI: DOpPush ack received {payload:x?} -> {val:?}"); + break; + } + } + } + } + + /// Push IOp to ucore + /// Ucore is in charge of translation to stream of DOp and forward them to ISC + #[allow(unused)] + pub fn iop_push(&self, stream: &[u32]) { + let ami_fd = self.ami_dev.as_raw_fd(); + + // Allocate heap memory for stream + let mut data = Vec::from(stream); + let len = data.len() as u32; + let data_ptr = data.as_mut_ptr(); + + // Populate payload + let mut payload = AmiIOpPayload { + data_ptr, + len, + offset: 0x00, // Unused for iop_push + mode: false, // Push a stream of IOp + }; + + tracing::trace!("AMI: IOpPush request with following payload {payload:x?}"); + loop { + let ret = unsafe { ami_iop_push(ami_fd, &payload) }; + match ret { + Err(err) => { + tracing::debug!("AMI: IOpPush failed -> {err:?}"); + std::thread::sleep(self.retry_rate); + } + Ok(val) => { + tracing::trace!("AMI: IOpPush ack received {payload:x?} -> {val:?}"); + break; + } + } + } + } + + // TODO ugly quick patch + // Clean this when driver interface is specified + pub fn iop_ackq_rd(&self) -> u32 { + let mut iop_ack_f = OpenOptions::new() + .read(true) + .write(true) + .create(false) + .open("/proc/ami_iop_ack") + .unwrap(); + + // Read a line and extract a 32b integer + let mut ack_str = String::new(); + iop_ack_f.read_to_string(&mut ack_str).unwrap(); + if ack_str.is_empty() { + 0 + } else { + let ack_nb = ack_str.as_str().trim_ascii().parse::().unwrap(); + tracing::trace!("Get value {ack_str} from proc/ami_iop_ack => {ack_nb}",); + ack_nb + } + } +} + +// Define 
driver IOCTL command and associated payload ------------------------- +const AMI_IOC_MAGIC: u8 = b'a'; + +// Peak/Poke command used for Read/Write in registers ------------------------- +const AMI_PEAK_CMD: u8 = 15; +const AMI_POKE_CMD: u8 = 16; + +/// Payload used for register read/write +#[derive(Debug)] +#[repr(C)] +struct AmiPeakPokePayload { + data_ptr: *mut u32, + len: u32, + offset: u32, +} + +nix::ioctl_write_ptr!(ami_peak, AMI_IOC_MAGIC, AMI_PEAK_CMD, AmiPeakPokePayload); +nix::ioctl_write_ptr!(ami_poke, AMI_IOC_MAGIC, AMI_POKE_CMD, AmiPeakPokePayload); + +// IOpPush/IOpRead command used for issuing work to HPU ------------------------ +const AMI_IOPPUSH_CMD: u8 = 17; +// const AMI_IOPREAD_CMD: u8 = 18; + +/// Payload used for IOp push and read back +#[derive(Debug)] +#[repr(C)] +struct AmiIOpPayload { + data_ptr: *mut u32, + len: u32, + offset: u32, + mode: bool, // false -> IOp, true -> DOp +} + +nix::ioctl_write_ptr!(ami_iop_push, AMI_IOC_MAGIC, AMI_IOPPUSH_CMD, AmiIOpPayload); +// nix::ioctl_write_ptr!(ami_iop_read, AMI_IOC_MAGIC, AMI_IOPREAD_CMD, AmiIOpPayload); + +// ---------------------------------------------------------------------------- diff --git a/backends/tfhe-hpu-backend/src/ffi/v80/mem_alloc.rs b/backends/tfhe-hpu-backend/src/ffi/v80/mem_alloc.rs new file mode 100644 index 000000000..31ba5d1ff --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/v80/mem_alloc.rs @@ -0,0 +1,136 @@ +//! Implement a fake memory allocator for help bridging with the ffi API +//! +//! There is no memory allocator with the Qdma. Indeed, the all board memory could be +//! accessed through the driver. +//! However, in order to be mapped on the ffi API with fake an allocator. For this purpose +//! each HBM pc is view as a list of 16MiB chunks and register in a store +//! NB: 16MiB is selected as upper xfer bound to match the previous XRT limitations +//! Indeed, all the required logic is present in the backend driver to view any memspace as an +//! 
aggregation of 16MiB slices + +use crate::entities::{hpu_big_lwe_ciphertext_size, HpuParameters}; +use crate::ffi; +use crate::interface::{page_align, HpuConfig}; + +use std::collections::VecDeque; + +// Some V80 constants +// Chunk_size inherited from XRT limitation +// NB: In Xilinx v80 implementation the HBM PC are not directly accessible. +// Indeed, there is an extra level of abstraction called port: +// Each HBM has 2 PC, and each PC has 2 Port. +// To keep thing simple this is hided from the SW, thus instead of viewing the board memory as: +// * 2HBM with 8Bank each and 2PC per bank -> 32 memory +// It's seen as: +// * 2HBM with 8Bank each and 4PC per bank -> 64PC +const MEM_BANK_NB: usize = 64; +const MEM_BANK_SIZE_MB: usize = 512; +const MEM_CHUNK_SIZE_B: usize = 16 * 1024 * 1024; +const MEM_BASE_ADDR: u64 = 0x40_0000_0000; + +#[derive(Debug, PartialOrd, PartialEq, Ord, Eq)] +pub struct MemChunk { + pub(super) paddr: u64, + pub(super) size_b: usize, +} + +pub struct MemAlloc([VecDeque; MEM_BANK_NB]); + +impl MemAlloc { + pub fn new(config: &HpuConfig, params: &HpuParameters) -> Self { + // Extract Hbm pc used by ciphertext if any + // For those bank, we use a different chunk size to match the ciphertext size + // Also compute the chunk size that match with tfhe parameters + let ct_pc = config + .board + .ct_pc + .iter() + .filter_map(|kind| match kind { + ffi::MemKind::Ddr { .. 
} => None, + ffi::MemKind::Hbm { pc } => Some(*pc as u64), + }) + .collect::>(); + + let ct_chunk_b = page_align( + hpu_big_lwe_ciphertext_size(params).div_ceil(params.pc_params.pem_pc) + * std::mem::size_of::(), + ); + + let banks = (0..MEM_BANK_NB as u64) + .map(|bank| { + let bank_base_addr = + MEM_BASE_ADDR + bank * (MEM_BANK_SIZE_MB * (1024 * 1024)) as u64; + if ct_pc.contains(&bank) { + // Allocation in this bank use small chunk that match ct cut_size + let bank_cut = (MEM_BANK_SIZE_MB * 1024 * 1024) / ct_chunk_b; + (0..bank_cut) + .map(|cut| MemChunk { + paddr: bank_base_addr + (cut * ct_chunk_b) as u64, + size_b: ct_chunk_b, + }) + .collect::>() + } else { + let bank_cut = (MEM_BANK_SIZE_MB * 1024 * 1024) / MEM_CHUNK_SIZE_B; + (0..bank_cut) + .map(|cut| MemChunk { + paddr: bank_base_addr + (cut * MEM_CHUNK_SIZE_B) as u64, + size_b: MEM_CHUNK_SIZE_B, + }) + .collect::>() + } + }) + .collect::>(); + + Self( + banks + .try_into() + .expect("Invalid banks slice size. Check parameters"), + ) + } + + // FIXME Fact that chunk are contiguous must be mandatory and not only likely to happen + pub fn alloc(&mut self, props: &ffi::MemZoneProperties) -> Vec { + match props.mem_kind { + ffi::MemKind::Ddr { offset } => { + tracing::warn!( + "DDR allocation isn't handled by FFI. User directly handled offset and range" + ); + // TODO Add guard to prevent bad argument from user + vec![MemChunk { + paddr: offset as u64, + size_b: props.size_b, + }] + } + ffi::MemKind::Hbm { pc } => { + let bank = &mut self.0[pc]; + // Compute required number of chunk + let chunk_nb = props.size_b.div_ceil(MEM_CHUNK_SIZE_B); + assert!( + bank.len() >= chunk_nb, + "Not enough memory in selected Hbm bank {pc} [req: {:?}]", + props + ); + bank.drain(0..chunk_nb).collect::>() + } + } + } + + pub fn release(&mut self, kind: &ffi::MemKind, chunks: &mut Vec) { + match kind { + ffi::MemKind::Ddr { .. 
} => { + // TODO properly handle it when DDR management is integrated in the FFI + } + ffi::MemKind::Hbm { pc } => { + // Insert chunk back in the correct bank + let bank = &mut self.0[*pc]; + while let Some(chunk) = chunks.pop() { + bank.push_back(chunk) + } + + // Sort chunk to maximize chance to obtain contiguous MemChunk + bank.make_contiguous(); + bank.as_mut_slices().0.sort(); + } + } + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/v80/mod.rs b/backends/tfhe-hpu-backend/src/ffi/v80/mod.rs new file mode 100644 index 000000000..ac1bfc1b6 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/v80/mod.rs @@ -0,0 +1,115 @@ +//! Implement V80 driver abstraction +//! +//! V80 rely on 2 driver for communication +//! * Register access/Rpu interaction -> AMI +//! * Data xfer -> QDMA + +use crate::ffi; + +use std::sync::{Arc, Mutex}; + +mod ami; +use ami::AmiDriver; + +mod mem_alloc; +use mem_alloc::{MemAlloc, MemChunk}; + +mod qdma; +use qdma::QdmaDriver; + +pub struct HpuHw { + pub(super) ami: AmiDriver, + pub(super) qdma: Arc>, + allocator: Option, +} + +impl HpuHw { + /// Handle ffi instantiation + #[inline(always)] + pub fn new_hpu_hw( + ami_id: usize, + ami_retry: std::time::Duration, + h2c_path: &str, + c2h_path: &str, + ) -> HpuHw { + Self { + ami: AmiDriver::new(ami_id, ami_retry), + qdma: Arc::new(Mutex::new(QdmaDriver::new(h2c_path, c2h_path))), + allocator: None, + } + } + + pub fn init_mem( + &mut self, + config: &crate::interface::HpuConfig, + params: &crate::entities::HpuParameters, + ) { + assert!( + self.allocator.is_none(), + "Error: Double request of HpuHw memory initialisation" + ); + self.allocator = Some(MemAlloc::new(config, params)); + } + + /// Handle on-board memory allocation + pub fn alloc(&mut self, props: ffi::MemZoneProperties) -> MemZone { + let chunks = self + .allocator + .as_mut() + .expect("Error: V80 backend memory must be explicitly init (c.f. 
init_mem)") + .alloc(&props); + MemZone::new(props.mem_kind, chunks[0].paddr, chunks, self.qdma.clone()) + } + /// Handle on-board memory de-allocation + pub fn release(&mut self, zone: &mut MemZone) { + let MemZone { kind, chunks, .. } = zone; + self.allocator + .as_mut() + .expect("Error: V80 backend memory must be explicitly init (c.f. init_mem)") + .release(kind, chunks) + } +} + +pub struct MemZone { + // Link properties + kind: ffi::MemKind, + addr: u64, + chunks: Vec, + + // Ref to Qdma driver + qdma: Arc>, +} + +impl MemZone { + pub fn new( + kind: ffi::MemKind, + addr: u64, + chunks: Vec, + qdma: Arc>, + ) -> Self { + Self { + kind, + addr, + chunks, + qdma, + } + } + + pub fn read_bytes(&self, ofst: usize, bytes: &mut [u8]) { + let qdma = self.qdma.lock().unwrap(); + qdma.read_bytes(ofst + self.addr as usize, bytes) + } + + pub fn paddr(&self) -> u64 { + self.addr + } + + pub fn size(&self) -> usize { + self.chunks.iter().map(|chunk| chunk.size_b).sum() + } + + pub fn write_bytes(&mut self, ofst: usize, bytes: &[u8]) { + let qdma = self.qdma.lock().unwrap(); + qdma.write_bytes(ofst + self.addr as usize, bytes) + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/v80/qdma.rs b/backends/tfhe-hpu-backend/src/ffi/v80/qdma.rs new file mode 100644 index 000000000..ac16c6d6c --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/v80/qdma.rs @@ -0,0 +1,106 @@ +//! Abstraction over the QDMA driver +//! +//! QDMA driver is used for memory xfer in both direction: +//! * H2C: _Host to Card_ +//! * C2H: _Card to Host_ +//! +//! NB: Currently configuration of QDMA isn't handled. Thus the QDMA queue must be correctly +//! created and started before backend start +//! ``` bash +//! # Select the correct pcie device and physical function. +//! # In the following code snippets the 21:00.0 is selected +//! +//! #1. Configure the maximum number of Qdma queues: +//! echo 100 > /sys/bus/pci/devices/0000\:21\:00.1/qdma/qmax +//! +//! #2. 
Create and start the host to card queue +//! dma-ctl qdma21001 q add idx 0 mode mm dir h2c +//! dma-ctl qdma21001 q start idx 0 dir h2c +//! +//! #3. Create and start the card to host queue +//! dma-ctl qdma21001 q add idx 1 mode mm dir c2h +//! dma-ctl qdma21001 q start idx 1 dir c2h +//! ``` + +use lazy_static::lazy_static; +use std::fs::{File, OpenOptions}; +use std::io::Read; + +const QDMA_VERSION_FILE: &str = "/sys/module/qdma_pf/version"; +const QDMA_VERSION_PATTERN: &str = r"2024\.1\.0\.\d+-zama"; + +pub(crate) struct QdmaDriver { + qdma_h2c: File, + qdma_c2h: File, +} + +impl QdmaDriver { + pub fn new(h2c_path: &str, c2h_path: &str) -> Self { + Self::check_version().unwrap(); + + // Open HostToCard xfer file + let qdma_h2c = OpenOptions::new() + .read(false) + .write(true) + .create(false) + .open(h2c_path) + .unwrap_or_else(|e| panic!("Invalid qdma_h2c path: {h2c_path} -> {e}. Check queue initialization and configuration.")); + + // Open CardToHost xfer file + let qdma_c2h = OpenOptions::new() + .read(true) + .write(false) + .create(false) + .open(c2h_path) + .unwrap_or_else(|e| panic!("Invalid qdma_c2h path: {c2h_path} -> {e}. Check queue initialization and configuration.")); + + Self { qdma_h2c, qdma_c2h } + } + + /// Check if current qdma version is compliant + /// + /// For this purpose we use a regex. + /// it's easy to expressed and understand breaking rules with it + pub fn check_version() -> Result<(), String> { + lazy_static! 
{ + static ref QDMA_VERSION_RE: regex::Regex = + regex::Regex::new(QDMA_VERSION_PATTERN).expect("Invalid regex"); + }; + + // Read ami string-version + let mut qdma_ver_f = OpenOptions::new() + .read(true) + .write(false) + .create(false) + .open(QDMA_VERSION_FILE) + .unwrap(); + + let qdma_version = { + let mut ver = String::new(); + qdma_ver_f + .read_to_string(&mut ver) + .expect("Invalid QDMA_VERSION string format"); + + ver + }; + + if QDMA_VERSION_RE.is_match(&qdma_version) { + Ok(()) + } else { + Err(format!( + "Invalid qdma version. Get {} expect something matching pattern {}", + qdma_version, QDMA_VERSION_PATTERN + )) + } + } + + pub fn write_bytes(&self, addr: usize, bytes: &[u8]) { + let ret = nix::sys::uio::pwrite(&self.qdma_h2c, bytes, addr as i64).unwrap(); + tracing::trace!("QDMA written {ret} bytes to device"); + } + + pub fn read_bytes(&self, addr: usize, bytes: &mut [u8]) { + let ret = nix::sys::uio::pread(&self.qdma_c2h, bytes, addr as i64).unwrap(); + tracing::trace!("QDMA red {ret} bytes from device"); + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.cc b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.cc new file mode 100644 index 000000000..129841279 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.cc @@ -0,0 +1,104 @@ +#include "tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.h" +#include "tfhe-hpu-backend/src/ffi/xrt/mod.rs.h" + +#include + +namespace ffi { + + HpuHw::HpuHw(uint32_t fpga_id, rust::String _kernel_name, rust::String _xclbin_name, VerbosityCxx verbose) + : fpga_id{fpga_id}, kernel_name{std::string(_kernel_name)}, xclbin_name{std::string(_xclbin_name)}, verbose(verbose) + { + pr_info(verbose, "Create HwHpu Cxx type\n") + pr_info(verbose, "Open the fpga: " << fpga_id << "\n") + fpga = xrt::device(fpga_id); + pr_info(verbose, "Load the xclbin " << xclbin_name << "\n"); + auto uuid = fpga.load_xclbin(xclbin_name); + auto xclbin = xrt::xclbin(xclbin_name); + + // Get CU endpoints 
-------------------------------------------------------- + pr_info(verbose, "Fetch Compute Units endpoints " << kernel_name << "\n") + + ip = xrt::ip(fpga, uuid, kernel_name.c_str()); + std::vector cu; + + for (auto& kernel : xclbin.get_kernels()) { + if (kernel.get_name() == kernel_name.c_str()) { + cu = kernel.get_cus(); + break; + } + } + if (cu.empty()) + throw std::runtime_error("IP not found in the provided xclbin"); + + // Display kernel and memory information ----------------------------------- + if (verbose >= VerbosityCxx::Trace) { + std::cout << kernel_name << " CU properties: " << std::endl; + for (auto& cu_i: cu) { + std::cout << "instance name: " << cu_i.get_name() << "\n"; + std::cout << "base address: 0x" << std::hex << cu_i.get_base_address() << std::dec << "\n"; + for (const auto& arg : cu_i.get_args()) { + std::cout << " argument: " << arg.get_name() << "\n"; + std::cout << " hosttype: " << arg.get_host_type() << "\n"; + std::cout << " port: " << arg.get_port() << "\n"; + std::cout << " size (bytes): 0x" << std::hex << arg.get_size() << std::dec << "\n"; + std::cout << " offset: 0x" << std::hex << arg.get_offset() << std::dec << "\n"; + for (const auto& mem : arg.get_mems()) { + std::cout << "mem tag: " << mem.get_tag() << "\n"; + std::cout << "mem index: " << mem.get_index() << "\n"; + std::cout << "mem size (kb): 0x" << std::hex << mem.get_size_kb() << std::dec << "\n"; + std::cout << "mem base addr: 0x" << std::hex << mem.get_base_address() << std::dec << "\n"; + } + } + std::cout << std::endl; + } + } + + if (verbose >= VerbosityCxx::Debug) { + std::cout << "Display memory layout:\n"; + for (auto& mem : xclbin.get_mems()) { + std::cout << "mem tag: " << mem.get_tag() << "\n"; + std::cout << "mem used: " << (mem.get_used() ? 
"true" : "false") << "\n"; + std::cout << "mem index: " << mem.get_index() << "\n"; + std::cout << "mem size (kb): 0x" << std::hex << mem.get_size_kb() << std::dec << "\n"; + std::cout << "mem base addr: 0x" << std::hex << mem.get_base_address() << std::dec << "\n"; + } + } + } + + HpuHw::~HpuHw() + { + pr_info(verbose, "Delete HwHpu Cxx type\n") + } + + // Access regmap content + uint32_t HpuHw::read_reg(uint64_t addr) const + { + auto reg_val = ip.read_register(addr); + pr_trace(verbose, "read_reg:: @0x" < 0x" << reg_val <<"\n") + return reg_val; + } + + void HpuHw::write_reg(uint64_t addr, uint32_t value) + { + pr_trace(verbose, "write_reg:: @0x" < 0x" << value <<"\n") + return ip.write_register(addr, value); + } + + // Handle onboard memory + std::unique_ptr HpuHw::alloc(MemZonePropertiesCxx props){ + // NB: Currently XRT buffer are limited to 16MiB. + // if bigger buffer are required, user must split them in chunks and check that allocated + // chunk remains contiguous in memory (cf paddr) + assert((props.size_b <= MEM_CHUNK_SIZE_B) && "MemZone couldn't be bigger than 16MiB."); + + auto bo = new xrt::bo(fpga, props.size_b, props.hbm_pc); + return std::make_unique(props.size_b, props.hbm_pc, bo); + } + + std::unique_ptr + new_hpu_hw(uint32_t fpga_id, rust::String kernel_name, rust::String awsxclbin, + VerbosityCxx verbose) + { + return std::make_unique(fpga_id, kernel_name, awsxclbin, verbose); + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.h b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.h new file mode 100644 index 000000000..ad69b4f03 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.h @@ -0,0 +1,80 @@ +#pragma once +#include "rust/cxx.h" + +#include +#include +#include +#include +#include +#include + + +// XRT includes +#include "experimental/xrt_bo.h" +#include "experimental/xrt_ip.h" +#include "experimental/xrt_device.h" +#include "experimental/xrt_xclbin.h" + +#include 
"tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h" + +// Utilities macro to handle verbosity +#define pr_trace(verbose, stmts)\ + if (verbose >= VerbosityCxx::Trace) { \ + std::cout << "cxx::Trace: " << stmts;\ + } +#define pr_debug(verbose, stmts)\ + if (verbose >= VerbosityCxx::Debug) { \ + std::cout << "cxx::Debug: " << stmts;\ + } +#define pr_info(verbose, stmts)\ + if (verbose >= VerbosityCxx::Info) { \ + std::cout << "cxx::Info: " << stmts;\ + } +#define pr_warn(verbose, stmts)\ + if (verbose >= VerbosityCxx::Warning) { \ + std::cout << "cxx::Warning: " << stmts;\ + } +#define pr_err(verbose, stmts)\ + if (verbose >= VerbosityCxx::Error) { \ + std::cerr << "cxx::Error: " << stmts;\ + } + +// Some Arch related defined +#define MEM_BANK_SIZE_MiB 512 +#define MEM_CHUNK_SIZE_B ((size_t)(16*1024*1024)) + +namespace ffi { + // Forward definition: Concrete implementation is made by Cxx + enum class SyncModeCxx: uint8_t; + enum class VerbosityCxx: uint8_t; + class MemZonePropertiesCxx; + + class HpuHw { + public: + HpuHw(uint32_t fpga_id, rust::String kernel_name, rust::String xclbin_name, VerbosityCxx verbose); + ~HpuHw(); + + private: + const uint32_t fpga_id; + const std::string kernel_name; + const std::string xclbin_name; + const VerbosityCxx verbose; + + // XRT objects + xrt::device fpga; + xrt::ip ip; + + public: // API exposed to Rust + // Access regmap content + uint32_t read_reg(uint64_t addr) const; + void write_reg(uint64_t addr, uint32_t value); + // Handle onboard memory + std::unique_ptr alloc(MemZonePropertiesCxx props); + + }; + + // Utility function to properly instantiate Cxx class in rust world + std::unique_ptr + new_hpu_hw(uint32_t fpga_id, rust::String kernel_name, rust::String awsxclbin, + VerbosityCxx verbose); +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.cc b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.cc new file mode 100644 index 000000000..cfefa08ec --- /dev/null +++ 
b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.cc @@ -0,0 +1,59 @@ +#include "tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h" +#include "tfhe-hpu-backend/src/ffi/xrt/mod.rs.h" + +#include + +namespace ffi { + + MemZone::MemZone(size_t size_b, size_t hbm_pc, xrt::bo* bo) + : size_b(size_b), hbm_pc(hbm_pc), bo(bo), map{} + {} + + MemZone::~MemZone(){ + delete bo; + } + + uint64_t MemZone::paddr() const { + return bo->address(); + + } + size_t MemZone::size() const { + return bo->size(); + } + size_t MemZone::pc() const { + return hbm_pc; + } + + void MemZone::read_bytes(size_t ofst, rust::Slice bytes) const { + bo->read(bytes.data(), bytes.size()*sizeof(uint8_t), ofst); + } + + void MemZone::write_bytes(size_t ofst, rust::Slice bytes){ + bo->write(bytes.data(), bytes.size()*sizeof(uint8_t), ofst); + } + + void MemZone::sync(SyncModeCxx mode){ + switch (mode) { + case SyncModeCxx::Host2Device: + bo->sync(XCL_BO_SYNC_BO_TO_DEVICE); + break; + case SyncModeCxx::Device2Host: + bo->sync(XCL_BO_SYNC_BO_FROM_DEVICE); + break; + } + return; + } + + rust::Slice MemZone::mmap(){ + if (!map.has_value()) { + map = bo->map(); + } + return rust::Slice{map.value(), size_b/sizeof(uint64_t)}; + } + + void MemZone::unmap(){ + if (map.has_value()) { + delete map.value(); + } + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h new file mode 100644 index 000000000..611e3a504 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h @@ -0,0 +1,50 @@ +#pragma once +#include "rust/cxx.h" + +#include +#include +#include +#include +#include +#include +#include + + + +// XRT includes +#include "experimental/xrt_bo.h" +#include "experimental/xrt_ip.h" +#include "experimental/xrt_device.h" +#include "experimental/xrt_xclbin.h" + +// Some Arch related defined +#define MEM_BANK_SIZE_MiB 512 +#define MEM_CHUNK_SIZE ((size_t)(16*1024*1024)) + +namespace ffi { + // Forward definition: Concrete implementation 
is made by Cxx + enum class SyncModeCxx: uint8_t; + + class MemZone { + public: + MemZone(size_t size_b, size_t hbm_pc, xrt::bo* bo); + ~MemZone(); + + private: + const size_t size_b; + const size_t hbm_pc; + xrt::bo* bo; + std::optional map; + + public: // API exposed to Rust + uint64_t paddr() const; + uint64_t size() const ; + uint64_t pc() const ; + void read_bytes(size_t ofst, rust::Slice bytes) const; + void write_bytes(size_t ofst, rust::Slice bytes); + void sync(SyncModeCxx mode); + rust::Slice mmap(); + void unmap(); + }; + +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mod.rs b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mod.rs new file mode 100644 index 000000000..04efcbb24 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/xrt/cxx/mod.rs @@ -0,0 +1,79 @@ +use super::*; + +#[cxx::bridge(namespace=ffi)] +mod extern_cxx { + /// Enumeration to define the synchronisation of data between Host and Device + #[derive(Debug, Clone)] + enum SyncMode { + Host2Device, + Device2Host, + } + + /// Enumeration to define the verbosity in Cxx bridge + #[derive(Debug, Clone)] + #[repr(u8)] + enum Verbosity { + Error = 0, + Warning, + Info, + Debug, + Trace, + } + + unsafe extern "C++" { + include!("hpu_hw.h"); + + type HpuHw; + // Access Hw register + fn read_reg(&self, addr: u64) -> u32; + fn write_reg(self: Pin<&mut HpuHw>, addr: u64, value: u32); + + // Handle onbeard memory + fn alloc(self: Pin<&mut HpuHw>, props: MemZoneProperties) -> UniquePtr; + // fn release(self: Pin<&mut HpuHw>, zone: &MemZone); + + fn new_hpu_hw( + fpga_id: u32, + kernel_name: String, + xclbin: String, + verbose: Verbosity, + ) -> UniquePtr; + } + + /// Define memory zone properties + #[derive(Debug, Clone)] + struct MemZoneProperties { + hbm_pc: usize, + size_b: usize, + } + + unsafe extern "C++" { + include!("tfhe-hpu-backend/src/ffi/cxx/mem_zone.h"); + + type MemZone; + + fn read_bytes(&self, ofst: usize, bytes: &mut [u8]); + fn paddr(&self) -> u64; + #[allow(unused)] + fn 
size(&self) -> usize; + fn write_bytes(self: Pin<&mut MemZone>, ofst: usize, bytes: &[u8]); + #[allow(unused)] + fn mmap(self: Pin<&mut MemZone>) -> &mut [u64]; + fn sync(self: Pin<&mut MemZone>, mode: SyncMode); + } +} + +/// Generic function to easily handle multiple word size +impl MemZone { + pub fn read(&self, ofst: usize, data: &mut [T]) { + let data_bytes = bytemuck::cast_slice_mut::(data); + let ofst_bytes = ofst * std::mem::size_of::(); + self.read_bytes(ofst_bytes, data_bytes); + } + + pub fn write(self: Pin<&mut MemZone>, ofst: usize, data: &[T]) { + let data_bytes = bytemuck::cast_slice::(data); + let ofst_bytes = ofst * std::mem::size_of::(); + self.write_bytes(ofst_bytes, data_bytes); + } +} diff --git a/backends/tfhe-hpu-backend/src/ffi/xrt/mod.rs b/backends/tfhe-hpu-backend/src/ffi/xrt/mod.rs new file mode 100644 index 000000000..f3e140bfa --- /dev/null +++ b/backends/tfhe-hpu-backend/src/ffi/xrt/mod.rs @@ -0,0 +1,96 @@ +use super::*; + +// Exposed types +pub use extern_cxx::{new_hpu_hw, HpuHw, MemZone, MemZonePropertiesCxx, SyncModeCxx, VerbosityCxx}; + +#[cxx::bridge(namespace=ffi)] +mod extern_cxx { + /// Enumeration to define the synchronisation of data between Host and Device + #[derive(Debug, Clone)] + enum SyncModeCxx { + Host2Device, + Device2Host, + } + + /// Enumeration to define the verbosity in Cxx bridge + #[derive(Debug, Clone)] + #[repr(u8)] + enum VerbosityCxx { + Error = 0, + Warning, + Info, + Debug, + Trace, + } + + /// Define memory zone properties + #[derive(Debug, Clone)] + struct MemZonePropertiesCxx { + hbm_pc: usize, + size_b: usize, + } + + unsafe extern "C++" { + include!("tfhe-hpu-backend/src/ffi/xrt/cxx/hpu_hw.h"); + + // Use Opaque Cxx type + type HpuHw; + + // Access Hw register + fn read_reg(self: &HpuHw, addr: u64) -> u32; + fn write_reg(self: Pin<&mut HpuHw>, addr: u64, value: u32); + + // Handle onbeard memory + fn alloc(self: Pin<&mut HpuHw>, props: MemZonePropertiesCxx) -> UniquePtr; + // fn release(self: 
Pin<&mut HpuHw>, zone: &MemZone); + + fn new_hpu_hw( + fpga_id: u32, + kernel_name: String, + xclbin: String, + verbose: VerbosityCxx, + ) -> UniquePtr; + } + + unsafe extern "C++" { + include!("tfhe-hpu-backend/src/ffi/xrt/cxx/mem_zone.h"); + + // Use Opaque Cxx type + type MemZone; + + fn read_bytes(&self, ofst: usize, bytes: &mut [u8]); + fn paddr(&self) -> u64; + #[allow(unused)] + fn size(&self) -> usize; + fn write_bytes(self: Pin<&mut MemZone>, ofst: usize, bytes: &[u8]); + #[allow(unused)] + fn mmap(self: Pin<&mut MemZone>) -> &mut [u64]; + fn sync(self: Pin<&mut MemZone>, mode: SyncModeCxx); + } +} + +/// Provide conversion between global SyncMode and Cxx version +impl From for SyncModeCxx { + fn from(value: SyncMode) -> Self { + match value { + SyncMode::Host2Device => Self::Host2Device, + SyncMode::Device2Host => Self::Device2Host, + } + } +} + +/// Provide conversion between global MemZoneProperties and Cxx version +impl From for MemZonePropertiesCxx { + fn from(value: MemZoneProperties) -> Self { + let hbm_pc = match value.mem_kind { + MemKind::Ddr { .. } => { + panic!("XRT don't support DDR allocation. 
Only Hbm is available on board") + } + MemKind::Hbm { pc } => pc, + }; + Self { + hbm_pc, + size_b: value.size_b, + } + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs new file mode 100644 index 000000000..3a8e3fd4e --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/demo.rs @@ -0,0 +1,135 @@ +use super::*; +use crate::asm::OperandKind; +use crate::fw::program::Program; + +use crate::asm::dop::pbs_macro; +use crate::asm::iop::opcode::*; +use crate::new_pbs; + +crate::impl_fw!("Demo" [ + ADD => fw_impl::ilp::iop_add; + SUB => fw_impl::ilp::iop_sub; + MUL => fw_impl::ilp::iop_mul; + + ADDS => fw_impl::ilp::iop_adds; + SUBS => fw_impl::ilp::iop_subs; + SSUB => fw_impl::ilp::iop_ssub; + MULS => fw_impl::ilp::iop_muls; + + BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())}); + BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())}); + BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())}); + + CMP_LTE => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpLte::default().into())}); + CMP_EQ => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpEq::default().into())}); + CMP_NEQ => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpNeq::default().into())}); + + IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero; + IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else; + + ERC_20 => fw_impl::ilp::iop_erc_20; + + CMP_GT => cmp_gt; + CMP_GTE => cmp_gte; + CMP_LT => cmp_lt; +]); + +// Recursive {{{1 +pub fn cmp_gt(prog: &mut Program) { + // Create Input/Output template entry points to be linked at execution time. 
+ let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + let src_a = prog.iop_template_var(OperandKind::Src, 0); + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Get the index of the required PBSs + let sgn_pbs = new_pbs!(prog, "CmpSign"); + let red_pbs = new_pbs!(prog, "CmpReduce"); + let gt_pbs = new_pbs!(prog, "CmpGt"); + + dst[0] <<= std::iter::zip(src_a, src_b) + .rev() + .fold(prog.new_imm(pbs_macro::CMP_EQUAL), |acc, (a, b)| { + (&(&a - &b).pbs(&sgn_pbs, false) + &prog.new_imm(1)) + .pack_carry(&acc) + .pbs(&red_pbs, false) + }) + .pbs(>_pbs, false); +} +// }}} + +// Parallel {{{ +pub fn cmp_gte(prog: &mut Program) { + // Allocate metavariables: + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + let src_a = prog.iop_template_var(OperandKind::Src, 0); + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Allocate the needed PBSs + let sgn_pbs = new_pbs!(prog, "CmpSign"); + let red_pbs = new_pbs!(prog, "CmpReduce"); + let gte_pbs = new_pbs!(prog, "CmpGte"); + + let mut ord_vec: Vec<_> = std::iter::zip(src_a, src_b) + .map(|(a, b)| &(&a - &b).pbs(&sgn_pbs, false) + &prog.new_imm(1)) + .collect(); + + while ord_vec.len() > 1 { + ord_vec = ord_vec + .chunks(2) + .map(|c| { + let v: Vec<_> = c.into(); + match v.len() { + 2 => v[0].pack_carry(&v[1]).pbs(&red_pbs, false), + 1 => v[0].clone(), + _ => panic!("chunks misbehaved"), + } + }) + .collect(); + } + + dst[0] <<= ord_vec[0].pbs(>e_pbs, true); +} +// }}} + +// Parallel with flushes and without the extra last PBS {{{1 +pub fn cmp_lt(prog: &mut Program) { + // Allocate metavariables: + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + let src_a = prog.iop_template_var(OperandKind::Src, 0); + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Allocate the needed PBSs + let sgn_pbs = new_pbs!(prog, "CmpSign"); + let red_pbs = new_pbs!(prog, "CmpReduce"); + let lt_pbs = new_pbs!(prog, "CmpLtMrg"); + + let end = src_a.len() - 1; + + let mut ord_vec: 
Vec<_> = std::iter::zip(src_a, src_b) + .enumerate() + .map(|(i, x)| (i == end, x)) + .map(|(flush, (a, b))| &(&a - &b).pbs(&sgn_pbs, flush) + &prog.new_imm(1)) + .collect(); + + while ord_vec.len() > 2 { + let end = ord_vec.len().div_ceil(2) - 1; + ord_vec = ord_vec + .chunks(2) + .enumerate() + .map(|(i, x)| (i == end, x)) + .map(|(flush, c)| { + let v: Vec<_> = c.into(); + match v.len() { + 2 => v[0].pack_carry(&v[1]).pbs(&red_pbs, flush), + 1 => v[0].clone(), + _ => panic!("chunks misbehaved"), + } + }) + .collect(); + } + + dst[0] <<= ord_vec[0].pack_carry(&ord_vec[1]).pbs(<_pbs, true); +} +//}}} + +// vim: foldmethod=marker diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs new file mode 100644 index 000000000..b16261fad --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/ilp.rs @@ -0,0 +1,874 @@ +//! +//! Implementation of Ilp firmware +//! +//! In this version of the Fw focus is done on Instruction Level Parallelism +use std::collections::VecDeque; + +use super::*; +use crate::asm::{self, OperandKind, Pbs}; +use crate::fw::program::Program; +use crate::fw::FwParameters; +use itertools::Itertools; +use tracing::{instrument, trace, warn}; + +use crate::asm::iop::opcode::*; +use crate::new_pbs; + +crate::impl_fw!("Ilp" [ + ADD => fw_impl::ilp::iop_add; + SUB => fw_impl::ilp::iop_sub; + MUL => fw_impl::ilp::iop_mul; + + ADDS => fw_impl::ilp::iop_adds; + SUBS => fw_impl::ilp::iop_subs; + SSUB => fw_impl::ilp::iop_ssub; + MULS => fw_impl::ilp::iop_muls; + + BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())}); + BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())}); + BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())}); + + CMP_GT => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpGt::default().into())}); + CMP_GTE => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpGte::default().into())}); 
+ CMP_LT => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpLt::default().into())}); + CMP_LTE => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpLte::default().into())}); + CMP_EQ => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpEq::default().into())}); + CMP_NEQ => (|prog| {fw_impl::ilp::iop_cmp(prog, asm::dop::PbsCmpNeq::default().into())}); + + IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero; + IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else; + + ERC_20 => fw_impl::ilp::iop_erc_20; + + MEMCPY => fw_impl::ilp::iop_memcpy; +]); + +#[instrument(level = "info", skip(prog))] +pub fn iop_add(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Operand + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment("ADD Operand::Dst Operand::Src Operand::Src".to_string()); + // Deferred implementation to generic addx function + iop_addx(prog, &mut dst, &src_a, &src_b); +} + +pub fn iop_adds(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("ADDS Operand::Dst Operand::Src Operand::Immediat".to_string()); + // Deferred implementation to generic addx function + iop_addx(prog, &mut dst, &src_a, &src_b); +} + +/// Generic Add operation +/// One destination and two sources operation +/// Source could be Operand or Immediat +#[instrument(level = "info", skip(prog))] +pub fn iop_addx( + prog: &mut Program, + dst: &mut [metavar::MetaVarCell], + src_a: &[metavar::MetaVarCell], + src_b: &[metavar::MetaVarCell], +) { + let props = prog.params(); + + // Wrapped required lookup 
table in MetaVar + let pbs_msg = new_pbs!(prog, "MsgOnly"); + let pbs_carry = new_pbs!(prog, "CarryInMsg"); + + let mut carry: Option = None; + + (0..prog.params().blk_w()).for_each(|blk| { + prog.push_comment(format!(" ==> Work on output block {blk}")); + + let mut msg = &src_a[blk] + &src_b[blk]; + if let Some(cin) = &carry { + msg += cin.clone(); + } + if blk < (props.blk_w() - 1) { + carry = Some(msg.pbs(&pbs_carry, false)); + } + // Force allocation of new reg to allow carry/msg pbs to run in // + let msg = msg.pbs(&pbs_msg, true); + + // Store result + dst[blk].mv_assign(&msg); + }); +} + +#[instrument(level = "info", skip(prog))] +pub fn iop_sub(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment("SUB Operand::Dst Operand::Src Operand::Src".to_string()); + // Deferred implementation to generic subx function + iop_subx(prog, &mut dst, &src_a, &src_b); +} + +pub fn iop_subs(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("SUBS Operand::Dst Operand::Src Operand::Immediat".to_string()); + // Deferred implementation to generic subx function + iop_subx(prog, &mut dst, &src_a, &src_b); +} + +/// Generic sub operation +/// One destination and two sources operation +/// Source could be Operand or Immediat +#[instrument(level = "info", skip(prog))] +pub fn iop_subx( + prog: &mut Program, + dst: &mut [metavar::MetaVarCell], + src_a: &[metavar::MetaVarCell], + src_b: &[metavar::MetaVarCell], +) { + let 
props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped required lookup table in MetaVar + let pbs_msg = new_pbs!(prog, "MsgOnly"); + let pbs_carry = new_pbs!(prog, "CarryInMsg"); + + let mut z_cor: Option = None; + let mut carry: Option = None; + + (0..prog.params().blk_w()).for_each(|blk| { + // Compute -b + // Algo is based on neg_from + correction factor + // neg_from - b + z_cor + // Trick here is to merge imm before SSub to reduce operation number + let neg_from = if let Some(z) = &z_cor { + prog.new_imm(tfhe_params.msg_range() - *z) + } else { + prog.new_imm(tfhe_params.msg_range()) + }; + let b_neg = &neg_from - &src_b[blk]; + + // TODO check correction factor computation + // From the context it seems that it could be a constant 1 + z_cor = Some( + src_b[blk] + .get_degree() + .div_ceil(tfhe_params.msg_range()) + .max(1), + ); + + // Compute a + (-b) + let mut msg = &src_a[blk] + &b_neg; + + // Handle input/output carry and extract msg + if let Some(cin) = &carry { + msg += cin.clone(); + } + if blk < (props.blk_w() - 1) { + carry = Some(msg.pbs(&pbs_carry, false)); + } + // Force allocation of new reg to allow carry/msg pbs to run in // + let msg = msg.pbs(&pbs_msg, true); + + // Store result + dst[blk] <<= msg; + }); +} + +/// Implementation of SSUB +/// Provide its own implementation to match SUBS perfs +#[instrument(level = "trace", skip(prog))] +pub fn iop_ssub(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("SSUB Operand::Dst Operand::Src Operand::Immediat".to_string()); + + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped required lookup table in MetaVar + let 
pbs_msg = new_pbs!(prog, "MsgOnly"); + let pbs_carry = new_pbs!(prog, "CarryInMsg"); + + let mut z_cor: Option = None; + let mut carry: Option = None; + + (0..prog.params().blk_w()).for_each(|blk| { + // Compute -a + // Algo is based on neg_from + correction factor + // neg_from - a + z_cor + // Trick here is to merge imm before SSub to reduce operation number + let neg_from = if let Some(z) = &z_cor { + prog.new_imm(tfhe_params.msg_range() - *z) + } else { + prog.new_imm(tfhe_params.msg_range()) + }; + let a_neg = &neg_from - &src_a[blk]; + + // TODO check correction factor computation + // From the context it seems that it could be a constant 1 + z_cor = Some( + src_a[blk] + .get_degree() + .div_ceil(tfhe_params.msg_range()) + .max(1), + ); + + // Compute b + (-a) + let mut msg = &src_b[blk] + &a_neg; + + // Handle input/output carry and extract msg + if let Some(cin) = &carry { + msg += cin.clone(); + } + if blk < (props.blk_w() - 1) { + carry = Some(msg.pbs(&pbs_carry, false)); + } + // Force allocation of new reg to allow carry/msg pbs to run in // + let msg = msg.pbs(&pbs_msg, true); + + // Store result + dst[blk] <<= msg; + }); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_mul(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string()); + // Deferred implementation to generic mulx function + iop_mulx(prog, &mut dst, &src_a, &src_b); +} + +pub fn iop_muls(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = 
prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string()); + // Deferred implementation to generic mulx function + iop_mulx(prog, &mut dst, &src_a, &src_b); +} + +/// Generic mul operation +/// One destination and two sources operation +/// Source could be Operand or Immediat +#[instrument(level = "trace", skip(prog))] +pub fn iop_mulx( + prog: &mut Program, + dst: &mut [metavar::MetaVarCell], + src_a: &[metavar::MetaVarCell], + src_b: &[metavar::MetaVarCell], +) { + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + let blk_w = props.blk_w(); + + // Wrapped required lookup table in MetaVar + let pbs_msg = new_pbs!(prog, "MsgOnly"); + let pbs_carry = new_pbs!(prog, "CarryInMsg"); + let pbs_mul_lsb = new_pbs!(prog, "MultCarryMsgLsb"); + let pbs_mul_msb = new_pbs!(prog, "MultCarryMsgMsb"); + + // Compute list of partial product for each blk --------------------------------- + // First compute the list of required partial product. Filter out product with + // degree higher than output one + // NB: Targeted multiplication is nBits*nBits -> nBits [i.e. LSB only] + let pp_deg_idx = (0..blk_w) + .flat_map(|blk| { + itertools::iproduct!(0..blk_w, 0..blk_w) + .filter(move |(i, j)| i + j == blk) + .map(move |(i, j)| (blk, i, j)) + }) + .collect::>(); + + // Compute all partial product by chunk + // And store result in a Deque with associated weight (i.e. 
blk) + let mut pp_vars = VecDeque::new(); + + for pp in pp_deg_idx.chunks(props.pbs_batch_w) { + // Pack + let pack = pp + .iter() + .map(|(w, i, j)| { + let mac = src_a[*i].mac(tfhe_params.msg_range() as u8, &src_b[*j]); + trace!(target: "Fw", "@{w}[{i}, {j}] -> {mac:?}",); + (w, mac) + }) + .collect::>(); + + // Pbs Mul + // Reserve twice as pbs_w since 2 pbs could be generated for a given block + prog.reg_bulk_reserve(2 * props.pbs_batch_w); + pack.into_iter().for_each(|(w, pp)| { + let lsb = pp.pbs(&pbs_mul_lsb, false); + trace!(target: "Fw", "Pbs generate @{w} -> {lsb:?}"); + pp_vars.push_back((*w, lsb)); + + // Extract msb if needed + if *w < (blk_w - 1) { + // Force allocation of new reg to allow lsb/msb pbs to run in // + let msb = pp.pbs(&pbs_mul_msb, false); + trace!(target: "Fw", "Pbs generate @{} -> {msb:?}", w + 1); + pp_vars.push_back((*w + 1, msb)); + } + }); + } + + // Merged partial product together --------------------------------------------- + let mut acc_wh = vec![Vec::with_capacity(props.nu); blk_w]; + let mut pdg_acc = Vec::new(); + let mut pdg_pbs = Vec::new(); + + // Use to writeback in order and prevent digits drop during propagation + let mut wb_idx = 0; + + pp_vars + .make_contiguous() + .sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap()); + + while let Some((w, var)) = pp_vars.pop_front() { + acc_wh[w].push(var); + + // Trace internal state + trace!(target: "Fw", "{:#<80}",""); + trace!(target: "Fw", "pp_vars[{}] -> {pp_vars:?}", pp_vars.len(),); + trace!(target: "Fw", "pdg_acc[{}] -> {pdg_acc:?}", pdg_acc.len(),); + trace!(target: "Fw", "pdg_pbs[{}] -> {pdg_pbs:?}", pdg_pbs.len(),); + + // For each acc_wh slot check flushing condition + trace!(target: "Fw", "Acc_wh: Check flushing condition {:#<20}",""); + for (w, acc) in acc_wh.iter_mut().enumerate() { + if w < wb_idx { + // Skip position w if already committed + assert_eq!(0, acc.len(), "Error committed incomplete digit"); + continue; + } + // Check if other deg_w var are in the pp_vars 
store or in pbs_pipe + let winf_in_pipe = pp_vars.iter().filter(|(d, _)| *d <= w).count() + + pdg_pbs.iter().filter(|(d, _)| *d <= w).count() + + pdg_acc.iter().filter(|(d, _)| *d <= w).count(); + + trace!( + target: "Fw", + "acc {w}: [len:{}; winf:{winf_in_pipe}] -> {:?}", + acc.len(), + acc + ); + + // Trigger Add if acc warehouse is full of if no more deg_w (or previous) is in pipe + if (acc.len() == props.nu) || ((winf_in_pipe == 0) && (!acc.is_empty())) { + trace!(target: "Fw", "Flush acc_wh[{w}]",); + let mut acc_chunks = std::mem::take(acc); + match acc_chunks.len() { + 1 => { + // Try to commit directly + // Skipped acc reduction tree + if wb_idx == w { + // Finish computation for digit @w + acc_chunks[0].reg_alloc_mv(); + trace!(target:"Fw", "Commit {w} <- {:?}", acc_chunks[0]); + dst[w] <<= acc_chunks.swap_remove(0); + wb_idx += 1; + } else { + // not my turn, enqueue back + trace!(target:"Fw", "{w}::{wb_idx}: insert backed in pp_vars {:?}", acc_chunks[0]); + pp_vars.push_back((w, acc_chunks.swap_remove(0))); + } + } + _ => { + // Go through the acc reduction tree + pdg_acc.push((w, acc_chunks)); + } + } + } + } + + trace!( + target: "Fw", + "pdg_acc[{}], pp_vars[{}]: flush pdg_acc", + pdg_acc.len(), + pp_vars.len() + ); + while let Some((w, acc_chunks)) = pdg_acc.pop() { + trace!(target: "Fw", "Reduce @{w}[{}] <- {acc_chunks:?}",acc_chunks.len()); + // Hand-written tree reduction for up to 5 + match acc_chunks.len() { + 1 => { + unreachable!("This case must not go through acc reduction tree. 
In should have take the fast pass in acc_wh flushing."); + } + + 2 => { + let sum = &acc_chunks[0] + &acc_chunks[1]; + pdg_pbs.push((w, sum)); + } + + 3 => { + let sum_a = &acc_chunks[0] + &acc_chunks[1]; + let sum_b = &sum_a + &acc_chunks[2]; + pdg_pbs.push((w, sum_b)); + } + + 4 => { + let sum_a = &acc_chunks[0] + &acc_chunks[1]; + let mut sum_b = &acc_chunks[2] + &acc_chunks[3]; + sum_b += sum_a; + pdg_pbs.push((w, sum_b)); + } + 5 => { + let sum_a = &acc_chunks[0] + &acc_chunks[1]; + let sum_b = &acc_chunks[2] + &acc_chunks[3]; + let mut sum_c = &sum_b + &acc_chunks[4]; + sum_c += sum_a; + pdg_pbs.push((w, sum_c)); + } + _ => panic!("Currently only support nu <= 5"), + } + } + + if pdg_pbs.len() == props.pbs_batch_w || (pp_vars.is_empty()) { + trace!(target: "Fw", "pdg_pbs[{}] <- {pdg_pbs:?}", pdg_pbs.len()); + prog.reg_bulk_reserve(pdg_pbs.len()); + while let Some((w, var)) = pdg_pbs.pop() { + let lsb = var.pbs(&pbs_msg, false); + trace!(target: "Fw", "Pbs generate @{w} -> {lsb:?}"); + // TODO These explicit flush enhance perf for large MUL but degrade them for small + // one Find a proper way to arbitrait their used + // Furthermore, it induce error with current ISC without LD/ST ordering + // lsb.heap_alloc_mv(true); + pp_vars.push_back((w, lsb)); + + // Extract msb if needed + if w < (blk_w - 1) { + // Force allocation of new reg to allow carry/msg pbs to run in // + let msb = var.pbs(&pbs_carry, false); + trace!(target: "Fw", "Pbs generate @{} -> {msb:?}", w + 1); + // TODO These explicit flush enhance perf for large MUL but degrade them for + // small one Find a proper way to arbitrait their used + // Furthermore, it induce error with current ISC without LD/ST ordering + // msb.heap_alloc_mv(true); + pp_vars.push_back((w + 1, msb)); + } + } + // Compute LSB ASAP + pp_vars + .make_contiguous() + .sort_by(|x, y| x.0.partial_cmp(&y.0).unwrap()); + } + } +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_bw(prog: &mut Program, bw_op: Pbs) { + // 
Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Operand + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment(format!("BW_{bw_op} Operand::Dst Operand::Src Operand::Src")); + + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped given bw_op lookup table in MetaVar + let bw_op = prog.var_from(Some(metavar::VarPos::Pbs(bw_op))); + + itertools::izip!(dst, src_a, src_b) + .enumerate() + .chunks(props.pbs_batch_w) + .into_iter() + .for_each(|chunk| { + let chunk_pack = chunk + .into_iter() + .map(|(pos, (d, a, b))| (pos, d, a.mac(tfhe_params.msg_range() as u8, &b))) + .collect::>(); + chunk_pack.into_iter().for_each(|(pos, mut d, mut pack)| { + pack.pbs_assign(&bw_op, pos == props.blk_w() - 1); + d <<= pack; + }); + }); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_cmp(prog: &mut Program, cmp_op: Pbs) { + // Dest -> Operand + let mut dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Operand + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment(format!( + "CMP_{cmp_op} Operand::Dst Operand::Src Operand::Src" + )); + + // Deferred implementation to generic cmpx function + iop_cmpx(prog, &mut dst[0], &src_a, &src_b, cmp_op); +} + +/// Generic Cmp operation +/// One destination block and two sources operands +/// Source could be Operand or Immediat +#[instrument(level = "trace", skip(prog))] +pub fn iop_cmpx( + prog: &mut Program, + dst: &mut metavar::MetaVarCell, + src_a: &[metavar::MetaVarCell], + src_b: &[metavar::MetaVarCell], + cmp_op: Pbs, +) { + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped given cmp_op and comp_sign lookup table in 
MetaVar + let cmp_op = prog.var_from(Some(metavar::VarPos::Pbs(cmp_op))); + let pbs_none = new_pbs!(prog, "None"); + let cmp_sign = new_pbs!(prog, "CmpSign"); + let cmp_reduce = new_pbs!(prog, "CmpReduce"); + + // Pack A and B elements by pairs + let packed = std::iter::zip(src_a.chunks(2), src_b.chunks(2)) + .enumerate() + .map(|(pos, (a, b))| { + let pack_a = if a.len() > 1 { + // Reset noise for future block merge through sub + a[1].mac(tfhe_params.msg_range() as u8, &a[0]) + .pbs(&pbs_none, false) + } else { + a[0].clone() + }; + + let pack_b = if b.len() > 1 { + b[1].mac(tfhe_params.msg_range() as u8, &b[0]) + .pbs(&pbs_none, pos == (props.blk_w() / 2) - 1) + } else { + b[0].clone() + }; + (pack_a, pack_b) + }) + .collect::>(); + + let cst_1 = prog.new_imm(1); + let merged = packed + .into_iter() + .enumerate() + .chunks(props.pbs_batch_w) + .into_iter() + .flat_map(|chunk| { + let chunk = chunk + .map(|(pos, (mut a, b))| { + a -= b; + (pos, a) + }) + .collect::>(); + let chunk = chunk + .into_iter() + .map(|(pos, mut a)| { + a.pbs_assign(&cmp_sign, pos == props.blk_w().div_ceil(2) - 1); + a + }) + .collect::>(); + chunk.into_iter().map(|mut a| { + a += cst_1.clone(); + a + }) + }) + .collect::>(); + + let reduce = merged.into_iter().reduce(|acc, x| { + x.mac(tfhe_params.msg_range() as u8, &acc) + .pbs(&cmp_reduce, true) + }); + + // interpret reduce with expected cmp + let cmp = reduce.unwrap().pbs(&cmp_op, true); + dst.mv_assign(&cmp); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_if_then_zero(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src = prog.iop_template_var(OperandKind::Src, 0); + // Cond -> Operand + // second operand must be a FheBool and have only one blk + let cond = { + let mut cond_blk = prog.iop_template_var(OperandKind::Src, 1); + cond_blk.truncate(1); + cond_blk.pop().unwrap() + }; + + // Add Comment header + 
prog.push_comment("IF_THEN_ZERO Operand::Dst Operand::Src Operand::Src[Condition]".to_string()); + + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped required lookup table in MetaVar + let pbs_if_false_zeroed = new_pbs!(prog, "IfFalseZeroed"); + + itertools::izip!(dst, src) + .enumerate() + .chunks(props.pbs_batch_w) + .into_iter() + .for_each(|chunk| { + // Pack (cond, src) + let chunk_pack = chunk + .into_iter() + .map(|(pos, (d, src))| (pos, d, cond.mac(tfhe_params.msg_range() as u8, &src))) + .collect::>(); + + chunk_pack + .into_iter() + .for_each(|(pos, mut d, mut cond_src)| { + cond_src.pbs_assign(&pbs_if_false_zeroed, pos == props.blk_w() - 1); + d <<= cond_src; + }); + }); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_if_then_else(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Operand + let src_b = prog.iop_template_var(OperandKind::Src, 1); + // Cond -> Operand + // Third operand must be a FheBool and have only one blk + let cond = { + let mut cond_blk = prog.iop_template_var(OperandKind::Src, 2); + cond_blk.truncate(1); + cond_blk.pop().unwrap() + }; + + // Add Comment header + prog.push_comment( + "IF_THEN_ELSE Operand::Dst Operand::Src Operand::Src Operand::Src[Condition]".to_string(), + ); + + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped required lookup table in MetaVar + let pbs_if_true_zeroed = new_pbs!(prog, "IfTrueZeroed"); + let pbs_if_false_zeroed = new_pbs!(prog, "IfFalseZeroed"); + + itertools::izip!(dst, src_a, src_b) + .enumerate() + .chunks(props.pbs_batch_w) + .into_iter() + .for_each(|chunk| { + // Pack (cond, a), (cond, b) + let chunk_pack = chunk + .into_iter() + .map(|(pos, (d, a, b))| { + ( + pos, + d, + 
cond.mac(tfhe_params.msg_range() as u8, &a), + cond.mac(tfhe_params.msg_range() as u8, &b), + ) + }) + .collect::>(); + chunk_pack + .into_iter() + .for_each(|(pos, mut d, mut cond_a, mut cond_b)| { + cond_a.pbs_assign(&pbs_if_false_zeroed, false); + cond_b.pbs_assign(&pbs_if_true_zeroed, pos == props.blk_w() - 1); + d <<= &cond_a + &cond_b; + }); + }); +} + +/// Implement erc_20 fund xfer +/// Targeted algorithm is as follow: +/// 1. Check that from has enough funds +/// 2. Compute real_amount to xfer (i.e. amount or 0) +/// 3. Compute new amount (from - new_amount, to + new_amount) +#[instrument(level = "info", skip(prog))] +pub fn iop_erc_20(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let mut dst_from = prog.iop_template_var(OperandKind::Dst, 0); + let mut dst_to = prog.iop_template_var(OperandKind::Dst, 1); + // Src -> Operand + let src_from = prog.iop_template_var(OperandKind::Src, 0); + let src_to = prog.iop_template_var(OperandKind::Src, 1); + // Src Amount -> Operand + let src_amount = prog.iop_template_var(OperandKind::Src, 2); + + // Add Comment header + prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string()); + + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped required lookup table in MetaVar + let pbs_msg = new_pbs!(prog, "MsgOnly"); + let pbs_carry = new_pbs!(prog, "CarryInMsg"); + let pbs_if_false_zeroed = new_pbs!(prog, "IfFalseZeroed"); + + // Check if from has enough funds + let enough_fund = { + let mut dst = prog.new_var(); + iop_cmpx( + prog, + &mut dst, + &src_from, + &src_amount, + asm::dop::PbsCmpGte::default().into(), + ); + dst + }; + + // Fuse real_amount computation and new_from, new_to + // First compute a batch of real_amount in advance + let mut real_amount_work = (0..props.blk_w()).peekable(); + let mut upfront_work = real_amount_work.by_ref().take(props.pbs_batch_w).peekable(); + prog.push_comment(" ==> Compute some 
real_amount in advance".to_string()); + let mut real_amount = VecDeque::new(); + while let Some(blk) = upfront_work.next() { + let mut val_cond = enough_fund.mac(tfhe_params.msg_range() as u8, &src_amount[blk]); + val_cond.pbs_assign(&pbs_if_false_zeroed, upfront_work.peek().is_none()); + real_amount.push_back(val_cond); + } + + let mut add_carry: Option = None; + + let mut sub_z_cor: Option = None; + let mut sub_carry: Option = None; + + (0..prog.params().blk_w()).for_each(|blk| { + prog.push_comment(format!(" ==> Work on output block {blk}")); + + // Compte next real_amount if any + if let Some(work) = real_amount_work.next() { + let mut val_cond = enough_fund.mac(tfhe_params.msg_range() as u8, &src_amount[work]); + val_cond.pbs_assign(&pbs_if_false_zeroed, false); + real_amount.push_back(val_cond); + } + let amount_blk = real_amount.pop_front().unwrap(); + + // Add + let mut add_msg = &src_to[blk] + &amount_blk; + if let Some(cin) = &add_carry { + add_msg += cin.clone(); + } + if blk < (props.blk_w() - 1) { + add_carry = Some(add_msg.pbs(&pbs_carry, false)); + } + // Force allocation of new reg to allow carry/msg pbs to run in // + let add_msg = add_msg.pbs(&pbs_msg, false); + + // Sub + // Compute -b + let neg_from = if let Some(z) = &sub_z_cor { + prog.new_imm(tfhe_params.msg_range() - *z) + } else { + prog.new_imm(tfhe_params.msg_range()) + }; + let amount_neg = &neg_from - &amount_blk; + + sub_z_cor = Some( + amount_blk + .get_degree() + .div_ceil(tfhe_params.msg_range()) + .max(1), + ); + + // Compute a + (-b) + let mut sub_msg = &src_from[blk] + &amount_neg; + + // Handle input/output carry and extract msg + if let Some(cin) = &sub_carry { + sub_msg += cin.clone(); + } + if blk < (props.blk_w() - 1) { + sub_carry = Some(sub_msg.pbs(&pbs_carry, false)); + } + // Force allocation of new reg to allow carry/msg pbs to run in // + let sub_msg = sub_msg.pbs(&pbs_msg, true); + + // Store result + dst_to[blk] <<= add_msg; + dst_from[blk] <<= sub_msg; + }); +} + 
+/// Implement memcpy operation +/// Utilities IOp used to duplicate a ciphertext when already uploaded on HPU +/// Use to enforce clone semantic at the HL-Api level +#[instrument(level = "info", skip(prog))] +pub fn iop_memcpy(prog: &mut Program) { + // Allocate metavariables: + let dst = prog.iop_template_var(OperandKind::Dst, 0); + let src = prog.iop_template_var(OperandKind::Src, 0); + + // NB: Move from memory -> memory isn't supported by HPU + // Thus we have to go through register file and LD->RegFile->ST + // Memcpy is a small IOp and could triggered issue with `min_iop_size` + // If required padded the iop with linear operaation + let iop_len = src.len().min(dst.len()) * 2; + for _ in 0..(prog.params().min_iop_size as isize - iop_len as isize) { + let _ = prog.new_cst(0); + } + + for (mut d, s) in itertools::izip!(dst, src) { + s.reg_alloc_mv(); + d <<= s; + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/kogge.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/kogge.rs new file mode 100644 index 000000000..954610038 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/kogge.rs @@ -0,0 +1,497 @@ +use super::*; + +// For the kogge stone add/sub +use crate::fw::metavar::PosKind; +use crate::fw::rtl::{Rtl, VarCell}; +use lazy_static::lazy_static; +use std::cmp::{Eq, PartialEq}; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::env; +use std::error::Error; +use std::io::Write; +use std::ops::Deref; +use std::sync::{Arc, RwLock}; + +use tracing::{trace, warn}; + +// For the kogge block table +use serde::{Deserialize, Serialize}; +use toml; + +#[derive(Clone, Serialize, Deserialize, Hash, PartialEq, Eq, Debug)] +struct KoggeBlockTableIndex(String); + +impl From for KoggeBlockTableIndex { + fn from(value: FwParameters) -> Self { + KoggeBlockTableIndex(format!( + "blk_{}_pbs_{}", + value.blk_w(), + value.min_pbs_batch_w + )) + } +} + +#[derive(Clone, Copy, Serialize, Deserialize, Debug)] +enum AddCfg { + 
Kogge(usize), + Ripple, +} + +#[derive(Clone, Serialize, Deserialize, Debug)] +struct KoggeBlockCfg { + #[serde(skip)] + filename: String, + table: HashMap, +} + +fn append_bin(name: &str) -> String { + let exe = env::current_exe().unwrap(); + let exe_dir = exe.parent().and_then(|p| p.to_str()).unwrap_or("."); + format!("{exe_dir}/{name}") +} + +impl KoggeBlockCfg { + fn try_with_filename(name: &str, f: F) -> Result + where + F: Fn(&str) -> Result, + { + f(name).or_else(|_| f(&append_bin(name))) + } + + pub fn new(filename: &str) -> KoggeBlockCfg { + if let Ok(contents) = + KoggeBlockCfg::try_with_filename(filename, |f| std::fs::read_to_string(f)) + { + let mut res: KoggeBlockCfg = toml::from_str(&contents) + .unwrap_or_else(|e| panic!("{filename} is not a valid KoggeBlockCfg: {e}")); + res.filename = String::from(filename); + res + } else { + KoggeBlockCfg { + filename: String::from(filename), + table: HashMap::new(), + } + } + } + + pub fn entry( + &mut self, + index: KoggeBlockTableIndex, + ) -> Entry<'_, KoggeBlockTableIndex, AddCfg> { + self.table.entry(index) + } + + pub fn get(&mut self, index: &KoggeBlockTableIndex) -> Option<&AddCfg> { + self.table.get(index) + } + + fn try_write(&self) -> Result<(), Box> { + trace!(target: "rtl", "Saving {}", self.filename); + // Convert in toml string + let toml = toml::to_string(&self)?; + + // Open file and write to it + let mut file = KoggeBlockCfg::try_with_filename(&self.filename, |name| { + std::fs::File::options() + .write(true) + .truncate(true) + .create(true) + .open(name) + })?; + write!(&mut file, "{toml}")?; + Ok(()) + } +} + +#[derive(Clone)] +struct KoggeBlockCfgPtr(Arc>); + +impl KoggeBlockCfgPtr { + fn new(filename: &str) -> Self { + KoggeBlockCfgPtr(Arc::new(RwLock::new(KoggeBlockCfg::new(filename)))) + } +} + +impl Deref for KoggeBlockCfgPtr { + type Target = RwLock; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl From<&str> for KoggeBlockCfgPtr { + fn from(cfg_f: &str) -> Self { + let 
mut hash = KOGGE_BLOCK_CFG.write().unwrap(); + (hash + .entry(cfg_f.to_string()) + .or_insert_with_key(|key| KoggeBlockCfgPtr::new(key))) + .clone() + } +} + +lazy_static! { + static ref KOGGE_BLOCK_CFG: Arc>> = + Arc::new(RwLock::new(HashMap::new())); +} + +#[derive(Hash, PartialEq, Eq, Clone)] +struct Range(usize, usize); + +#[derive(Clone, Debug)] +pub struct PGCarry { + var: VarCell, + cpos: usize, + fresh: VarCell, +} + +#[derive(Clone, Debug)] +pub struct RippleCarry(pub VarCell); + +#[derive(Clone, Debug)] +pub enum Carry { + PG(PGCarry), + Ripple(RippleCarry), +} + +impl Carry { + pub fn clone_on(&self, prog: &Program) -> Carry { + match self { + Carry::PG(x) => Carry::PG(x.clone_on(prog)), + Carry::Ripple(RippleCarry(x)) => Carry::Ripple(RippleCarry(x.clone_on(prog))), + } + } +} + +impl PGCarry { + pub fn fresh(var: VarCell) -> PGCarry { + PGCarry { + var: var.clone(), + cpos: 1, + fresh: var, + } + } + + pub fn clone_on(&self, prog: &Program) -> PGCarry { + PGCarry { + var: self.var.clone_on(prog), + cpos: self.cpos, + fresh: self.fresh.clone_on(prog), + } + } +} + +impl From for PGCarry { + fn from(value: Carry) -> Self { + match value { + Carry::Ripple(x) => { + if let Some(true) = x.0.copy_meta().map(|x| x.is_in(PosKind::IMM)) { + PGCarry::fresh(&x.0 * 2usize) + } else { + let pbs = pbs_by_name!("Ripple2GenProp"); + PGCarry::fresh(x.0.single_pbs(&pbs)) + } + } + Carry::PG(x) => x, + } + } +} + +impl From for RippleCarry { + fn from(value: Carry) -> Self { + match value { + Carry::Ripple(x) => x, + Carry::PG(_) => panic!("Unsupported"), + } + } +} + +enum ReduceType { + Simple(Pbs), + Inc(Pbs), +} + +impl ReduceType { + fn apply(&self, var: &VarCell) -> VarCell { + match self { + ReduceType::Simple(pbs) => var.single_pbs(pbs), + ReduceType::Inc(pbs) => &var.single_pbs(pbs) + 1, + } + } +} + +struct KoggeTree { + cache: HashMap, + tfhe_params: asm::DigitParameters, + reduce_map: HashMap, +} + +impl KoggeTree { + fn new(prg: &mut Program, inputs: Vec) -> 
KoggeTree { + let mut cache = HashMap::new(); + inputs.into_iter().enumerate().for_each(|(i, v)| { + cache.insert(Range(i, i), v); + }); + let props = prg.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + let mut reduce_map = HashMap::new(); + reduce_map.insert( + 2, + ReduceType::Simple(asm::Pbs::ReduceCarry2(asm::dop::PbsReduceCarry2::default())), + ); + reduce_map.insert( + 3, + ReduceType::Simple(asm::Pbs::ReduceCarry3(asm::dop::PbsReduceCarry3::default())), + ); + reduce_map.insert( + tfhe_params.total_width(), + ReduceType::Inc(asm::Pbs::ReduceCarryPad( + asm::dop::PbsReduceCarryPad::default(), + )), + ); + KoggeTree { + cache, + tfhe_params, + reduce_map, + } + } + + fn get_subindex(&self, index: &Range) -> (Range, Range) { + let range = index.1 - index.0 + 1; + // Find the biggest power of two smaller than range + let pow = 1 << range.ilog2(); + let mid = if pow == range { + index.0 + (pow >> 1) + } else { + index.0 + pow + }; + (Range(index.0, mid - 1), Range(mid, index.1)) + } + + fn insert_subtree(&mut self, index: &Range) { + if !self.cache.contains_key(index) { + let (lsb, msb) = self.get_subindex(index); + self.insert_subtree(&lsb); + self.insert_subtree(&msb); + + let (lsb, msb) = (self.cache.get(&lsb).unwrap(), self.cache.get(&msb).unwrap()); + let merge = { + let cpos_trial = lsb.cpos + msb.cpos; + let (lsb, msb, cpos, msb_shift) = if cpos_trial > self.tfhe_params.total_width() { + if msb.cpos + 1 > self.tfhe_params.total_width() { + (&lsb.fresh, &msb.fresh, 2, 2) + } else { + (&lsb.fresh, &msb.var, msb.cpos + 1, 2) + } + } else { + (&lsb.var, &msb.var, cpos_trial, 1 << lsb.cpos) + }; + + let var = lsb.mac(msb_shift, msb); + let fresh = self.reduce_map[&cpos].apply(&var); + PGCarry { var, cpos, fresh } + }; + + self.cache.insert((*index).clone(), merge); + } + } + + fn get_subtree(&mut self, index: &Range) -> &PGCarry { + self.insert_subtree(index); + self.cache.get(index).unwrap() + } +} + +// Receives cypher texts with 
carry (in carry save form) and outputs cypher +// texts with carry propagated. The first item in the input vector is the carry +// in. +// Calling this only makes sense if the generated PBSs fit nicely into the batch +// size. +#[instrument(level = "trace", skip(prog))] +pub fn propagate_carry( + prog: &mut Program, + dst: &mut [VarCell], + carrysave: &[VarCell], + cin: &Option, +) -> PGCarry { + let tfhe_params: asm::DigitParameters = prog.params().clone().into(); + + let pbs_genprop = pbs_by_name!("ManyGenProp"); + let pbs_genprop_add = pbs_by_name!("GenPropAdd"); + + // Make sure the TFHE parameters are enough to run this + assert!( + tfhe_params.total_width() >= 3, + "Cannot run Kogge stone with a total message width less than 3" + ); + + // Split the result into message and propagate/generate information using a + // manyLUT + let (msg, mut carry): (Vec<_>, Vec<_>) = carrysave + .iter() + .map(|v| { + let mut res = v.pbs(&pbs_genprop).into_iter(); + (res.next().unwrap(), PGCarry::fresh(res.next().unwrap())) + }) + .unzip(); + + // Add the carry in as the first carry if any + carry.insert( + 0, + cin.clone() + .unwrap_or_else(|| PGCarry::fresh(VarCell::from(prog.new_imm(0)))), + ); + + // Build a list of terminal outputs + let mut carry_tree = KoggeTree::new(prog, carry); + + for i in 0..msg.len() { + let subtree = carry_tree.get_subtree(&Range(0, i)); + let mac = msg[i].mac(tfhe_params.msg_range(), &subtree.fresh); + let pbs = mac.single_pbs(&pbs_genprop_add); + dst[i] <<= &pbs; + } + + carry_tree.get_subtree(&Range(0, msg.len())).clone() +} + +// Adds two vectors of VarCells and produces a register transfer level +// description of a kogge stone adder that can then be added to the program +pub fn add( + prog: &mut Program, + mut dst: Vec, + a: Vec, + b: Vec, + cin: Option, + par_w: usize, +) -> Rtl { + // Convert Carry go PGCarry + let mut cin: Option = cin.map(|x| x.into()); + + // Carry save add + let csave: Vec<_> = a + .into_iter() + .zip_longest(b) + 
.map(|r| match r { + EitherOrBoth::Left(x) | EitherOrBoth::Right(x) => x, + EitherOrBoth::Both(a, b) => &a + &b, + }) + .collect(); + + (0..csave.len().div_ceil(par_w)).for_each(|chunk_idx| { + let start = chunk_idx * par_w; + let end = (start + par_w).min(csave.len()); + cin = Some(kogge::propagate_carry( + prog, + &mut dst[start..], + &csave[start..end], + &cin, + )); + }); + + Rtl::from(dst) +} + +pub fn sub( + prog: &mut Program, + dst: Vec, + a: Vec, + b: Vec, + par_w: usize, +) -> Rtl { + let b_inv = bw_inv(prog, b); + let one = Carry::Ripple(RippleCarry(VarCell::from(prog.new_imm(1)))); + kogge::add(prog, dst, a, b_inv, Some(one), par_w) +} + +pub fn ripple_sub(prog: &mut Program, dst: Vec, a: Vec, b: Vec) -> Rtl { + let b_inv = bw_inv(prog, b); + let one = Carry::Ripple(RippleCarry(VarCell::from(prog.new_imm(1)))); + kogge::ripple_add(dst, a, b_inv, Some(one)) +} + +pub fn ripple_add( + mut dst: Vec, + src_a: Vec, + src_b: Vec, + carry: Option, +) -> Rtl { + let pbs = pbs_by_name!("ManyCarryMsg"); + + let mut carry: Option = carry.map(|x| RippleCarry::from(x).0); + + dst.iter_mut() + .zip(src_a.into_iter().zip_longest(src_b).map(|r| match r { + EitherOrBoth::Left(x) | EitherOrBoth::Right(x) => x.clone(), + EitherOrBoth::Both(a, b) => &a + &b, + })) + .for_each(|(dst, mut msg)| { + // Conditional carry + if let Some(carry) = &carry { + msg = &msg + carry; + } + + // Extract carry and message + let mut pbs_iter = msg.pbs(&pbs).into_iter(); + *dst <<= &pbs_iter.next().unwrap(); + carry = Some(pbs_iter.next().unwrap()); + }); + + Rtl::from(dst) +} + +// cached kogge_adder wrapper +// This finds the best par_w for the given architecture and caches the result +pub fn cached_add( + prog: &mut Program, + a: Vec, + b: Vec, + cin: Option, + dst: Vec, +) -> Rtl { + let kogge_cfg_ptr = KoggeBlockCfgPtr::from(prog.params().kogge_cfg.as_str()); + let mut kogge_cfg = kogge_cfg_ptr.write().unwrap(); + let index: KoggeBlockTableIndex = prog.params().into(); + let dst: 
Vec<_> = dst.iter().map(|v| VarCell::from(v.clone())).collect(); + let clone_on = |prog: &Program, v: &Vec| v.iter().map(|v| v.clone_on(prog)).collect(); + let mut dirty = false; + + trace!(target: "rtl", "kogge config: {:?}", kogge_cfg); + + kogge_cfg + .get(&index) + .copied() + .or_else(|| { + dirty = true; + (1..=prog.params().blk_w()) + .map(AddCfg::Kogge) + .chain([AddCfg::Ripple]) + .map(|cfg| { + // Build a new tree for every par_w trial, which means that we + // need to get fresh variables for each trial. + let mut tmp_prog = Program::new(&prog.params()); + let a: Vec<_> = clone_on(&tmp_prog, &a); + let b: Vec<_> = clone_on(&tmp_prog, &b); + let dst: Vec<_> = clone_on(&tmp_prog, &dst); + let cin = cin.clone().map(|c| c.clone_on(&tmp_prog)); + let tree = match cfg { + AddCfg::Kogge(w) => add(&mut tmp_prog, dst, a, b, cin, w), + AddCfg::Ripple => ripple_add(dst, a, b, cin), + }; + (cfg, tree.estimate(&tmp_prog)) + }) + .min_by_key(|(_, cycle_estimate)| *cycle_estimate) + .map(|(cfg, _)| cfg) + }) + .map(|cfg| { + kogge_cfg.entry(index).or_insert(cfg); + if dirty && kogge_cfg.try_write().is_err() { + warn!("Could not write kogge config"); + } + match cfg { + AddCfg::Kogge(w) => add(prog, dst, a, b, cin, w), + AddCfg::Ripple => ripple_add(dst, a, b, cin), + } + }) + .unwrap() +} diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs new file mode 100644 index 000000000..7177258cb --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/mod.rs @@ -0,0 +1,500 @@ +use super::*; + +pub mod kogge; +pub mod vardeg; + +use super::rtl::{Rtl, VarCell}; +use kogge::{Carry, RippleCarry}; +use vardeg::*; + +use crate::asm::iop::opcode::*; +use crate::asm::{self, OperandKind, Pbs}; +use crate::fw::metavar::MetaVarCell; +use crate::fw::program::Program; +use crate::pbs_by_name; +use itertools::{EitherOrBoth, Itertools}; +use std::collections::HashMap; +use tracing::{instrument, trace}; + 
// Dispatch table handed to `impl_fw!` under the name "Llt": every IOp name on
// the left is paired with the function (or closure) listed on its right.
// Arithmetic, comparison and ERC_20 entries point at this module
// (`fw_impl::llt`); bitwise, if-then and memcpy entries reuse the
// `fw_impl::ilp` implementations.
// NOTE(review): the expansion of `impl_fw!` is not visible from here --
// presumably it generates the firmware lookup for this implementation; confirm
// against the macro definition.
crate::impl_fw!("Llt" [
    ADD => fw_impl::llt::iop_add;
    SUB => fw_impl::llt::iop_sub;
    MUL => fw_impl::llt::iop_mul;

    ADDS => fw_impl::llt::iop_adds;
    SUBS => fw_impl::llt::iop_subs;
    SSUB => fw_impl::llt::iop_ssub;
    MULS => fw_impl::llt::iop_muls;

    BW_AND => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwAnd::default().into())});
    BW_OR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwOr::default().into())});
    BW_XOR => (|prog| {fw_impl::ilp::iop_bw(prog, asm::dop::PbsBwXor::default().into())});

    CMP_GT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGtMrg"), pbs_by_name!("CmpGt"))});
    CMP_GTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpGteMrg"), pbs_by_name!("CmpGte"))});
    CMP_LT => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLtMrg"), pbs_by_name!("CmpLt"))});
    CMP_LTE => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpLteMrg"), pbs_by_name!("CmpLte"))});
    CMP_EQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpEqMrg"), pbs_by_name!("CmpEq"))});
    CMP_NEQ => (|prog| {fw_impl::llt::iop_cmp(prog, pbs_by_name!("CmpNeqMrg"), pbs_by_name!("CmpNeq"))});

    IF_THEN_ZERO => fw_impl::ilp::iop_if_then_zero;
    IF_THEN_ELSE => fw_impl::ilp::iop_if_then_else;

    ERC_20 => fw_impl::llt::iop_erc_20;
    MEMCPY => fw_impl::ilp::iop_memcpy;
]);

// ----------------------------------------------------------------------------
// API
// ----------------------------------------------------------------------------

/// Entry point of the ADD IOp (two ciphertext sources).
///
/// Fetches destination 0 and sources 0 and 1 from the IOp template, records a
/// comment header in the program, then hands the three operands to `iop_addx`,
/// which holds the actual addition logic.
#[instrument(level = "trace", skip(prog))]
pub fn iop_add(prog: &mut Program) {
    // Allocate metavariables:
    // Dest -> Operand
    let dst = prog.iop_template_var(OperandKind::Dst, 0);
    // SrcA -> Operand
    let src_a = prog.iop_template_var(OperandKind::Src, 0);
    // SrcB -> Operand (second ciphertext source, not an immediate)
    let src_b = prog.iop_template_var(OperandKind::Src, 1);

    // Add Comment header
    prog.push_comment("ADD Operand::Dst Operand::Src Operand::Src".to_string());
    iop_addx(prog, dst, src_a, src_b);
}
+pub fn iop_adds(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("ADDS Operand::Dst Operand::Src Operand::Immediat".to_string()); + iop_addx(prog, dst, src_a, src_b); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_sub(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment("SUB Operand::Dst Operand::Src Operand::Src".to_string()); + iop_subx(prog, dst, src_a, src_b); +} + +pub fn iop_subs(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("SUBS Operand::Dst Operand::Src Operand::Immediat".to_string()); + iop_subx(prog, dst, src_a, src_b); +} + +pub fn iop_ssub(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Imm, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Src, 0); + + // Add Comment header + prog.push_comment("SSUB Operand::Dst Operand::Src Operand::Immediat".to_string()); + iop_subx(prog, dst, src_a, src_b); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_cmp(prog: &mut Program, mrg_op: Pbs, cmp_op: Pbs) { + // Dest -> Operand + let 
dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Operand + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment(format!( + "CMP_{cmp_op} Operand::Dst Operand::Src Operand::Src" + )); + + // Deferred implementation to generic cmpx function + iop_cmpx(prog, &dst[0], &src_a, &src_b, mrg_op, cmp_op).add_to_prog(prog); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_mul(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Src, 1); + + // Add Comment header + prog.push_comment("MUL Operand::Dst Operand::Src Operand::Src".to_string()); + // Deferred implementation to generic mulx function + iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog); +} + +#[instrument(level = "trace", skip(prog))] +pub fn iop_muls(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst = prog.iop_template_var(OperandKind::Dst, 0); + // SrcA -> Operand + let src_a = prog.iop_template_var(OperandKind::Src, 0); + // SrcB -> Immediat + let src_b = prog.iop_template_var(OperandKind::Imm, 0); + + // Add Comment header + prog.push_comment("MULS Operand::Dst Operand::Src Operand::Immediat".to_string()); + // Deferred implementation to generic mulx function + iop_mulx(prog, dst, src_a, src_b).add_to_prog(prog); +} + +/// Implement erc_20 fund xfer +/// Targeted algorithm is as follow: +/// 1. Check that from has enough funds +/// 2. Compute real_amount to xfer (i.e. amount or 0) +/// 3. 
Compute new amount (from - new_amount, to + new_amount) +#[instrument(level = "trace", skip(prog))] +pub fn iop_erc_20(prog: &mut Program) { + // Allocate metavariables: + // Dest -> Operand + let dst_from = prog.iop_template_var(OperandKind::Dst, 0); + let dst_to = prog.iop_template_var(OperandKind::Dst, 1); + // Src -> Operand + let src_from = prog.iop_template_var(OperandKind::Src, 0); + let src_to = prog.iop_template_var(OperandKind::Src, 1); + // Src Amount -> Operand + let src_amount = prog.iop_template_var(OperandKind::Src, 2); + + // Add Comment header + prog.push_comment("ERC_20 (new_from, new_to) <- (from, to, amount)".to_string()); + + // TODO: Make this a parameter or sweep this + // All these little parameters would be very handy to write an + // exploration/compilation program which would try to minimize latency by + // playing with these. + let kogge_blk_w = 10; + let ripple = true; + + let tree = { + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + let lut = pbs_by_name!("IfFalseZeroed"); + let dst_to = VarCell::from_vec(dst_to); + let dst_from = VarCell::from_vec(dst_from); + let src_to = VarCell::from_vec(src_to); + let src_from = VarCell::from_vec(src_from); + let src_amount = VarCell::from_vec(src_amount); + + // Check if from has enough funds + let enough_fund = iop_cmpx_rtl( + prog, + src_from.clone(), + src_amount.clone(), + pbs_by_name!("CmpGteMrg"), + pbs_by_name!("CmpGte"), + ); + + let src_amount = src_amount + .into_iter() + .map(|x| { + x.mac(tfhe_params.msg_range(), &enough_fund) + .pbs(&lut) + .into_iter() + .next() + .unwrap() + }) + .collect::>(); + + if ripple { + kogge::ripple_add(dst_to, src_to, src_amount.clone(), None) + + kogge::ripple_sub(prog, dst_from, src_from, src_amount) + } else { + kogge::add(prog, dst_to, src_to, src_amount.clone(), None, kogge_blk_w) + + kogge::sub(prog, dst_from, src_from, src_amount, kogge_blk_w) + } + }; + tree.add_to_prog(prog); +} + +// 
---------------------------------------------------------------------------- +// Helper Functions +// ---------------------------------------------------------------------------- +fn iop_addx( + prog: &mut Program, + dst: Vec, + src_a: Vec, + src_b: Vec, +) { + { + // Convert MetaVarCell in VarCell for Rtl analysis + let a = VarCell::from_vec(src_a); + let b = VarCell::from_vec(src_b); + // Do a + b with the kogge stone adder + kogge::cached_add(prog, a, b, None, dst) + } // Any reference to any metavar not linked to the RTL is dropped here + .add_to_prog(prog); +} + +fn iop_subx( + prog: &mut Program, + dst: Vec, + src_a: Vec, + src_b: Vec, +) { + { + // Convert MetaVarCell in VarCell for Rtl analysis + let a = VarCell::from_vec(src_a); + let b = VarCell::from_vec(src_b); + let b_inv = bw_inv(prog, b); + let one = Carry::Ripple(RippleCarry(VarCell::from(prog.new_imm(1)))); + kogge::cached_add(prog, a, b_inv, Some(one), dst) + } + .add_to_prog(prog); +} + +/// Generic mul operation +/// One destination and two sources operation +/// Source could be Operand or Immediat +#[instrument(level = "trace", skip(prog))] +pub fn iop_mulx( + prog: &mut Program, + dst: Vec, + src_a: Vec, + src_b: Vec, +) -> Rtl { + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + let blk_w = props.blk_w(); + + // Transform metavars into RTL vars + let mut dst = VarCell::from_vec(dst); + let src_a = VarCell::from_vec(src_a); + let src_b = VarCell::from_vec(src_b); + let max_deg = VarDeg { + deg: props.max_val(), + nu: props.nu, + }; + + let pbs_msg = pbs_by_name!("MsgOnly"); + let pbs_carry = pbs_by_name!("CarryInMsg"); + let pbs_mul_lsb = pbs_by_name!("MultCarryMsgLsb"); + let pbs_mul_msb = pbs_by_name!("MultCarryMsgMsb"); + + let mut mul_map: HashMap> = HashMap::new(); + itertools::iproduct!(0..blk_w, 0..blk_w).for_each(|(i, j)| { + let pp = src_a[i].mac(tfhe_params.msg_range(), &src_b[j]); + let lsb = pp.single_pbs(&pbs_mul_lsb); + let msb = 
pp.single_pbs(&pbs_mul_msb); + mul_map + .entry(i + j) + .or_default() + .push(VarCellDeg::new(props.max_msg(), lsb)); + mul_map + .entry(i + j + 1) + .or_default() + .push(VarCellDeg::new(props.max_msg(), msb)); + }); + + for (blk, dst) in dst.iter_mut().enumerate() { + let mut to_sum: VecVarCellDeg = mul_map.remove(&blk).unwrap().into(); + let mut bootstrap = |sum: &VarCellDeg| -> VarCellDeg { + trace!(target: "llt:mulx:bootstrap", "bootstrap: {:?}", sum); + if sum.deg.deg > props.max_msg() { + mul_map.entry(blk + 1).or_default().push(VarCellDeg::new( + sum.deg.deg >> props.msg_w, + sum.var.single_pbs(&pbs_carry), + )); + } + VarCellDeg::new(props.max_msg(), sum.var.single_pbs(&pbs_msg)) + }; + + while to_sum.len() > 1 { + to_sum = to_sum + .deg_chunks(&max_deg) + // Leveled Sum + .map(|mut chunk| { + trace!(target: "ilp:mulx", "leveled chunk: {:?}", chunk); + + while chunk.len() > 1 { + chunk = chunk + .chunks(2) + .map(|chunk| match chunk.len() { + 1 => chunk[0].clone(), + 2 => &chunk[0] + &chunk[1], + _ => panic!("Invalid chunk size"), + }) + .collect() + } + + chunk.into_iter().next().unwrap() + }) + // Bootstrap + .map(|sum| { + assert!(sum.deg.nu <= props.nu); + if sum.deg == max_deg { + bootstrap(&sum) + } else { + sum + } + }) + .collect::>() + .into(); + + // If no element has been bootstrapped, bootstrap the worst case + // This will be very unlikely, but if it ever happened it would have hanged + // the whole loop. Also, the output needs to be bootstrapped, + // anyway. 
+ to_sum.0.iter().all(|x| x.deg.nu > 1).then(|| { + let max = to_sum.max_mut().unwrap(); + *max = bootstrap(max); + }); + } + + let out = to_sum.first().unwrap(); + + assert!( + { + let deg = out.deg.clone(); + deg.deg <= props.max_msg() && deg.nu == 1 + }, + "Output variable {blk} is not bootstrapped" + ); + + *dst <<= &out.var; + } + + Rtl::from(dst) +} + +/// Generic Cmp operation +/// One destination block and two sources operands +/// Source could be Operand or Immediat +#[instrument(level = "trace", skip(prog))] +pub fn iop_cmpx( + prog: &mut Program, + dst: &metavar::MetaVarCell, + src_a: &[metavar::MetaVarCell], + src_b: &[metavar::MetaVarCell], + mrg_lut: Pbs, + cmp_lut: Pbs, +) -> Rtl { + let mut dst = VarCell::from(dst); + let src_a = src_a.iter().map(VarCell::from).collect(); + let src_b = src_b.iter().map(VarCell::from).collect(); + dst <<= &iop_cmpx_rtl(prog, src_a, src_b, mrg_lut, cmp_lut); + Rtl::from(vec![dst]) +} + +/// Generic Cmp operation +/// One destination block and two sources operands +/// Source could be Operand or Immediat +#[instrument(level = "trace", skip(prog))] +pub fn iop_cmpx_rtl( + prog: &mut Program, + src_a: Vec, + src_b: Vec, + mrg_lut: Pbs, + cmp_lut: Pbs, +) -> VarCell { + let props = prog.params(); + let tfhe_params: asm::DigitParameters = props.clone().into(); + + // Wrapped given cmp_op and comp_sign lookup table in MetaVar + let pbs_none = pbs_by_name!("None"); + let cmp_sign = pbs_by_name!("CmpSign"); + let cmp_reduce = pbs_by_name!("CmpReduce"); + + // Pack A and B elements by pairs + let packed = std::iter::zip(src_a.chunks(2), src_b.chunks(2)) + .map(|(a, b)| { + let pack_a = if a.len() > 1 { + // Reset noise for future block merge through sub + a[0].mac(tfhe_params.msg_range(), &a[1]) + .single_pbs(&pbs_none) + } else { + a[0].clone() + }; + + let pack_b = if b.len() > 1 { + b[0].mac(tfhe_params.msg_range(), &b[1]) + .single_pbs(&pbs_none) + } else { + b[0].clone() + }; + (pack_a, pack_b) + }) + .collect::>(); + + 
let mut merged = packed + .into_iter() + .map(|(a, b)| &(&a - &b).single_pbs(&cmp_sign) + 1) + .collect::>(); + + while merged.len() > 2 { + merged = merged + .into_iter() + .chunks(2) + .into_iter() + .map(|mut chunk| { + let left = chunk.next(); + let right = chunk.next(); + match (left, right) { + (Some(l), None) => l, + (Some(l), Some(r)) => { + l.mac(tfhe_params.msg_range(), &r).single_pbs(&cmp_reduce) + } + _ => panic!("Chunk misbehaved"), + } + }) + .collect() + } + + match merged.len() { + 2 => merged[0] + .mac(tfhe_params.msg_range(), &merged[1]) + .single_pbs(&mrg_lut), + 1 => merged[0].single_pbs(&cmp_lut), + _ => panic!("Fix your bugs!"), + } +} + +fn bw_inv(prog: &mut Program, b: Vec) -> Vec { + let blk_w = prog.params().blk_w(); + let imm = (0..blk_w).map(|_| VarCell::from(prog.new_imm((1 << prog.params().msg_w) - 1))); + b.iter() + .zip_longest(imm) + .map(|r| match r { + EitherOrBoth::Right(i) => i, + EitherOrBoth::Both(b, i) => &i - b, + EitherOrBoth::Left(_) => { + panic!( + "The input to be inverted is greater than blk_w({}): {}", + blk_w, + b.len() + ) + } + }) + .collect::>() +} diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/vardeg.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/vardeg.rs new file mode 100644 index 000000000..e93682074 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/llt/vardeg.rs @@ -0,0 +1,160 @@ +use super::rtl::VarCell; +use tracing::trace; + +#[derive(Clone, Eq, Default, Debug)] +pub struct VarDeg { + pub deg: usize, + pub nu: usize, +} + +impl std::ops::Add for &VarDeg { + type Output = VarDeg; + + fn add(self, rhs: Self) -> Self::Output { + VarDeg { + deg: self.deg + rhs.deg, + nu: self.nu + rhs.nu, + } + } +} + +impl PartialOrd for VarDeg { + fn partial_cmp(&self, other: &VarDeg) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for VarDeg { + fn cmp(&self, other: &VarDeg) -> std::cmp::Ordering { + if self.deg > other.deg || self.nu > other.nu { + std::cmp::Ordering::Greater + } else if 
self.deg == other.deg || self.nu == other.nu { + std::cmp::Ordering::Equal + } else { + std::cmp::Ordering::Less + } + } +} + +impl PartialEq for VarDeg { + fn eq(&self, other: &VarDeg) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } +} + +#[derive(Clone, Eq)] +pub struct VarCellDeg { + pub var: VarCell, + pub deg: VarDeg, +} + +impl PartialOrd for VarCellDeg { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for VarCellDeg { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.deg.cmp(&other.deg) + } +} + +impl PartialEq for VarCellDeg { + fn eq(&self, other: &VarCellDeg) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } +} + +impl std::ops::Add for &VarCellDeg { + type Output = VarCellDeg; + + fn add(self, rhs: Self) -> Self::Output { + VarCellDeg { + var: &self.var + &rhs.var, + deg: &self.deg + &rhs.deg, + } + } +} + +impl VarCellDeg { + pub fn new(deg: usize, var: VarCell) -> Self { + VarCellDeg { + var, + deg: VarDeg { deg, nu: 1 }, + } + } +} + +#[derive(Debug)] +pub struct VecVarCellDeg(pub Vec); + +impl From> for VecVarCellDeg { + fn from(v: Vec) -> Self { + VecVarCellDeg(v) + } +} + +impl std::fmt::Debug for VarCellDeg { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VarCellDeg") + .field("deg", &self.deg.deg) + .field("nu", &self.deg.nu) + .finish() + } +} + +impl VecVarCellDeg { + pub fn deg_chunks( + mut self, + max_deg: &VarDeg, + ) -> > as IntoIterator>::IntoIter { + trace!(target: "ilp:deg_chunks", "len: {:?}, {:?}", self.len(), self.0); + + let mut res: Vec> = Vec::new(); + let mut acc: VarDeg = VarDeg::default(); + let mut chunk: Vec = Vec::new(); + + // There are many ways to combine the whole vector in chunks up to + // max_deg. We'll be greedy and sum up the elements by maximum degree + // first. 
+ self.0.sort(); + + while !self.is_empty() { + let sum = &acc + &self.0.last().unwrap().deg; + if sum <= *max_deg { + chunk.push(self.0.pop().unwrap()); + acc = sum; + } else { + res.push(chunk); + acc = VarDeg::default(); + chunk = Vec::new(); + } + trace!(target: "ilp:deg_chunks:loop", "len: {:?}, {:?}, chunk: {:?}, + acc: {:?}", self.len(), self.0, chunk, acc); + } + + // Any remaining chunk is appended + if !chunk.is_empty() { + res.push(chunk); + } + + res.into_iter() + } + + pub fn first(self) -> Option { + self.0.into_iter().next() + } + + pub fn max_mut(&mut self) -> Option<&mut VarCellDeg> { + self.0.iter_mut().max() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.len() == 0 + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/fw_impl/mod.rs b/backends/tfhe-hpu-backend/src/fw/fw_impl/mod.rs new file mode 100644 index 000000000..3ceeb8db9 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/fw_impl/mod.rs @@ -0,0 +1,45 @@ +use super::*; +use crate::asm::{AsmIOpcode, DOp, IOpcode}; + +pub mod demo; +pub mod ilp; +pub mod llt; + +/// Utility macro to define new FW implementation +#[macro_export] +macro_rules! impl_fw { + ( + $name: literal + [ + $($opcode: ident => $func: expr $(;)?)* + ] + ) => { + ::paste::paste! 
{ + pub struct [<$name:camel>](); + + impl Default for [<$name:camel>]{ + fn default() -> Self { + Self() + } + } + + impl Fw for [<$name:camel>]{ + fn expand(&self, params: &FwParameters, iopcode: &AsmIOpcode) -> asm::Program { + let mut prog = program::Program::new(params); + match IOpcode::from(iopcode) { + $( + IOpcode($opcode) => { + prog.set_op(iopcode.format.as_ref() + .map(|a| a.name.as_str()) + .unwrap_or("default")); + $func(&mut prog) + }, + )* + _ => panic!("Fw {} doesn't support `{iopcode}`", $name), + } + prog.into() + } + } + } + }; +} diff --git a/backends/tfhe-hpu-backend/src/fw/isc_sim/mod.rs b/backends/tfhe-hpu-backend/src/fw/isc_sim/mod.rs new file mode 100644 index 000000000..72b6deb8e --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/isc_sim/mod.rs @@ -0,0 +1,160 @@ +//! Model the instruction flom in HPU. +//! +//! Use a simple event based time simulation and simple PE modelisation + +use bitflags::bitflags; +use serde::{Deserialize, Serialize}; + +/// Implement a pool that mimics the RTL +mod pool; +use pool::Pool; + +/// Implement simple model of Pe +mod pe; +pub(crate) use pe::{Flush as PeFlush, PeStore}; +pub use pe::{PeConfig, PeConfigStore, PeCost}; + +/// Implement time simulation of Isc +mod scheduler; +pub use scheduler::Scheduler; + +pub(crate) mod report; + +use crate::asm; + +/// Event used for modelisation of time advance +/// Contain the cycle in which the event must occurred and the associated event type +#[derive(Debug)] +pub struct Event { + pub(crate) at_cycle: usize, + pub(crate) event_type: EventType, +} + +impl Event { + pub(crate) fn new(event_type: EventType, at_cycle: usize) -> Self { + Self { + at_cycle, + event_type, + } + } +} + +/// Event are stored in a BinaryHeap and we want to pop the smallest one firs +/// Thuse Ord trait is implemented in a "reverse". 
+impl Ord for Event { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.at_cycle.cmp(&other.at_cycle).reverse() + } +} + +impl PartialOrd for Event { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl PartialEq for Event { + fn eq(&self, other: &Self) -> bool { + self.at_cycle == other.at_cycle + } +} +impl Eq for Event {} + +/// Kind of the event +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum EventType { + RdUnlock(InstructionKind, usize), + WrUnlock(InstructionKind, usize), + ReqTimeout(InstructionKind, usize), + DelTimeout(InstructionKind, usize), + BatchStart { pe_id: usize, issued: usize }, + QuantumEnd, + BpipTimeout, + Query, +} + +bitflags! { +/// Instruction are dispatch on Pe based on their kind +/// However, we also need to filter on a multi-kind fashion, thus we rely on bitflag instead of std +/// rust enum + #[repr(transparent)] + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize, Hash)] + pub struct InstructionKind: usize { + const None = 0x00; + const MemLd= 0x01; + const MemSt= 0x02; + const Arith= 0x04; + const Pbs = 0x08; + const Sync = 0x10; + } +} + +impl std::fmt::Display for InstructionKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let kind = match *self { + Self::None => "None", + Self::MemLd => "MemLd", + Self::MemSt => "MemSt", + Self::Arith => "Arith", + Self::Pbs => "Pbs", + Self::Sync => "Sync", + _ => "MultiKind", + }; + write!(f, "{kind}") + } +} + +impl From<&asm::DOp> for InstructionKind { + fn from(value: &asm::DOp) -> Self { + match value { + asm::DOp::ADD(_) + | asm::DOp::SUB(_) + | asm::DOp::MAC(_) + | asm::DOp::ADDS(_) + | asm::DOp::SUBS(_) + | asm::DOp::SSUB(_) + | asm::DOp::MULS(_) => Self::Arith, + asm::DOp::LD(_) => Self::MemLd, + asm::DOp::ST(_) => Self::MemSt, + asm::DOp::PBS(_) + | asm::DOp::PBS_ML2(_) + | asm::DOp::PBS_ML4(_) + | asm::DOp::PBS_ML8(_) => Self::Pbs, + 
asm::DOp::PBS_F(_) + | asm::DOp::PBS_ML2_F(_) + | asm::DOp::PBS_ML4_F(_) + | asm::DOp::PBS_ML8_F(_) => Self::Pbs, + asm::DOp::SYNC(_) => Self::Sync, + } + } +} + +/// Use in the execution trace +#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum Query { + Refill, + Issue, + RdUnlock, + Retire, +} + +#[derive(Debug, Serialize, Deserialize)] +enum TraceEvent { + Query { cmd: Query, slot: pool::Slot }, + Timeout, + ReqTimeout(usize), + DelTimeout, + BatchStart { pe_id: usize, issued: usize }, +} + +/// Generate a detailed execution trace that could be read afterward +#[derive(Debug, Serialize, Deserialize)] +pub struct Trace { + timestamp: usize, + event: TraceEvent, +} + +impl std::fmt::Display for Trace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "@{}::{:?}", self.timestamp, self.event) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/isc_sim/pe.rs b/backends/tfhe-hpu-backend/src/fw/isc_sim/pe.rs new file mode 100644 index 000000000..aec3e7dbc --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/isc_sim/pe.rs @@ -0,0 +1,572 @@ +use std::collections::VecDeque; + +use serde::{Deserialize, Serialize}; + +use crate::prelude::{HpuConfig, HpuParameters}; + +use super::*; + +use tracing::trace; + +#[derive(PartialEq, Eq, Clone, Copy)] +pub enum Flush { + Timeout, + Force, + BatchFull, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +enum BatchCost { + Fixed(usize), + Linear { + cnst: usize, // Fixed batch cost + ppbs: usize, // Cost per PBS + bmin: usize, // The minimum batch size + }, +} + +impl Default for BatchCost { + fn default() -> Self { + BatchCost::Fixed(0) + } +} + +impl BatchCost { + fn cost(&self, batch_size: usize) -> usize { + match self { + BatchCost::Fixed(cost) => *cost, + BatchCost::Linear { cnst, ppbs, bmin } => *cnst + *ppbs * batch_size.max(*bmin), + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct PeCost { + rd_lock: BatchCost, + wr_lock: BatchCost, +} + 
+#[derive(Clone, Debug, Default)] +pub struct PeStats { + pub batches: usize, + pub usage_sum: f64, + pub issued: usize, + pub by_timeout: usize, + pub by_batchfull: usize, +} + +#[derive(Clone, Debug, Deserialize, Serialize, Copy)] +pub struct BatchSize { + pub min: usize, + pub max: usize, +} + +impl Default for BatchSize { + fn default() -> Self { + BatchSize { min: 1, max: 1 } + } +} + +#[derive(Clone, Debug, Default)] +pub(crate) struct Pe { + // PE Batch Configuration + // The limits of the batch size for the PE + batch_size: BatchSize, + flush_opportunism: bool, + // Runtime State + // A limit to instructions in the PE + pe_limit: usize, + // The current FIFO limit + fifo_limit: usize, + // The current limit to start execution. Can be less than batch_size.max + batch_limit: usize, + // Flush state of the instructions currently in the PE+Queue + in_fifo: VecDeque, + // Instructions in the FIFO that are reading from the regfile + reading: usize, + // Instructions in the FIFO that have finished reading but not yet executing + waiting: usize, + // Instructions in the FIFO that are executing + executing: usize, + timeout_active: bool, + cost: PeCost, + kind: InstructionKind, + stats: PeStats, +} + +impl Pe { + fn pending(&self) -> usize { + self.in_fifo.len() + } + + fn fifo_free(&self) -> usize { + self.fifo_limit.saturating_sub(self.pending()) + } + + fn set_batch_limit(&mut self, limit: usize) { + self.batch_limit = limit; + } + + fn is_busy(&self) -> bool { + self.executing != 0 + } + + fn is_full(&self) -> bool { + self.fifo_free() == 0 + } + + fn avail_kind(&self) -> InstructionKind { + if self.is_full() { + InstructionKind::None + } else { + self.kind + } + } + + fn push(&mut self, flush: bool) { + self.in_fifo.push_back(flush); + assert!( + self.in_fifo.len() <= self.fifo_limit, + "Pushed above the PE fifo limit" + ) + } + + fn rd_unlock(&mut self) { + assert!(self.reading > 0, "RdUnlock request on already unlock pe"); + self.reading -= 1; + self.waiting 
+= 1; + } + + fn wr_unlock(&mut self) { + assert!(0 < self.executing, "WrUnlock request on a non-busy PE"); + self.executing -= 1; + self.in_fifo.pop_front(); + } + + fn probe_for_exec( + &mut self, + pe_id: usize, + at_cycle: usize, + batch_flush: Option, + ) -> Vec { + let mut evt = Vec::new(); + + // Check if any instruction can be read + let rd = self + .pe_limit + .min(self.pending()) + .saturating_sub(self.reading + self.waiting + self.executing); + if rd > 0 { + evt.extend((0..rd).map(|_| { + Event::new( + EventType::RdUnlock(self.kind, pe_id), + at_cycle + self.cost.rd_lock.cost(rd), + ) + })); + self.reading += rd; + } + + if !self.is_busy() { + // Check if a batch can be issued + let issued = (0..self.waiting) + .map(|i| self.in_fifo[i]) + // Check if there's a forced flush queued + .position(|c| c) + .and_then(|p| { + if self.flush_opportunism { + // With flush_opportunism, we flush everything that is + // waiting + (self.waiting < self.batch_limit).then_some((Flush::Force, self.waiting)) + } else { + // Else, flush exactly up to the first queued flush + (p < self.batch_limit).then(|| (Flush::Force, p + 1)) + } + }) + // If not, check if the batch is full + .or_else(|| { + (self.waiting >= self.batch_limit) + .then_some((Flush::BatchFull, self.batch_limit)) + }) + // If not, check if there's a timeout or any other reason to + // flush + .or_else(|| { + batch_flush + .map(|b| (b, self.waiting)) + .filter(|(_, pdg)| *pdg > 0) + }); + + if let Some((flush, issued)) = issued { + // update state + self.waiting -= issued; + self.executing += issued; + self.stats.issued += issued; + self.stats.batches += 1; + self.stats.by_timeout += (flush == Flush::Timeout) as usize; + self.stats.by_batchfull += (flush == Flush::BatchFull) as usize; + self.stats.usage_sum += (issued as f64 / self.batch_size.min as f64).min(1.0f64); + + evt.push(Event::new( + EventType::BatchStart { pe_id, issued }, + at_cycle, + )); + + if self.timeout_active && self.batch_limit > 1 { + 
evt.push(Event::new( + EventType::DelTimeout(self.kind, pe_id), + at_cycle, + // +1 To make sure the timer is deleted after being + // restarted + )); + self.timeout_active = false; + } + + // Register unlock event + evt.extend((0..issued).map(|_| { + Event::new( + EventType::WrUnlock(self.kind, pe_id), + at_cycle + self.cost.wr_lock.cost(issued), + ) + })); + } else if !self.timeout_active && self.waiting > 0 && self.batch_limit > 1 { + self.timeout_active = true; + evt.push(Event::new( + EventType::ReqTimeout(self.kind, pe_id), + at_cycle, + )); + } + } + + evt + } + + pub fn reset_stats(&mut self) { + self.stats = PeStats::default(); + } + + pub fn stats(&self) -> PeStats { + self.stats.clone() + } +} + +#[derive(Clone, Debug)] +pub(crate) struct PeStore(pub(crate) Vec<(String, Pe)>); + +impl PeStore { + pub(crate) fn avail_kind(&self) -> InstructionKind { + self.0 + .iter() + .fold(InstructionKind::None, |acc, pe| acc | pe.1.avail_kind()) + } + + pub(crate) fn push(&mut self, kind_1h: InstructionKind, flush: bool) { + // TODO check that kind is really one hot + let mut capable_pe = self + .0 + .iter_mut() + .filter(|(_, pe)| InstructionKind::None != (pe.kind & kind_1h)) + .collect::>(); + + assert_eq!( + capable_pe.len(), + 1, + "Found {} capable pe for {:?}, unsupported", + capable_pe.len(), + kind_1h, + ); + capable_pe[0].1.push(flush); + } + + pub(crate) fn try_push(&mut self, kind_1h: InstructionKind, flush: bool) -> Option { + let mut capable_pe = self + .0 + .iter_mut() + .enumerate() + .filter(|(_, (_, pe))| (InstructionKind::None != (pe.kind & kind_1h)) && !pe.is_full()) + .collect::>(); + + capable_pe.first_mut().map(|(id, (_, pe))| { + pe.push(flush); + *id + }) + } + + pub(crate) fn probe_for_exec_id( + &mut self, + id: usize, + at_cycle: usize, + batch_flush: Option, + ) -> Vec { + self.0[id].1.probe_for_exec(id, at_cycle, batch_flush) + } + + pub(crate) fn probe_for_exec( + &mut self, + at_cycle: usize, + batch_flush: Option, + ) -> Vec { + let mut 
events = Vec::new(); + self.0.iter_mut().enumerate().for_each(|(id, pe)| { + let evt = pe.1.probe_for_exec(id, at_cycle, batch_flush); + events.extend(evt); + }); + events + } + + pub(crate) fn rd_unlock(&mut self, pe_id: usize) { + self.0[pe_id].1.rd_unlock() + } + + pub(crate) fn wr_unlock(&mut self, pe_id: usize) { + self.0[pe_id].1.wr_unlock() + } + + pub(crate) fn pending(&self) -> usize { + self.0.iter().map(|(_, pe)| pe.pending()).sum::() + } + + pub(crate) fn reset_stats(&mut self) { + self.0.iter_mut().for_each(|(_, pe)| { + pe.reset_stats(); + }); + } + + pub(crate) fn set_min_batch_limit(&mut self) { + self.0.iter_mut().for_each(|(_, pe)| { + pe.set_batch_limit(pe.batch_size.min); + }); + } + + pub(crate) fn set_fifo_to_batch_limit(&mut self) { + self.0.iter_mut().for_each(|(_, pe)| { + pe.fifo_limit = pe.batch_limit; + }); + } +} + +/// Ligther view of Pe with only the parameters not the runtime state +/// Use for serde in config file +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PeConfig { + pub cost: PeCost, + pub kind: InstructionKind, + pub batch_size: BatchSize, // The batch sizes + pub pe_limit: Option, // The limit on the number of PBSs in the PE + pub in_limit: Option, // The limit on the input fifo before the PE + pub flush_opportunism: bool, /* Whether the PE is opportunistic when + * scheduling */ +} + +impl PeConfig { + pub fn new( + cost: PeCost, + kind: InstructionKind, + batch_size: BatchSize, + pe_limit: Option, + in_limit: Option, + flush_opportunism: bool, + ) -> Self { + Self { + cost, + kind, + batch_size, + pe_limit, + in_limit, + flush_opportunism, + } + } +} + +impl From for Pe { + fn from(config: PeConfig) -> Self { + let PeConfig { + cost, + kind, + batch_size, + pe_limit, + in_limit, + flush_opportunism, + } = config; + + assert!(batch_size.max > 0, "Invalid batch_size value"); + Self { + cost, + kind, + batch_size, + flush_opportunism, + pe_limit: pe_limit.unwrap_or(usize::MAX), + fifo_limit: pe_limit + 
.unwrap_or(usize::MAX) + .saturating_add(in_limit.unwrap_or(usize::MAX)), + batch_limit: batch_size.max, + in_fifo: VecDeque::new(), + ..Default::default() + } + } +} + +impl From<&Pe> for PeConfig { + fn from(pe: &Pe) -> Self { + Self { + cost: pe.cost.clone(), + kind: pe.kind, + batch_size: pe.batch_size, + pe_limit: Some(pe.pe_limit), + in_limit: Some(pe.fifo_limit - pe.pe_limit), + flush_opportunism: pe.flush_opportunism, + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct PeConfigStore(pub Vec<(String, PeConfig)>); + +impl PeConfigStore { + pub fn new(store: Vec<(String, PeConfig)>) -> Self { + Self(store) + } +} + +/// Construct PeConfigStore directly from HpuParameters +/// Use RTL parameters to compute the expected performances +impl From<(&HpuParameters, &HpuConfig)> for PeConfigStore { + fn from(tuple_config: (&HpuParameters, &HpuConfig)) -> Self { + let (params, config) = tuple_config; + // TODO: Add register to depicts the number of computation units (NB: Currently fixed to 1) + let ldst_pe_nb = 1; + let lin_pe_nb = 1; + let pbs_pe_nb = 1; + let total_pbs_nb = params.ntt_params.total_pbs_nb; + let in_limit = Some(8); // TODO: Add registers with this information per PE + + // Extract used parameters for ease of access + let batch_pbs = params.ntt_params.batch_pbs_nb; + let lwe_k = params.pbs_params.lwe_dimension; + let glwe_k = params.pbs_params.glwe_dimension; + let poly_size = params.pbs_params.polynomial_size; + let flush_opportunism = config.rtl.bpip_use_opportunism; + let pem_axi_w = params.pc_params.pem_pc * params.pc_params.pem_bytes_w * 8; + let ct_w = params.ntt_params.ct_width as usize; + let lbx = params.ks_params.lbx; + let min_batch_size = params.ntt_params.min_pbs_nb.unwrap(); + + // Compute some intermediate values + let blwe_coefs = (poly_size * glwe_k) + 1; + let glwe_coefs = poly_size * (glwe_k + 1); + let rpsi = params.ntt_params.radix * params.ntt_params.psi; + + // Cycles required to load a ciphertext in the 
computation pipe + let ct_load_cycles = usize::div_ceil(glwe_coefs * params.pbs_params.pbs_level, rpsi); + // Latency of a Cmux for a batch + let cmux_lat = ct_load_cycles * batch_pbs; + + // NB: Keyswitch latency is dimension to match roughly the Cmux latency (with lbx coefs in + // //) Keep this approximation here + let ks_cycles = cmux_lat * lbx; + + let mut pe_config_store = Vec::with_capacity(ldst_pe_nb + lin_pe_nb + batch_pbs); + + // LoadStore + // Load store performance is computed as access_cycle *2 + // Take 2 as really raw approximation + // LoadStore operation don't support early rd_unlock -> assign same value as wr_unlock + let ldst_raw_cycle = (blwe_coefs * ct_w).div_ceil(pem_axi_w); + let ldst_cycle = ldst_raw_cycle * 2; + for i in 0..ldst_pe_nb { + let name = format!("LdSt_{i}"); + let cost = PeCost { + rd_lock: BatchCost::Fixed(ldst_cycle), + wr_lock: BatchCost::Fixed(1), + }; + let kind = InstructionKind::MemLd | InstructionKind::MemSt; + pe_config_store.push(( + name, + PeConfig::new(cost, kind, BatchSize::default(), Some(1), in_limit, true), + )); + } + + // Linear operation + // Linear operation performance is computed roughly as glwe_n*glwe_k + // In practice this could be lower if multiple coefs are handle in // + // Linear operation don't support early rd_unlock -> assign same value as wr_unlock + let lin_cycle = blwe_coefs; + for i in 0..lin_pe_nb { + let name = format!("Lin_{i}"); + let cost = PeCost { + rd_lock: BatchCost::Fixed(lin_cycle), + wr_lock: BatchCost::Fixed(1), + }; + let kind = InstructionKind::Arith; + pe_config_store.push(( + name, + PeConfig::new(cost, kind, BatchSize::default(), Some(1), in_limit, true), + )); + } + + // KsPbs operation + // View as PeBatch unit + // IPIP/BPIP Mode is handle by the scheduler module + // Thus we view the KsPbs engine as a list of batch_pbs alu with full latency each + let kspbs_rd_cycle = blwe_coefs.div_ceil(params.regf_params.coef_nb); + let kspbs_cnst_cost = kspbs_rd_cycle; // write to 
regfile + let kspbs_pbs_cost = ( + ks_cycles // latency of keyswitch + + lwe_k * cmux_lat // Loop of cmux lat + + batch_pbs * blwe_coefs.div_ceil(rpsi / 2 /* approx */) + //Sample extract latency + ) / batch_pbs; + + for i in 0..pbs_pe_nb { + let name = format!("KsPbs_{i}"); + let cost = PeCost { + rd_lock: BatchCost::Fixed(kspbs_rd_cycle), + wr_lock: BatchCost::Linear { + cnst: kspbs_cnst_cost, + ppbs: kspbs_pbs_cost, + bmin: min_batch_size, + }, + }; + let kind = InstructionKind::Pbs; + pe_config_store.push(( + name, + PeConfig::new( + cost, + kind, + BatchSize { + min: min_batch_size, + max: batch_pbs, + }, + Some(total_pbs_nb), + in_limit, + flush_opportunism, + ), + )); + } + + trace!("pe_config_store: {:?}", pe_config_store); + + Self::new(pe_config_store) + } +} + +impl From for PeStore { + fn from(config: PeConfigStore) -> Self { + let store = config + .0 + .into_iter() + .map(|(name, pe)| (name, Pe::from(pe))) + .collect::>(); + + Self(store) + } +} +impl From<&PeStore> for PeConfigStore { + fn from(store: &PeStore) -> Self { + let config = store + .0 + .iter() + .map(|(name, pe)| (name.clone(), PeConfig::from(pe))) + .collect::>(); + + Self(config) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/isc_sim/pool.rs b/backends/tfhe-hpu-backend/src/fw/isc_sim/pool.rs new file mode 100644 index 000000000..a197f31ef --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/isc_sim/pool.rs @@ -0,0 +1,531 @@ +use asm::dop::{ + DOpPbs, DOpPbsF, DOpPbsMl2, DOpPbsMl2F, DOpPbsMl4, DOpPbsMl4F, DOpPbsMl8, DOpPbsMl8F, IsFlush, + ToAsm, +}; +use asm::PbsLut; +use tracing::instrument; + +use super::*; + +#[derive(Debug)] +pub struct Pool { + max_depth: usize, + store: Vec, +} + +impl std::fmt::Display for Pool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Pool content [{}]:", self.max_depth)?; + for (i, slot) in self.store.iter().enumerate() { + writeln!(f, "{i} -> {slot:?}")?; + } + Ok(()) + } +} + +#[derive(Debug)] +pub enum 
IssueEvt { + None, + DOp { + kind_1h: InstructionKind, + flush: bool, + slot: Slot, + }, + Sync(Slot), +} + +impl Pool { + pub fn new(isc_depth: usize) -> Self { + Self { + max_depth: isc_depth, + store: Vec::with_capacity(isc_depth), + } + } + + /// Check if the pool is full + pub fn is_full(&self) -> bool { + self.store.len() >= self.max_depth + } + + /// This function find the first matching slot update it in place + /// And also update lock counter of all matching slot + /// kind_mh is an aggregtion of pending rd_unlock + #[instrument(level = "trace", skip(self))] + pub fn rd_unlock(&mut self, kind_mh: InstructionKind) -> (InstructionKind, &Slot) { + // 1. find matching slot and update + // -> Search for oldest issued instruction with matching kind + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + pdg: Some(true), + kind: Some(kind_mh), + ..Default::default() + }; + let mut slot = self.first_match(filter).expect("RdUnlock unmatched"); + slot.state.rd_pdg = false; + + // 2. Decrease matching rd_lock cnt + let filter = Filter { + vld: Some(true), + pdg: Some(false), + srcs_on_dst: Some((slot.inst.srca_id, slot.inst.srcb_id)), + ..Default::default() + }; + self.idx_matches(filter).into_iter().for_each(|idx| { + tracing::trace!("RdLock decrement -> {:?}", self.store[idx]); + // TODO dig in this condition + // Find a case that required the underflow filtering + if self.store[idx].state.rd_lock != 0 { + self.store[idx].state.rd_lock -= 1; + } + }); + + // 3. 
Insert modified slot back + let kind_1h = slot.inst.kind; + self.store.push(slot); + // Use hand call to trace to prevent closure escape of ref with #[instrument(ret)] + tracing::trace!("Return: {:?}", self.store.last().unwrap()); + (kind_1h, self.store.last().unwrap()) + } + + /// This function find the first matching slot update it in place + /// And also update lock counter of all matching slot + /// kind_mh is an aggregtion of pending wr_unlock + #[instrument(level = "trace", skip(self), ret)] + pub fn retire(&mut self, kind_mh: InstructionKind) -> Slot { + // 1. find matching slot and update + // -> Search for oldest issued instruction with matching kind + let filter = Filter { + vld: Some(true), + rd_pdg: Some(false), + pdg: Some(true), + kind: Some(kind_mh), + ..Default::default() + }; + let slot = self.first_match(filter).expect("Retire unmatched"); + + // 2. Decrease matching wr_lock cnt + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + pdg: Some(false), + dst_on_srcs: Some(slot.inst.dst_id), + dst_on_dst: Some(slot.inst.dst_id), + ..Default::default() + }; + self.idx_matches(filter).into_iter().for_each(|idx| { + tracing::trace!("WrLock decrement -> {:?}", self.store[idx]); + if self.store[idx].state.wr_lock != 0 { + self.store[idx].state.wr_lock -= 1; + } + }); + + // 2. 
Decrease matching wr_lock cnt of Sync token + let filter = Filter { + vld: Some(true), + kind: Some(InstructionKind::Sync), + sync_id: Some(slot.state.sync_id), + ..Default::default() + }; + self.idx_matches(filter).into_iter().for_each(|idx| { + tracing::trace!("SyncLock decrement -> {:?}", self.store[idx]); + if self.store[idx].state.wr_lock != 0 { + self.store[idx].state.wr_lock -= 1; + } + }); + + slot + } + + /// This function find the first empty slot, populated with DOp information and move it + /// in front position + #[instrument(level = "trace", skip(self), ret)] + pub fn refill(&mut self, sync_id: usize, dop: asm::DOp) -> &Slot { + assert!( + self.store.len() < self.max_depth, + "Refill in a already full pool" + ); + + let op_kind = InstructionKind::from(&dop); + let dst_id = ArgId::from_dst(&dop); + let srca_id = ArgId::from_srca(&dop); + let srcb_id = ArgId::from_srcb(&dop); + let flush = dop.is_flush(); + + // 1. Compute (wr_lock, rd_lock) + // RdLock -> #instruction before us that need to READ into our destination + // WrLock -> #instruction before us that need to Write into one of our sources + let (wr_lock, rd_lock, issue_lock) = if op_kind == InstructionKind::Sync { + // Count vld instruction that match with sync_id + let filter = Filter { + vld: Some(true), + sync_id: Some(sync_id), + ..Default::default() + }; + let sync_lock = self.idx_matches(filter).len(); + (sync_lock, 0, 0) + } else { + // Count vld instruction where our dst match on their srcs + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + dst_on_srcs: Some(dst_id), + ..Default::default() + }; + let rd_lock = self.idx_matches(filter).len(); + + // Count vld instruction where our src match on their dst + let filter = Filter { + vld: Some(true), + srcs_on_dst: Some((srca_id, srcb_id)), + dst_on_dst: Some(dst_id), + ..Default::default() + }; + let wr_lock = self.idx_matches(filter).len(); + + // Count vld instruction that were not issued and are not flushes if + // this is 
a flush and vice-versa. Only for PBSs. + let issue_lock = if op_kind == InstructionKind::Pbs { + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + pdg: Some(false), + flush: Some(!flush), + kind: Some(InstructionKind::Pbs), + ..Default::default() + }; + self.idx_matches(filter).len() + } else { + 0 + }; + + (wr_lock, rd_lock, issue_lock) + }; + + // 2. Create new slot and insert it in store + let slot = Slot { + inst: Instruction { + kind: op_kind, + dst_id, + srca_id, + srcb_id, + flush, + op: dop, + }, + state: State { + sync_id, + rd_lock, + wr_lock, + issue_lock, + vld: true, + rd_pdg: true, + pdg: false, + }, + }; + tracing::debug!("Refill with {slot:?}"); + self.store.push(slot); + self.store.last().unwrap() + } + + /// This function find the first issuable slot if any, update it's information and move it + /// in back position + /// kind_mh is an aggregtion of available pe + #[instrument(level = "trace", skip(self), ret)] + pub fn issue(&mut self, kind_mh: InstructionKind) -> IssueEvt { + // 1. 
find matching slot and update + // -> Search for oldest unissued instruction with matching kind + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + pdg: Some(false), + lock_rdy: Some(true), + kind: Some(kind_mh), + ..Default::default() + }; + if let Some(mut slot) = self.first_match(filter) { + if slot.inst.kind == InstructionKind::Sync { + // Sync are handle with custom logic -> Issue them, directly release the slot + IssueEvt::Sync(slot) + } else { + if slot.inst.kind == InstructionKind::Pbs { + let filter = Filter { + vld: Some(true), + rd_pdg: Some(true), + pdg: Some(false), + flush: Some(!slot.inst.flush), + kind: Some(InstructionKind::Pbs), + ..Default::default() + }; + self.idx_matches(filter).into_iter().for_each(|idx| { + tracing::trace!("Issue decrement -> {:?}", self.store[idx]); + self.store[idx].state.issue_lock = + self.store[idx].state.issue_lock.saturating_sub(1); + }); + } + + // Update slot and insert back + slot.state.pdg = true; + let kind_1h = slot.inst.kind; + let flush = slot.inst.flush; + let trace_slot = slot.clone(); + self.store.push(slot); + IssueEvt::DOp { + kind_1h, + flush, + slot: trace_slot, + } + } + } else { + IssueEvt::None + } + } +} + +impl Pool { + /// Extract the first matching entry from the pool + fn first_match(&mut self, filter: Filter) -> Option { + let match_idx = self.idx_matches(filter); + if let Some(idx) = match_idx.first() { + // extract value + Some(self.store.remove(*idx)) + } else { + None + } + } + + /// Return a vector of matching index + fn idx_matches(&self, filter: Filter) -> Vec { + self.store + .iter() + .enumerate() + .filter(|(_, elem)| { + if let Some(sync_id) = filter.sync_id { + elem.state.sync_id == sync_id + } else { + true + } + }) + .filter(|(_, elem)| { + if let Some(vld) = filter.vld { + elem.state.vld == vld + } else { + true + } + }) + .filter(|(_, elem)| { + if let Some(rd_pdg) = filter.rd_pdg { + elem.state.rd_pdg == rd_pdg + } else { + true + } + }) + .filter(|(_, elem)| { + 
if let Some(pdg) = filter.pdg { + elem.state.pdg == pdg + } else { + true + } + }) + .filter(|(_, elem)| { + if let Some(lock_rdy) = filter.lock_rdy { + elem.state.lock_rdy() == lock_rdy + } else { + true + } + }) + .filter(|(_, elem)| { + if let Some(kind) = filter.kind { + (elem.inst.kind & kind) != InstructionKind::None + } else { + true + } + }) + .filter(|(_, elem)| { + //TODO rework to enhance readability + (if let Some(dst) = &filter.dst_on_srcs { + dst.mode != DOpMode::Unused + && ((elem.inst.srca_id == *dst) || (elem.inst.srcb_id == *dst)) + } else { + true + } && if let Some((srca, srcb)) = &filter.srcs_on_dst { + ((srca.mode != DOpMode::Unused) && (elem.inst.dst_id == *srca)) + || ((srcb.mode != DOpMode::Unused) && (elem.inst.dst_id == *srcb)) + } else { + true + }) || filter + .dst_on_dst + .map(|dst| (dst.mode != DOpMode::Unused) && (elem.inst.dst_id == dst)) + .unwrap_or(false) + }) + .filter(|(_, elem)| filter.flush.map(|f| f == elem.inst.flush).unwrap_or(true)) + .map(|(idx, _)| idx) + .collect::>() + } +} + +/// Instruction Mode -> Rid/Mid +/// Used as src/dst identifier +#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] +enum DOpMode { + Unused, + Memory, + Register(usize), +} +/// Argument Id +/// Use for lock computation and match +#[derive(Debug, Eq, Clone, Copy, Serialize, Deserialize)] +struct ArgId { + mode: DOpMode, + id: usize, +} + +impl PartialEq for ArgId { + fn eq(&self, other: &Self) -> bool { + match (self.mode, other.mode) { + (DOpMode::Memory, DOpMode::Memory) | (DOpMode::Unused, DOpMode::Unused) => { + self.id == other.id + } + (DOpMode::Register(self_msk), DOpMode::Register(other_msk)) => { + // Does range overlaps ?! 
-> yes + ((self.id ^ other.id) & (self_msk & other_msk)) == 0 + } + _ => false, + } + } +} + +impl Default for ArgId { + fn default() -> Self { + ArgId { + mode: DOpMode::Unused, + id: 0, + } + } +} + +impl ArgId { + fn from_arg(arg: asm::DOpArg) -> Self { + match arg { + asm::DOpArg::Reg(rid) => Self { + mode: DOpMode::Register(usize::MAX), + id: rid.0 as usize, + }, + asm::DOpArg::Mem(ms) => { + let id = match ms { + asm::MemId::Addr(ct_id) => ct_id.0 as usize, + _ => panic!("Template must have been resolved before execution"), + }; + Self { + mode: DOpMode::Memory, + id, + } + } + asm::DOpArg::Imm(_) | asm::DOpArg::Pbs(_) | asm::DOpArg::Sync(_) => Self { + mode: DOpMode::Unused, + id: 0, + }, + } + } + + fn from_dst(dop: &asm::DOp) -> Self { + let dst = dop.dst(); + if dst.is_empty() { + // No dest arg -> i.e Sync + Self::default() + } else { + let mut arg = Self::from_arg(dst[0].clone()); + tracing::trace!(target = "pool", "Building dst for {:?}", dop); + match dop { + // Are we sure that this is better than what I had before? 
+ asm::DOp::PBS(DOpPbs(pbs)) + | asm::DOp::PBS_ML2(DOpPbsMl2(pbs)) + | asm::DOp::PBS_ML4(DOpPbsMl4(pbs)) + | asm::DOp::PBS_ML8(DOpPbsMl8(pbs)) + | asm::DOp::PBS_F(DOpPbsF(pbs)) + | asm::DOp::PBS_ML2_F(DOpPbsMl2F(pbs)) + | asm::DOp::PBS_ML4_F(DOpPbsMl4F(pbs)) + | asm::DOp::PBS_ML8_F(DOpPbsMl8F(pbs)) => { + // PBS used multiple contiguous register in case of many-lut + let lut = asm::Pbs::from_hex(pbs.gid).expect("Invalid PbsGid"); + arg.mode = DOpMode::Register(lut.lut_msk()); + tracing::trace!( + target = "pool", + "destination mask for {:?} = {:?}", + pbs, + arg.mode + ); + arg + } + // Otherwise Standard ArgId handling + _ => arg, + } + } + } + + fn from_srca(dop: &asm::DOp) -> Self { + let src = dop.src(); + if src.is_empty() { + // No src arg -> i.e Sync + Self::default() + } else { + Self::from_arg(src[0].clone()) + } + } + fn from_srcb(dop: &asm::DOp) -> Self { + let src = dop.src(); + if src.len() < 2 { + Self::default() + } else { + Self::from_arg(src[1].clone()) + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct Instruction { + pub(crate) op: asm::DOp, + pub(crate) kind: InstructionKind, + dst_id: ArgId, + srca_id: ArgId, + srcb_id: ArgId, + flush: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct State { + sync_id: usize, + // RdLock -> #instruction before us that need to READ into our destination + rd_lock: usize, + // WrLock -> #instruction before us that need to Write into one of our sources + wr_lock: usize, + // IssueLock -> #instruction before us that need to be issued + issue_lock: usize, + vld: bool, + rd_pdg: bool, + pdg: bool, +} +impl State { + fn lock_rdy(&self) -> bool { + (self.rd_lock | self.wr_lock | self.issue_lock) == 0 + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct Slot { + pub(crate) inst: Instruction, + pub(crate) state: State, +} + +#[derive(Default, Debug)] +struct Filter { + sync_id: Option, + vld: Option, + rd_pdg: Option, + pdg: Option, + 
lock_rdy: Option, + kind: Option, + dst_on_srcs: Option, + srcs_on_dst: Option<(ArgId, ArgId)>, + dst_on_dst: Option, + flush: Option, +} diff --git a/backends/tfhe-hpu-backend/src/fw/isc_sim/report.rs b/backends/tfhe-hpu-backend/src/fw/isc_sim/report.rs new file mode 100644 index 000000000..6d0858df7 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/isc_sim/report.rs @@ -0,0 +1,94 @@ +//! Report structure + +use std::collections::HashMap; + +use super::InstructionKind; + +use super::pe::{Pe, PeStats, PeStore}; + +#[derive(Debug)] +pub struct TimeRpt { + pub cycle: usize, + pub duration: std::time::Duration, +} + +#[derive(Debug)] +pub struct DOpRpt(pub HashMap); + +impl std::fmt::Display for DOpRpt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let key_val = { + // Order alphabetically by key + let mut keys = self.0.keys().collect::>(); + keys.sort(); + + keys.iter() + .map(|k| { + format!( + "{k}: {}", + self.0 + .get(k) + .unwrap_or_else(|| panic!("Error: Key {k} not available in DOpRpt")) + ) + }) + .collect::>() + }; + write!(f, "InstructionKind {{{}}}", key_val.join(", ")) + } +} + +#[derive(Debug)] +pub struct PeRpt { + pub stats: PeStats, + pub usage: f64, +} +impl std::fmt::Display for PeRpt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "issued: {}, batches: {}, by_timeout: {}, usage: {}", + self.stats.issued, self.stats.batches, self.stats.by_timeout, self.usage + )?; + Ok(()) + } +} + +#[derive(Debug)] +pub struct PeStoreRpt(HashMap); +impl PeStoreRpt { + pub fn new(map: HashMap) -> PeStoreRpt { + PeStoreRpt(map) + } +} + +impl From<&PeStore> for PeStoreRpt { + fn from(value: &PeStore) -> Self { + let report_collection: HashMap = value + .0 + .iter() + .map(|(name, pe)| (name.clone(), PeRpt::from(pe))) + .collect(); + PeStoreRpt::new(report_collection) + } +} + +impl From<&Pe> for PeRpt { + fn from(value: &Pe) -> Self { + let stats = value.stats(); + let usage = stats.usage_sum / 
(stats.batches as f64); + PeRpt { stats, usage } + } +} + +impl std::fmt::Display for PeStoreRpt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + writeln!(f, "Processing element statistics:")?; + // Order alphabetically by key and print one by line + let mut keys = self.0.keys().collect::>(); + keys.sort(); + for k in keys { + writeln!(f, "\t {k:?} => {}", self.0.get(k).unwrap())?; + } + Ok(()) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/isc_sim/scheduler.rs b/backends/tfhe-hpu-backend/src/fw/isc_sim/scheduler.rs new file mode 100644 index 000000000..3bfa0aa6a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/isc_sim/scheduler.rs @@ -0,0 +1,420 @@ +use crate::prelude::HpuIscParameters; + +use super::*; +use std::collections::{BinaryHeap, HashMap, VecDeque}; + +use report::{DOpRpt, PeStoreRpt, TimeRpt}; + +// NB: Pool query take 4 cycles on avg there are 3 pool request in a query +const QUERY_CYCLE: usize = 12; + +#[derive(Debug)] +pub struct Scheduler { + freq_mhz: usize, + quantum_cycles: usize, + sim_cycles: usize, + sync_id: usize, + + dop_pdg: VecDeque, + dop_exec: Vec, + pool: Pool, + evt_pdg: BinaryHeap, + rd_unlock: Vec, + wr_unlock: Vec, + pe_store: PeStore, + trace: Vec, +} + +impl Scheduler { + pub fn new( + freq_mhz: usize, + quantum_us: usize, + isc_params: &HpuIscParameters, + pe_config: PeConfigStore, + ) -> Self { + // NB: Scale match between freq and time (i.e. 
us vs MHz) + let quantum_cycles = freq_mhz * quantum_us; + let pool = Pool::new(isc_params.depth); + let pe_store = PeStore::from(pe_config); + + Self { + freq_mhz, + dop_pdg: VecDeque::new(), + dop_exec: Vec::new(), + sim_cycles: 0, + sync_id: 0, + quantum_cycles, + evt_pdg: BinaryHeap::new(), + pool, + rd_unlock: Vec::new(), + wr_unlock: Vec::new(), + pe_store, + + trace: Vec::new(), + } + } + + /// Insert the given list of DOp in the isc stream + pub fn insert_dops(&mut self, dops: Vec) { + self.dop_pdg.extend(dops); + } + + /// Simulate execution for simulation quantum + /// Return the list of retired Dops during the simulated windows + pub fn schedule(&mut self, bpip_timeout: Option) -> Vec { + tracing::trace!( + "Start simulation @{} [{}]", + self.sim_cycles, + self.quantum_cycles + ); + + // Register end-of-quantum + self.evt_pdg.push(Event::new( + EventType::QuantumEnd, + self.sim_cycles + self.quantum_cycles, + )); + + // Register next query + self.evt_pdg + .push(Event::new(EventType::Query, self.sim_cycles)); + + // Start simulation loop + loop { + let Event { + at_cycle, + event_type, + } = self.evt_pdg.pop().expect("Event queue is empty"); + tracing::trace!("[@{at_cycle}] -> {event_type:?}"); + + // Update cycle + assert!( + at_cycle >= self.sim_cycles, + "Simulation error, next register event is in the past" + ); + self.sim_cycles = at_cycle; + + let trigger_query = match event_type { + EventType::RdUnlock(kind, id) => { + // update associated pe state + self.pe_store.rd_unlock(id); + self.rd_unlock.push(kind); + + // Update the pe + let evts = self.pe_store.probe_for_exec_id(id, self.sim_cycles, None); + evts.into_iter().for_each(|evt| self.evt_pdg.push(evt)); + + true + } + EventType::WrUnlock(kind, id) => { + // update associated pe state + self.pe_store.wr_unlock(id); + self.wr_unlock.push(kind); + + // Update the pe + let evts = self.pe_store.probe_for_exec_id(id, self.sim_cycles, None); + evts.into_iter().for_each(|evt| self.evt_pdg.push(evt)); 
+ + true + } + EventType::ReqTimeout(kind, _id) => { + match kind { + InstructionKind::Pbs => { + // Register Bpip timeout + if let Some(timeout) = bpip_timeout { + // delete the timeout timer + self.evt_pdg = std::mem::take(&mut self.evt_pdg) + .into_iter() + .filter(|ev| ev.event_type != EventType::BpipTimeout) + .collect(); + + // And re-start it + let timeout_stamp = self.sim_cycles + timeout as usize; + self.evt_pdg + .push(Event::new(EventType::BpipTimeout, timeout_stamp)); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::ReqTimeout(timeout_stamp), + }); + } + } + _ => panic!("Unexpected unit required a timeout registration {kind:?}"), + }; + false + } + EventType::BatchStart { pe_id, issued } => { + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::BatchStart { pe_id, issued }, + }); + false + } + EventType::QuantumEnd => { + break; + } + EventType::DelTimeout(kind, _) => { + assert!( + kind == InstructionKind::Pbs, + "Unexpected unit requiring a timeout deletion {kind:?}" + ); + + // delete the timeout timer + self.evt_pdg = std::mem::take(&mut self.evt_pdg) + .into_iter() + .filter(|ev| ev.event_type != EventType::BpipTimeout) + .collect(); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::DelTimeout, + }); + + false + } + EventType::BpipTimeout => { + // Trigger issue on pe store with batch_flush flag + let evts = self + .pe_store + .probe_for_exec(self.sim_cycles, Some(pe::Flush::Timeout)); + evts.into_iter().for_each(|evt| self.evt_pdg.push(evt)); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::Timeout, + }); + true + } + EventType::Query => self.query(), + }; + + // Register next Query event + // NB: Register new query event only if something useful has append. 
Other-wise wait + // for the next registered event + if trigger_query + && !self.evt_pdg.iter().any( + |Event { + at_cycle: _, + event_type, + }| *event_type == EventType::Query, + ) + { + // Queries should be issued periodically at every QUERY_CYCLE + let next_query = ((self.sim_cycles + QUERY_CYCLE) / QUERY_CYCLE) * QUERY_CYCLE; + self.evt_pdg.push(Event::new(EventType::Query, next_query)); + } + } + + // Replace content of dop_exec with empty vec and return it's previous content + std::mem::take(&mut self.dop_exec) + } + + /// Acknowledge rd_unlock + /// Remove first matching entry + fn ack_rd_unlock(&mut self, kind_1h: InstructionKind) { + let match_idx = self + .rd_unlock + .iter() + .enumerate() + .filter(|(_, kind)| InstructionKind::None != (**kind & kind_1h)) + .map(|(idx, _)| idx) + .collect::>(); + + self.rd_unlock.remove(match_idx[0]); + } + + fn ack_wr_unlock(&mut self, kind_1h: InstructionKind) { + let match_idx = self + .wr_unlock + .iter() + .enumerate() + .filter(|(_, kind)| InstructionKind::None != (**kind & kind_1h)) + .map(|(idx, _)| idx) + .collect::>(); + + self.wr_unlock.remove(match_idx[0]); + } +} + +impl Scheduler { + /// Issue a query to the pool to update instruction state + /// The generated query is arbiter as follow: + /// * RdUnlock + /// * Retire + /// * Refill + /// * Issue + // NB: Aims is to remove finish instruction ASAP and to ensure that the pool is + // filled as much as possible + fn query(&mut self) -> bool { + if !self.rd_unlock.is_empty() { + let kind_mh = self.rd_unlock_kind(); + let (kind_1h, slot) = self.pool.rd_unlock(kind_mh); + //NB: Operation behavior is executed at the rd_unlock staage to prevent later operation + // to clutter the source operands. The dst register is then available in + // advance, but not used before it's real availability due to wr_lock. + // -> Another option would have been to buffer the source operands. 
However, due to the + // operands size, we had preferred to move the behavioral execution at the rd_unlock + // stage + self.dop_exec.push(slot.inst.op.clone()); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::Query { + cmd: Query::RdUnlock, + slot: slot.clone(), + }, + }); + + self.ack_rd_unlock(kind_1h); + + true + } else if !self.wr_unlock.is_empty() { + let kind_mh = self.wr_unlock_kind(); + let slot = self.pool.retire(kind_mh); + self.ack_wr_unlock(slot.inst.kind); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::Query { + cmd: Query::Retire, + slot, + }, + }); + true + } else if !self.pool.is_full() && !self.dop_pdg.is_empty() { + let dop = self.dop_pdg.pop_front().unwrap(); + let nxt_sync_id = match &dop { + asm::DOp::SYNC(_) => self.sync_id + 1, + _ => self.sync_id, + }; + let slot = self.pool.refill(self.sync_id, dop); + self.sync_id = nxt_sync_id; + + tracing::trace!("Refill: {:?}", slot); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::Query { + cmd: Query::Refill, + slot: slot.clone(), + }, + }); + true + } else { + // By default try to issue + let pe_avail = self.pe_store.avail_kind() | InstructionKind::Sync; + match self.pool.issue(pe_avail) { + pool::IssueEvt::None => { + tracing::trace!("{}", self.pool); + tracing::trace!("{:?}", self.pe_store); + + false + } + pool::IssueEvt::DOp { + kind_1h, + flush, + slot, + } => { + tracing::trace!("Issue: {:?} flush: {:?}", slot, flush); + + // Push token in associated pe + self.pe_store.push(kind_1h, flush); + + // Flush the PE if this is a flush instruction + self.pe_store + .probe_for_exec(self.sim_cycles, None) + .into_iter() + .for_each(|evt| self.evt_pdg.push(evt)); + + self.trace.push(Trace { + timestamp: self.sim_cycles, + event: TraceEvent::Query { + cmd: Query::Issue, + slot, + }, + }); + true + } + pool::IssueEvt::Sync(slot) => { + self.dop_exec.push(slot.inst.op.clone()); + self.trace.push(Trace { + 
timestamp: self.sim_cycles, + event: TraceEvent::Query { + cmd: Query::Issue, + slot, + }, + }); + true + } + } + } + } + + /// Aggregate all pending rd_unlock to obtain multibit filtering flag + fn rd_unlock_kind(&self) -> InstructionKind { + self.rd_unlock + .iter() + .fold(InstructionKind::None, |acc, kind| acc | *kind) + } + + /// Aggregate all pending wr_unlock to obtain multibit filtering flag + fn wr_unlock_kind(&self) -> InstructionKind { + self.wr_unlock + .iter() + .fold(InstructionKind::None, |acc, kind| acc | *kind) + } +} + +impl Scheduler { + pub fn dop_report(&self) -> DOpRpt { + let mut map = HashMap::new(); + + self.trace.iter().for_each(|pt| { + if let Trace { + timestamp: _, + event: + TraceEvent::Query { + cmd: Query::Issue, + slot, + }, + } = pt + { + if let Some(entry) = map.get_mut(&slot.inst.kind) { + *entry += 1; + } else { + map.insert(slot.inst.kind, 1); + } + } + }); + DOpRpt(map) + } + + pub fn time_report(&self) -> TimeRpt { + let start = self.trace.first(); + let end = self.trace.last(); + + match (start, end) { + (Some(start), Some(end)) => { + let cycle = end.timestamp - start.timestamp; + let dur_us = cycle / self.freq_mhz; + TimeRpt { + cycle, + duration: std::time::Duration::from_micros(dur_us as u64), + } + } + (None, None) | (None, Some(_)) | (Some(_), None) => TimeRpt { + cycle: 0, + duration: std::time::Duration::from_secs(0), + }, + } + } + + pub fn pe_report(&mut self) -> PeStoreRpt { + let rpt = PeStoreRpt::from(&self.pe_store); + self.pe_store.reset_stats(); + rpt + } + + pub fn reset_trace(&mut self) -> Vec { + std::mem::take(&mut self.trace) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/metavar.rs b/backends/tfhe-hpu-backend/src/fw/metavar.rs new file mode 100644 index 000000000..569bd6a5c --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/metavar.rs @@ -0,0 +1,1400 @@ +//! +//! Abstraction over Digit +//! Enable to write code that translate into firmwave in an easy-way +//! +//! 
Wrap asm::Arg with metadata and overload std::ops on it + +use super::*; + +use crate::asm::dop::DOp; +use crate::asm::{self, DigitParameters, ImmId, PbsLut}; +use crate::fw::program::StmtLink; +use tracing::{debug, error, trace}; + +use std::cell::RefCell; +use std::ops::{Add, AddAssign, Mul, MulAssign, ShlAssign, Sub, SubAssign}; +use std::rc::{Rc, Weak}; + +use bitflags::bitflags; + +// Used to filter on multiple position at once +bitflags! { + #[repr(transparent)] + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] + pub struct PosKind: usize { + const EMPTY = 0x0; + const REG = 0x1; + const MEM = 0x2; + const IMM = 0x4; + const PBS = 0x8; + } +} + +/// Wrap any kind of DOp operand in an enum +/// Enable to depict the position of the associated data in the architecture +#[derive(Debug, Clone)] +pub enum VarPos { + Reg(asm::dop::RegId), + Mem(asm::dop::MemId), + Imm(asm::dop::ImmId), + Pbs(asm::dop::Pbs), +} + +#[derive(Clone)] +struct RegLock(MetaVarCell); + +impl std::fmt::Debug for RegLock { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + write!(f, "Lock: {}", self.0.as_reg().unwrap()) + } +} + +#[derive(Debug, Clone)] +struct RegLockWeakPtr(Weak); + +#[derive(Debug, Clone)] +pub struct RegLockPtr(Option>); + +impl Drop for RegLock { + fn drop(&mut self) { + let rid = self.0.as_reg().unwrap(); + let mut meta_inner = self.0 .0.borrow_mut(); + { + trace!(target: "MetaOp", "Unlocking register {}", rid); + let mut prog = meta_inner.prog.borrow_mut(); + prog.reg_put(rid, Some(MetaVarCellWeak::from(&self.0))); + prog.reg_promote(rid); + } + meta_inner.reg_lock = None; + } +} + +impl From for RegLockPtr { + fn from(value: RegLock) -> Self { + RegLockPtr(Some(Rc::new(value))) + } +} + +impl From<&RegLockPtr> for RegLockWeakPtr { + fn from(value: &RegLockPtr) -> Self { + RegLockWeakPtr(std::rc::Rc::downgrade(value.0.as_ref().unwrap())) + } +} + +impl From<&RegLockWeakPtr> for RegLockPtr { + fn from(value: &RegLockWeakPtr) -> 
Self { + RegLockPtr(Some(value.0.upgrade().unwrap())) + } +} + +impl From<&RegLockPtr> for MetaVarCell { + fn from(value: &RegLockPtr) -> Self { + value.0.as_ref().unwrap().0.clone() + } +} + +/// Wrap asm::Arg with metadata +/// asm::Arg is used to know position of the associated value +#[derive(Clone)] +struct MetaVar { + prog: program::Program, + #[allow(unused)] + uid: usize, + pos: Option, + degree: usize, + reg_lock: Option, +} + +/// Don't show ref to prog in Debug message +impl std::fmt::Debug for MetaVar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "MetaVar{{uid: {}, pos: {:?}, degree: {}}}", + self.uid, self.pos, self.degree + ) + } +} + +impl Drop for MetaVar { + fn drop(&mut self) { + trace!(target: "MetaDrop", "Drop::{self:?}"); + if let Some(pos) = &self.pos { + let mut prog = self.prog.borrow_mut(); + // Release resource attached to inner + match pos { + VarPos::Reg(rid) => { + assert!( + self.reg_lock.is_none(), + "Dropping a metavariable with a locked register!" 
+ ); + prog.reg_release(*rid); + } + VarPos::Mem(mid) => { + prog.heap_release(*mid); + } + VarPos::Imm(_) | VarPos::Pbs(_) => {} + } + } + } +} + +/// Weak Wrapped type +/// Use to keep reference on MetaVar without breaking lifetime analyses +#[derive(Debug, Clone)] +pub struct MetaVarCellWeak(Weak>); + +impl TryFrom<&MetaVarCellWeak> for MetaVarCell { + type Error = String; + + fn try_from(value: &MetaVarCellWeak) -> Result { + if let Some(inner_cell) = value.0.upgrade() { + Ok(Self(inner_cell)) + } else { + Err("Not allocated anymore".to_string()) + } + } +} + +/// Wrapped type +/// Define std::ops directly on the wrapper to have clean FW writing syntax +#[derive(Clone)] +pub struct MetaVarCell(Rc>); + +impl std::fmt::Debug for MetaVarCell { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.borrow().fmt(f) + } +} + +impl From<&MetaVarCell> for MetaVarCellWeak { + fn from(value: &MetaVarCell) -> Self { + Self(std::rc::Rc::downgrade(&(value.0))) + } +} + +/// MetaVarCell Constructors +impl MetaVarCell { + pub fn new( + prog: program::Program, + uid: usize, + from: Option, + tfhe_params: DigitParameters, + ) -> Self { + let degree = if let Some(pos) = from.as_ref() { + match pos { + VarPos::Reg(_) | VarPos::Mem(_) | VarPos::Imm(_) => tfhe_params.msg_mask(), + VarPos::Pbs(lut) => + // TODO Apply degree analyses later for many-lut case + { + (0..lut.lut_nb() as usize) + .map(|pos| lut.fn_at(pos, &tfhe_params, tfhe_params.msg_mask())) + .max() + .unwrap() + } + } + } else { + 0 + }; + let metavar = MetaVar { + prog, + uid, + pos: from, + degree, + reg_lock: None, + }; + + Self(Rc::new(RefCell::new(metavar))) + } + + pub fn clone_on(&self, prog: &program::Program) -> Self { + let borrow = self.0.borrow(); + MetaVarCell::new( + prog.clone(), + borrow.uid, + borrow.pos.clone(), + DigitParameters::from(prog.params()), + ) + } +} + +/// MetaVarCell Reg/Heap management +impl MetaVarCell { + /// Allocate in register and moved MetaVar content if 
any + /// In case of register eviction, this function handle the offloading in memory + pub(super) fn reg_alloc_mv(&self) { + trace!(target: "MetaOp", "RegAlloc::{self:?}"); + + // Early return if already in Reg or Imm like var + if self.is_in(PosKind::REG) { + // Update LRU and return + self.0 + .borrow() + .prog + .borrow_mut() + .reg_access(self.as_reg().unwrap()); + return; + } else if self.is_cst() || self.is_in(PosKind::PBS) { + return; + } + + let (rid, _) = self.0.borrow().prog.borrow_mut().reg_lru(); + self.force_reg_alloc(rid); + } + + // Forces allocation in register regid. + pub(super) fn force_reg_alloc(&self, rid: asm::RegId) { + trace!(target: "MetaOp", "ForceRegAlloc::{self:?} <= {:?}", rid); + + // Early return if already in Reg or Imm like var + if self.is_in(PosKind::REG) && self.as_reg().unwrap() == rid { + // Update LRU and return + self.0 + .borrow() + .prog + .borrow_mut() + .reg_access(self.as_reg().unwrap()); + return; + } else if self.is_cst() || self.is_in(PosKind::PBS) { + return; + } + + // Get cache entry and update state + let evicted = self + .0 + .borrow() + .prog + .borrow_mut() + .reg_swap_force(&rid, self.clone()); + + // Move evicted value in Memory if any + if let Some(var) = evicted { + var.heap_alloc_mv(false); + } + + // Move Self content and update metadata + match self.get_pos() { + PosKind::EMPTY => { + // Only update associated metadata + } + PosKind::MEM => { + // Physically moved value in register + let src = self.as_mem().unwrap(); + // Acquire prog Cell + let inner = self.0.borrow(); + let mut prog = inner.prog.borrow_mut(); + + assert!( + !matches!(src, asm::MemId::Dst { .. 
}), + "Load from UserDst register" + ); + let asm: DOp = asm::dop::DOpLd::new(rid, src).into(); + prog.stmts.push_stmt(asm); + + // Release associated heap slot and update reg cache + prog.heap_release(src); + prog.reg_access(rid); + } + PosKind::IMM => { + let imm = self.as_imm().unwrap(); + let inner = self.0.borrow(); + let mut prog = inner.prog.borrow_mut(); + prog.stmts + .push_stmt(asm::dop::DOpSub::new(rid, rid, rid).into()); + prog.stmts + .push_stmt(asm::dop::DOpAdds::new(rid, rid, imm).into()); + prog.reg_access(rid); + trace!(target: "MetaOp", "ForceRegAlloc:: {:?} <= {:?}", rid, imm); + } + _ => { + panic!("{self:?} must have been filter before register alloc/eviction") + } + } + // Update associated metadata + self.updt_pos(Some(VarPos::Reg(rid))); + } + + /// Allocate in heap and moved MetaVar content if any + /// In case of heap eviction, this function `panic` since there is no way to properly handle + /// this case (i.e. heap full) + pub(crate) fn heap_alloc_mv(&self, reg_release: bool) { + // Early return if already in Mem or Imm like var + if self.is_in(PosKind::EMPTY | PosKind::MEM | PosKind::IMM | PosKind::PBS) { + return; + } + trace!(target: "Fw", "Evict {self:?} in heap"); + + // Get cache entry and update state + let (mid, evicted) = self + .0 + .borrow() + .prog + .borrow_mut() + .heap_swap_lru(self.clone()); + // Check state of heap -> No value was dropped due to overflow + if let Some(_slot) = evicted { + panic!("Error: Heap overflow."); + } + + // Move Self content and update metadata + match self.get_pos() { + PosKind::REG => { + // Physically moved value in memory + let src = self.as_reg().unwrap(); + + // Acquire prog Cell + let inner = self.0.borrow(); + let mut prog = inner.prog.borrow_mut(); + prog.heap_access(mid); + let asm: DOp = asm::dop::DOpSt::new(mid, src).into(); + prog.stmts.push_stmt(asm); + + if reg_release { + prog.reg_release(src); + } + } + _ => { + panic!("{self:?} must have been filter before heap alloc") + } + } + 
// Update associated metadata + self.updt_pos(Some(VarPos::Mem(mid))); + } +} + +/// Utilities to manipulate and check position +impl MetaVarCell { + /// Return MetaVar position in an easy to reason about form + pub fn get_pos(&self) -> PosKind { + if let Some(pos) = self.0.borrow().pos.as_ref() { + match pos { + VarPos::Reg(_) => PosKind::REG, + VarPos::Mem(_) => PosKind::MEM, + VarPos::Imm(_) => PosKind::IMM, + VarPos::Pbs(_) => PosKind::PBS, + } + } else { + PosKind::empty() + } + } + + /// Check if MetaVar is one of many position + pub fn is_in(&self, position: PosKind) -> bool { + if let Some(pos) = self.0.borrow().pos.as_ref() { + match pos { + VarPos::Reg(_) => position.contains(PosKind::REG), + VarPos::Mem(_) => position.contains(PosKind::MEM), + VarPos::Imm(_) => position.contains(PosKind::IMM), + VarPos::Pbs(_) => position.contains(PosKind::PBS), + } + } else { + position.is_empty() + } + } + + /// Check if MetaVar is a compile time constant + pub fn is_cst(&self) -> bool { + matches!(self.0.borrow().pos, Some(VarPos::Imm(ImmId::Cst(_)))) + } + + /// Update MetaVar position + fn updt_pos(&self, pos: Option) { + trace!(target: "MetaOp", "UpdatePos::{self:?} => {:?}", pos); + let mut inner = self.0.borrow_mut(); + inner.pos = pos; + } +} + +/// Utilities to manipulate and check degree +impl MetaVarCell { + pub fn updt_degree(&self, degree: usize) { + let mut inner = self.0.borrow_mut(); + inner.degree = degree; + } + + pub fn get_degree(&self) -> usize { + self.0.borrow().degree + } + + pub fn check_degree(&self) { + let max_degree = { + let msg_w = self.0.borrow().prog.params().msg_w; + let carry_w = self.0.borrow().prog.params().carry_w; + (1 << (msg_w + carry_w + 1/* padding */)) - 1 + }; + + assert!(self.get_degree() <= max_degree) + } +} + +/// Utilities for uncheck field extraction +impl MetaVarCell { + pub(crate) fn as_reg(&self) -> Option { + if let Some(VarPos::Reg(id)) = self.0.borrow().pos { + Some(id) + } else { + None + } + } + + pub(crate) fn 
as_mem(&self) -> Option { + if let Some(VarPos::Mem(mid)) = self.0.borrow().pos { + Some(mid) + } else { + None + } + } + + pub(crate) fn as_imm(&self) -> Option { + if let Some(VarPos::Imm(val)) = self.0.borrow().pos { + Some(val) + } else { + None + } + } + + pub(crate) fn as_pbs(&self) -> Option { + if let Some(VarPos::Pbs(lut)) = self.0.borrow().pos.as_ref() { + Some(lut.clone()) + } else { + None + } + } +} + +impl MetaVarCell { + pub(super) fn pbs_raw( + dst_slice: &[&MetaVarCell], + src: &MetaVarCell, + lut: &MetaVarCell, + flush: bool, + tfhe_params: &DigitParameters, + ) -> StmtLink { + assert!( + src.is_in(PosKind::REG | PosKind::MEM), + "Pbs src must be of kind Reg|Mem MetaVar" + ); + + assert!( + lut.is_in(PosKind::PBS), + "Pbs lut must be of kind Reg|Mem MetaVar" + ); + + // Enforce that operand are in Register + // and that all destinations are consecutive + let dst = &dst_slice[0]; + + let in_reg = dst_slice.iter().any(|d| d.get_pos() == PosKind::REG); + + if !in_reg { + // Get the best possible range of registers + let dst_rid = dst + .0 + .borrow() + .prog + .borrow() + .aligned_reg_range(dst_slice.len()) + .unwrap(); + // Evict whatever is in the range + dst_slice + .iter() + .enumerate() + .for_each(|(i, d)| d.force_reg_alloc(asm::RegId(dst_rid.0 + i as u8))); + } else { + let lut_lg = lut.as_pbs().unwrap().lut_lg(); + let mask = u8::MAX << lut_lg; + assert!( + dst.as_reg().is_some() + && dst_slice + .iter() + .fold( + (dst.as_reg().unwrap().0 & mask, true), + |(prev, acc), this| { + (prev + 1, acc && (prev == this.as_reg().unwrap().0)) + } + ) + .1, + "ManyLUT PBS register indexes must be consecutive and aligned to \ + the respective power of two, current indexes: {:?}", + dst_slice + .iter() + .map(|d| d.as_reg().unwrap()) + .collect::>() + ); + } + src.reg_alloc_mv(); + + // The first destination is used as the source of all information + let dst_rid = dst.as_reg().unwrap(); + let src_rid = src.as_reg().unwrap(); + let pbs = 
lut.as_pbs().unwrap(); + + assert!( + pbs.lut_nb() == dst_slice.len() as u8, + "No enough destinations specified to receive all outputs in the PBS" + ); + + // Select between standard and flushed Pbs + // Also select correct opcode based lut width + let asm = if flush { + match pbs.lut_nb() { + 1 => asm::dop::DOpPbsF::new(dst_rid, src_rid, pbs.gid()).into(), + 2 => asm::dop::DOpPbsMl2F::new(dst_rid, src_rid, pbs.gid()).into(), + 4 => asm::dop::DOpPbsMl4F::new(dst_rid, src_rid, pbs.gid()).into(), + 8 => asm::dop::DOpPbsMl8F::new(dst_rid, src_rid, pbs.gid()).into(), + _ => panic!("PbsF with {} entries lut are not supported", pbs.lut_nb()), + } + } else { + match pbs.lut_nb() { + 1 => asm::dop::DOpPbs::new(dst_rid, src_rid, pbs.gid()).into(), + 2 => asm::dop::DOpPbsMl2::new(dst_rid, src_rid, pbs.gid()).into(), + 4 => asm::dop::DOpPbsMl4::new(dst_rid, src_rid, pbs.gid()).into(), + 8 => asm::dop::DOpPbsMl8::new(dst_rid, src_rid, pbs.gid()).into(), + _ => panic!("PbsF with {} entries lut are not supported", pbs.lut_nb()), + } + }; + let stmtlink = dst.0.borrow_mut().prog.push_stmt(asm); + + dst_slice + .iter() + .enumerate() + .for_each(|(i, dst)| dst.updt_degree(pbs.deg_at(i, tfhe_params, src.get_degree()))); + + trace!( + target: "MetaOp", + "PbsRaw:: {:?} <= {:?}, {:?}{}", + vec![dst_slice.iter().map(|dst| dst.0.borrow())], + src.0.borrow(), + lut.0.borrow(), + if flush { "[Flush]" } else { ""}, + ); + + dst_slice.iter().for_each(|d| d.check_degree()); + + stmtlink + } + + pub fn pbs_assign(&mut self, lut: &MetaVarCell, flush: bool) { + // Construct tfhe params + let tfhe_params = self.0.borrow().prog.params().clone().into(); + // Deferred to default logic + Self::pbs_raw(&[self], self, lut, flush, &tfhe_params); + } + + pub fn pbs(&self, lut: &MetaVarCell, flush: bool) -> Self { + // Allocate output variable + let prog = &self.0.borrow().prog.clone(); + let dst = prog.borrow_mut().new_var(prog.clone()); + + // NB: No need to move the destination to a register here, 
it is done in + // pbs_raw already + + // Construct tfhe params + let tfhe_params = prog.params().clone().into(); + + // Deferred to default logic + Self::pbs_raw(&[&dst], self, lut, flush, &tfhe_params); + dst + } + + pub fn pbs_many(&self, lut: &MetaVarCell, flush: bool) -> Vec { + // Allocate output variable + let lut_nb = lut.as_pbs().unwrap().lut_nb(); + let out_vec = (0..lut_nb) + .map(|_| { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }) + .collect::>(); + + // Construct tfhe params + let tfhe_params = self.0.borrow().prog.params().clone().into(); + + // Deferred to default logic + Self::pbs_raw( + &out_vec.iter().collect::>(), + self, + lut, + flush, + &tfhe_params, + ); + out_vec + } + + // TODO define bivariant version of Pbs +} + +/// Implement mac operator +impl MetaVarCell { + /// Raw Mac implementation + /// MAC output= (rhs_0 * mul_factor) + rhs_1 + pub(super) fn mac_raw( + &self, + rhs_0: &MetaVarCell, + mul_factor: u8, + rhs_1: &MetaVarCell, + ) -> StmtLink { + // Check operand type + assert!( + rhs_0.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Add src must be of kind Reg|Mem|IMM MetaVar" + ); + assert!( + rhs_1.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Add src must be of kind Reg|Mem|IMM MetaVar" + ); + assert!( + mul_factor + <= (1 + << (rhs_0.0.borrow().prog.params().carry_w + + rhs_0.0.borrow().prog.params().msg_w)), + "mul_factor must be <= carry_mask to prevent overflow" + ); + + // Move variables to registers if needed + rhs_0.reg_alloc_mv(); + rhs_1.reg_alloc_mv(); + + // Check rhs operands type and position + let rhs_0_imm = rhs_0.is_in(PosKind::IMM); + let rhs_1_imm = rhs_1.is_in(PosKind::IMM); + + match (rhs_0_imm, rhs_1_imm) { + (false, false) => { + // (Ct x Const) + Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = (rhs_0.as_reg().unwrap(), rhs_1.as_reg().unwrap()); + let degree = (rhs_0.get_degree() * 
mul_factor as usize) + rhs_1.get_degree(); + + let asm = asm::dop::DOpMac::new( + dst_rid, + rhs_rid.0, + rhs_rid.1, + crate::asm::dop::MulFactor(mul_factor), + ) + .into(); + + self.updt_degree(degree); + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + (false, true) => { + // (Ct * Const) + Imm + // -> dst must be in ALU + // MAC anti-pattern, add comment in the generated stream + self.0.borrow().prog.borrow_mut().stmts.push_comment( + "mac_raw anti-pattern. Expand on two DOps [Muls, Adds]".to_string(), + ); + + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_0.as_reg().unwrap(); + let msg_cst = rhs_1.as_imm().unwrap(); + + // First DOp -> Muls + let mut degree = rhs_0.get_degree() * mul_factor as usize; + self.0.borrow().prog.borrow_mut().stmts.push_stmt( + asm::dop::DOpMuls::new(dst_rid, rhs_rid, ImmId::Cst(mul_factor as u16)).into(), + ); + + // Second DOp -> Adds + degree += match msg_cst { + ImmId::Cst(cst) => cst as usize, + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_0.0.borrow().prog.borrow().params.clone().into(); + tfhe_params.msg_mask() + } + }; + self.updt_degree(degree); + self.0 + .borrow_mut() + .prog + .push_stmt(asm::dop::DOpAdds::new(dst_rid, dst_rid, msg_cst).into()) + } + (true, false) => { + // (Imm x Const) + Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_1.as_reg().unwrap(); + let msg_cst = rhs_0.as_imm().unwrap(); + + match msg_cst { + asm::ImmId::Cst(imm) => { + // Imm x mul_factor could be computed offline + let msg_cst = imm * mul_factor as u16; + // Degree bound = folded constant + degree of the ciphertext operand (rhs_1); + // rhs_0 is the immediate and is already accounted for by msg_cst + let degree = rhs_1.get_degree() + msg_cst as usize; + + let asm = + asm::dop::DOpAdds::new(dst_rid, rhs_rid, asm::ImmId::Cst(msg_cst)) + .into(); + + self.updt_degree(degree); + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + asm::ImmId::Var { .. } => { + // TODO add a warning, since it's not the native pattern expected by MAC ? 
+ // Move templated constant in register and recurse + // Allocate extra register + // Force its value to Imm::Var + let mut reg_0 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_0.reg_alloc_mv(); + reg_0.mv_assign(rhs_0); + self.mac_raw(&reg_0, mul_factor, rhs_1) + } + } + } + (true, true) => { + // (Imm x Const) + Imm -> compile time computation + match (rhs_0.as_imm().unwrap(), rhs_1.as_imm().unwrap()) { + (ImmId::Cst(cst_a), ImmId::Cst(cst_b)) => { + // Compile time constant + // MAC contract is (rhs_0 * mul_factor) + rhs_1: the factor scales cst_a, not cst_b + let imm = (cst_a * mul_factor as u16) + cst_b; + self.updt_pos(Some(VarPos::Imm(ImmId::Cst(imm)))); + self.updt_degree(imm as usize); + StmtLink::empty(self.0.borrow().prog.clone()) + } + (ImmId::Var { .. }, _) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force its value to Imm::Var + let mut reg_0 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_0.reg_alloc_mv(); + reg_0.mv_assign(rhs_0); + self.mac_raw(&reg_0, mul_factor, rhs_1) + } + (_, ImmId::Var { .. 
}) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_1 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_1.reg_alloc_mv(); + reg_1.mv_assign(rhs_1); + self.mac_raw(rhs_0, mul_factor, ®_1) + } + } + } + } + } + + pub fn pack_carry(&self, msb: &MetaVarCell) -> MetaVarCell { + let tfhe_params: asm::DigitParameters = self.0.borrow().prog.params().clone().into(); + msb.mac(tfhe_params.msg_range() as u8, self) + } + + pub fn mac(&self, mul_factor: u8, rhs: &MetaVarCell) -> MetaVarCell { + // Allocate output variable + let dst = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + MetaVarCell::mac_raw(&dst, self, mul_factor, rhs); + dst + } + + pub fn mac_assign(&mut self, mul_factor: u8, rhs: &MetaVarCell) { + MetaVarCell::mac_raw(self, self, mul_factor, rhs); + } +} + +/// Implement move operator +impl MetaVarCell { + /// Move around value + /// Support following configuration + /// * Reg <- Reg|Mem|Imm + /// * Mem <- Reg + /// * Uninit <- Reg|Mem|Imm + /// + /// NB: Option Mem <- Mem isn't provided. 
+ /// Indeed, this operation induce useless LD/ST and could be replaced by + // MetaVarCell swapping [0 cost at runtime] + pub fn mv_assign(&mut self, rhs: &Self) { + // Case of self is uninit => Alloc and same as Reg + if self.is_in(PosKind::empty()) { + self.reg_alloc_mv(); + } + + let self_pos = self.get_pos(); + let rhs_pos = rhs.get_pos(); + + match (self_pos, rhs_pos) { + (PosKind::REG, PosKind::REG) => { + let dst = self.as_reg().unwrap(); + let src = rhs.as_reg().unwrap(); + let inner = self.0.borrow(); + let mut prog = inner.prog.borrow_mut(); + //Update reg cache + prog.reg_access(src); + prog.reg_access(dst); + let asm = asm::dop::DOpAdds::new(dst, src, asm::ImmId::Cst(0)).into(); + + prog.stmts.push_stmt(asm); + } + (PosKind::REG, PosKind::MEM) => { + let dst = self.as_reg().unwrap(); + let src = rhs.as_mem().unwrap(); + assert!( + !matches!(src, asm::MemId::Dst { .. }), + "Load from UserDst register" + ); + let asm: DOp = asm::dop::DOpLd::new(dst, src).into(); + self.0.borrow().prog.borrow_mut().stmts.push_stmt(asm); + } + (PosKind::REG, PosKind::IMM) => { + // Way to Trivial encrypt Imm is to do: + // A <- A - A + // A <- A + Imm + + let dst = self.as_reg().unwrap(); + let imm = rhs.as_imm().unwrap(); + let inner = self.0.borrow(); + let mut prog = inner.prog.borrow_mut(); + prog.stmts + .push_stmt(asm::dop::DOpSub::new(dst, dst, dst).into()); + prog.stmts + .push_stmt(asm::dop::DOpAdds::new(dst, dst, imm).into()); + } + (PosKind::MEM, PosKind::REG) => { + let dst = self.as_mem().unwrap(); + let src = rhs.as_reg().unwrap(); + assert!( + !matches!(dst, asm::MemId::Src { .. 
}), + "Store into UserSrc register" + ); + + // Update heap if required + self.0.borrow().prog.borrow_mut().heap_access(dst); + let asm: DOp = asm::dop::DOpSt::new(dst, src).into(); + self.0.borrow().prog.borrow_mut().stmts.push_stmt(asm); + } + _ => panic!("Unsupported MOVE {self:?} <- {rhs:?}"), + } + // Update degree + self.updt_degree(rhs.get_degree()); + } +} + +/// Overload <<= for syntaxic sugar around it +impl ShlAssign for MetaVarCell { + fn shl_assign(&mut self, rhs: Self) { + self.mv_assign(&rhs) + } +} + +/// Implement raw addition and derive Add/AddAsign from it +impl MetaVarCell { + pub(super) fn add_raw( + &self, + rhs_0: &MetaVarCell, + rhs_1: &MetaVarCell, + upd_degree: bool, + ) -> StmtLink { + // Check operand type + assert!( + rhs_0.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Add src must be of kind Reg|Mem|IMM MetaVar" + ); + assert!( + rhs_1.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Add src must be of kind Reg|Mem|IMM MetaVar" + ); + + // Move variables to registers if required + rhs_0.reg_alloc_mv(); + rhs_1.reg_alloc_mv(); + + // Check rhs operands type and position + let rhs_0_imm = rhs_0.is_in(PosKind::IMM); + let rhs_1_imm = rhs_1.is_in(PosKind::IMM); + + let link = match (rhs_0_imm, rhs_1_imm) { + (false, false) => { + // Ct x Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = (rhs_0.as_reg().unwrap(), rhs_1.as_reg().unwrap()); + let degree = rhs_0.get_degree() + rhs_1.get_degree(); + + let asm = asm::dop::DOpAdd::new(dst_rid, rhs_rid.0, rhs_rid.1).into(); + + if upd_degree { + self.updt_degree(degree); + } + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + (false, true) => { + // Ct x Imm + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_0.as_reg().unwrap(); + let msg_cst = rhs_1.as_imm().unwrap(); + let degree = match msg_cst { + ImmId::Cst(cst) => rhs_0.get_degree() + cst as usize, + ImmId::Var { .. 
} => { + let tfhe_params: asm::DigitParameters = + rhs_1.0.borrow().prog.borrow().params.clone().into(); + rhs_0.get_degree() + tfhe_params.msg_mask() + } + }; + + let asm = asm::dop::DOpAdds::new(dst_rid, rhs_rid, msg_cst).into(); + if upd_degree { + self.updt_degree(degree); + } + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + (true, false) => { + // Imm x Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_1.as_reg().unwrap(); + let msg_cst = rhs_0.as_imm().unwrap(); + // let degree = rhs_1.get_degree() + msg_cst; + let degree = match msg_cst { + ImmId::Cst(cst) => rhs_1.get_degree() + cst as usize, + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_0.0.borrow().prog.borrow().params.clone().into(); + rhs_1.get_degree() + tfhe_params.msg_mask() + } + }; + + if upd_degree { + self.updt_degree(degree); + } + + let asm = asm::dop::DOpAdds::new(dst_rid, rhs_rid, msg_cst).into(); + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + (true, true) => { + // Imm x Imm -> Check if this could be a compiled time constant + match (rhs_0.as_imm().unwrap(), rhs_1.as_imm().unwrap()) { + (ImmId::Cst(cst_a), ImmId::Cst(cst_b)) => { + // Compile time constant + let imm = cst_a + cst_b; + self.updt_pos(Some(VarPos::Imm(ImmId::Cst(imm)))); + if upd_degree { + self.updt_degree(imm as usize); + } + StmtLink::empty(self.0.borrow().prog.clone()) + } + (ImmId::Var { .. }, _) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_0 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_0.reg_alloc_mv(); + reg_0.mv_assign(rhs_0); + self.add_raw(®_0, rhs_1, upd_degree) + } + (_, ImmId::Var { .. 
}) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_1 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_1.reg_alloc_mv(); + reg_1.mv_assign(rhs_1); + self.add_raw(rhs_0, ®_1, upd_degree) + } + } + } + }; + trace!( + target: "MetaOp", + "AddRaw:: {:?} <= {:?}, {:?}", + self.0.borrow(), + rhs_0.0.borrow(), + rhs_1.0.borrow() + ); + self.check_degree(); + link + } +} + +impl Add for &MetaVarCell { + type Output = MetaVarCell; + + fn add(self, rhs: Self) -> Self::Output { + // Allocate output variable + let dst = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + + MetaVarCell::add_raw(&dst, self, rhs, true); + dst + } +} + +impl AddAssign for MetaVarCell { + fn add_assign(&mut self, rhs: Self) { + Self::add_raw(self, self, &rhs, true); + } +} + +/// Implement raw subtraction and derive Sub/SubAssign from it +impl MetaVarCell { + pub(super) fn sub_raw( + &self, + rhs_0: &MetaVarCell, + rhs_1: &MetaVarCell, + upd_degree: bool, + ) -> StmtLink { + // Check operand type + assert!( + rhs_0.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Sub src must be of kind Reg|Mem|IMM MetaVar" + ); + assert!( + rhs_1.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Sub src must be of kind Reg|Mem|IMM MetaVar" + ); + + // Move variables to registers if required + rhs_0.reg_alloc_mv(); + rhs_1.reg_alloc_mv(); + + // Check rhs operands type and position + let rhs_0_imm = rhs_0.is_in(PosKind::IMM); + let rhs_1_imm = rhs_1.is_in(PosKind::IMM); + + let link = match (rhs_0_imm, rhs_1_imm) { + (false, false) => { + // Ct x Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = (rhs_0.as_reg().unwrap(), rhs_1.as_reg().unwrap()); + let degree = rhs_0.get_degree() - rhs_1.get_degree(); + + if upd_degree { + self.updt_degree(degree); + } 
+ + let asm = asm::dop::DOpSub::new(dst_rid, rhs_rid.0, rhs_rid.1).into(); + rhs_0.0.borrow_mut().prog.push_stmt(asm) + } + (false, true) => { + // Ct x Imm + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_0.as_reg().unwrap(); + let msg_cst = rhs_1.as_imm().unwrap(); + let degree = match msg_cst { + ImmId::Cst(cst) => rhs_0.get_degree() - cst as usize, + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_1.0.borrow().prog.borrow().params.clone().into(); + rhs_0.get_degree() - tfhe_params.msg_mask() + } + }; + + if upd_degree { + self.updt_degree(degree); + } + rhs_0 + .0 + .borrow_mut() + .prog + .push_stmt(asm::dop::DOpSubs::new(dst_rid, rhs_rid, msg_cst).into()) + } + (true, false) => { + // Imm x Ct + // -> dst must be in ALU + self.reg_alloc_mv(); + let dst_rid = self.as_reg().unwrap(); + let rhs_rid = rhs_1.as_reg().unwrap(); + let msg_cst = rhs_0.as_imm().unwrap(); + let degree = match msg_cst { + ImmId::Cst(cst) => cst as usize - rhs_1.get_degree(), + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_0.0.borrow().prog.borrow().params.clone().into(); + tfhe_params.msg_mask() - rhs_0.get_degree() + } + }; + + if upd_degree { + self.updt_degree(degree); + } + + rhs_0 + .0 + .borrow_mut() + .prog + .push_stmt(asm::dop::DOpSsub::new(dst_rid, rhs_rid, msg_cst).into()) + } + (true, true) => { + // Imm x Imm -> Check if this could be a compiled time constant + match (rhs_0.as_imm().unwrap(), rhs_1.as_imm().unwrap()) { + (ImmId::Cst(cst_a), ImmId::Cst(cst_b)) => { + // Compile time constant + let imm = cst_a - cst_b; + self.updt_pos(Some(VarPos::Imm(ImmId::Cst(imm)))); + self.updt_degree(imm as usize); + StmtLink::empty(self.0.borrow().prog.clone()) + } + (ImmId::Var { .. 
}, _) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_0 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_0.reg_alloc_mv(); + reg_0.mv_assign(rhs_0); + self.sub_raw(®_0, rhs_1, upd_degree) + } + (_, ImmId::Var { .. }) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_1 = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_1.reg_alloc_mv(); + reg_1.mv_assign(rhs_1); + self.sub_raw(rhs_0, ®_1, upd_degree) + } + } + } + }; + trace!( + target: "MetaOp", + "SubRaw:: {:?} <= {:?}, {:?}", + self.0.borrow(), + rhs_0.0.borrow(), + rhs_1.0.borrow() + ); + self.check_degree(); + link + } +} + +impl Sub for &MetaVarCell { + type Output = MetaVarCell; + + fn sub(self, rhs: Self) -> Self::Output { + // Allocate output variable + let dst = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + + dst.sub_raw(self, rhs, true); + dst + } +} + +impl SubAssign for MetaVarCell { + fn sub_assign(&mut self, rhs: Self) { + self.sub_raw(self, &rhs, true); + } +} +/// Implement raw subtraction and derive Mul/MulAssign from it +impl MetaVarCell { + fn mul_raw(dst: &MetaVarCell, rhs_0: &MetaVarCell, rhs_1: &MetaVarCell) { + // Check operand type + assert!( + rhs_0.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Mul src must be of kind Reg|Mem|IMM MetaVar" + ); + assert!( + rhs_1.is_in(PosKind::REG | PosKind::MEM | PosKind::IMM), + "Mul src must be of kind Reg|Mem|IMM MetaVar" + ); + + // Move variables to registers if needed + rhs_0.reg_alloc_mv(); + rhs_1.reg_alloc_mv(); + + // Check rhs operands type and position + let rhs_0_imm = rhs_0.is_in(PosKind::IMM); + let rhs_1_imm = rhs_1.is_in(PosKind::IMM); + + match (rhs_0_imm, rhs_1_imm) { + (false, false) 
=> { + error!("Try to multiply two Ciphertext together. This is not supported by TFHE, used Pbs instead"); + debug!(target: "Fw", "{rhs_0:?} x {rhs_1:?}"); + panic!("Invalid operation on MetaVar"); + } + (false, true) => { + // Ct x Imm + // -> dst must be in ALU + dst.reg_alloc_mv(); + let dst_rid = dst.as_reg().unwrap(); + let rhs_rid = rhs_0.as_reg().unwrap(); + let msg_cst = rhs_1.as_imm().unwrap(); + let degree = match msg_cst { + ImmId::Cst(cst) => rhs_0.get_degree() * cst as usize, + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_1.0.borrow().prog.borrow().params.clone().into(); + rhs_0.get_degree() * tfhe_params.msg_mask() + } + }; + + rhs_0 + .0 + .borrow() + .prog + .borrow_mut() + .stmts + .push_stmt(asm::dop::DOpMuls::new(dst_rid, rhs_rid, msg_cst).into()); + dst.updt_degree(degree); + } + (true, false) => { + // Imm x Ct + // -> dst must be in ALU + dst.reg_alloc_mv(); + let dst_rid = dst.as_reg().unwrap(); + let rhs_rid = rhs_1.as_reg().unwrap(); + let msg_cst = rhs_0.as_imm().unwrap(); + let degree = match msg_cst { + ImmId::Cst(cst) => rhs_1.get_degree() * cst as usize, + ImmId::Var { .. } => { + let tfhe_params: asm::DigitParameters = + rhs_0.0.borrow().prog.borrow().params.clone().into(); + // Runtime immediate is bounded by msg_mask; a product scales the degree + // multiplicatively, mirroring the (false, true) arm + rhs_1.get_degree() * tfhe_params.msg_mask() + } + }; + + rhs_0 + .0 + .borrow() + .prog + .borrow_mut() + .stmts + .push_stmt(asm::dop::DOpMuls::new(dst_rid, rhs_rid, msg_cst).into()); + dst.updt_degree(degree); + } + (true, true) => { + // Imm x Imm -> Check if this could be a compiled time constant + match (rhs_0.as_imm().unwrap(), rhs_1.as_imm().unwrap()) { + (ImmId::Cst(cst_a), ImmId::Cst(cst_b)) => { + // Compile time constant + let imm = cst_a * cst_b; + dst.updt_pos(Some(VarPos::Imm(ImmId::Cst(imm)))); + dst.updt_degree(imm as usize); + } + (ImmId::Var { .. 
}, _) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_0 = { + let prog = &dst.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_0.reg_alloc_mv(); + reg_0.mv_assign(rhs_0); + Self::mul_raw(dst, ®_0, rhs_1) + } + (_, ImmId::Var { .. }) => { + // Move templated constant in register and recurse + // Allocate extra register + // Force it's value to Imm::Var + let mut reg_1 = { + let prog = &dst.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + reg_1.reg_alloc_mv(); + reg_1.mv_assign(rhs_1); + Self::mul_raw(dst, rhs_0, ®_1) + } + } + } + } + trace!( + target: "MetaOp", + "MulRaw:: {:?} <= {:?}, {:?}", + dst.0.borrow(), + rhs_0.0.borrow(), + rhs_1.0.borrow() + ); + dst.check_degree(); + } + + pub fn mul(&self, rhs_0: &MetaVarCell, rhs_1: &MetaVarCell) { + Self::mul_raw(self, rhs_0, rhs_1) + } +} + +impl Mul for &MetaVarCell { + type Output = MetaVarCell; + + fn mul(self, rhs: Self) -> Self::Output { + // Allocate output variable + let dst = { + let prog = &self.0.borrow().prog; + let var = prog.borrow_mut().new_var(prog.clone()); + var + }; + MetaVarCell::mul_raw(&dst, self, rhs); + dst + } +} + +impl MulAssign for MetaVarCell { + fn mul_assign(&mut self, rhs: Self) { + Self::mul_raw(self, self, &rhs); + } +} + +// Utilities for finer register control +impl MetaVarCell { + pub(super) fn reg_lock(&mut self) -> RegLockPtr { + let rid = self.as_reg(); + let mut inner = self.0.borrow_mut(); + + inner + .reg_lock + .as_ref() + .map(|lock| lock.into()) + .unwrap_or_else(|| { + rid.map(|rid| { + trace!(target: "MetaOp", "Locking register {}", rid); + inner.prog.reg_pop(&rid); + let lock_ptr = RegLockPtr::from(RegLock(self.clone())); + inner.reg_lock = Some((&lock_ptr).into()); + lock_ptr + }) + .unwrap_or(RegLockPtr(None)) + }) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/mod.rs 
b/backends/tfhe-hpu-backend/src/fw/mod.rs new file mode 100644 index 000000000..d0d41bd8d --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/mod.rs @@ -0,0 +1,102 @@ +//! +//! Top level abstraction of a Firmware +//! +//! Provides two concrete implementations of those traits +//! * DigitOperations (DOp) +//! * IntegerOperations (IOp) + +pub mod fw_impl; +pub mod isc_sim; +pub mod metavar; +pub mod program; +pub mod rtl; + +use crate::asm; +use enum_dispatch::enum_dispatch; +use strum_macros::{EnumDiscriminants, EnumIter, EnumString, VariantNames}; + +/// Parameters that reflect the targeted architecture +/// Used to generate fw customized for the targeted architecture +#[derive(Debug, Clone)] +pub struct FwParameters { + pub register: usize, + pub isc_depth: usize, + pub heap_size: usize, + pub min_iop_size: usize, + pub min_pbs_batch_w: usize, + pub pbs_batch_w: usize, + pub total_pbs_nb: usize, + + pub msg_w: usize, + pub carry_w: usize, + pub nu: usize, + pub integer_w: usize, + pub use_ipip: bool, + pub kogge_cfg: String, + pub pe_cfg: isc_sim::PeConfigStore, + pub op_cfg: rtl::config::RtlCfg, + pub cur_op_cfg: rtl::config::OpCfg, + pub op_name: Option, +} + +impl FwParameters { + pub fn blk_w(&self) -> usize { + self.integer_w.div_ceil(self.msg_w) + } + + pub fn max_msg(&self) -> usize { + (1 << self.msg_w) - 1 + } + + pub fn max_val(&self) -> usize { + (1 << (self.msg_w + self.carry_w)) - 1 + } + + pub fn set_op(&mut self, opname: &str) { + self.op_name = Some(opname.into()); + self.cur_op_cfg = self.op_cfg.get(opname); + } + + pub fn op_cfg(&self) -> rtl::config::OpCfg { + self.cur_op_cfg + } +} + +impl From for asm::DigitParameters { + fn from(value: FwParameters) -> Self { + Self { + msg_w: value.msg_w, + carry_w: value.carry_w, + } + } +} + +/// Fw trait abstraction +/// Used to handle Fw implementation in an abstract way +#[enum_dispatch] +pub trait Fw { + /// Expand a program of IOp into a program of DOp + fn expand(&self, params: &FwParameters, iopcode: 
&asm::AsmIOpcode) -> asm::Program; +} + +/// Gather available Fw in an enum for selection at runtime by user +#[enum_dispatch(Fw)] +#[derive(EnumDiscriminants, VariantNames)] +#[strum_discriminants(name(FwName))] +#[strum_discriminants(derive(EnumIter))] +#[strum_discriminants(derive(EnumString))] +pub enum AvlblFw { + Ilp(fw_impl::ilp::Ilp), + Llt(fw_impl::llt::Llt), + Demo(fw_impl::demo::Demo), +} + +impl AvlblFw { + pub fn new(kind: &FwName) -> Self { + match kind { + FwName::Ilp => Self::Ilp(Default::default()), + FwName::Llt => Self::Llt(Default::default()), + FwName::Demo => Self::Demo(Default::default()), + } + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/program.rs b/backends/tfhe-hpu-backend/src/fw/program.rs new file mode 100644 index 000000000..571990cc5 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/program.rs @@ -0,0 +1,538 @@ +//! +//! Abstraction used to ease FW writing +//! +//! It provides a set of utilities used to help FW implementation +//! with a clean and easy to read API + +use lru::LruCache; +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +use crate::asm; + +use super::metavar::{MetaVarCell, MetaVarCellWeak, VarPos}; +use super::FwParameters; + +use tracing::trace; + +use crate::fw::rtl::config::OpCfg; + +#[derive(Debug, Clone)] +pub struct ProgramInner { + uid: usize, + pub(crate) params: FwParameters, + pub(crate) regs: LruCache>, + pub(crate) heap: LruCache>, + pub(crate) vars: HashMap, + pub(crate) stmts: asm::Program, +} + +/// ProgramInner constructors (panic when `register` or `heap_size` is zero) +impl ProgramInner { + pub fn new(params: &FwParameters) -> Self { + let nb_regs = match std::num::NonZeroUsize::try_from(params.register) { + Ok(val) => val, + _ => panic!("Error: Number of registers must be > 0"), + }; + let mut regs = LruCache::>::new(nb_regs); + // At start regs cache is full of unused slot + for rid in 0..params.register { + regs.put(asm::RegId(rid as u8), None); + } + + let nb_heap = match 
std::num::NonZeroUsize::try_from(params.heap_size) { + Ok(val) => val, + _ => panic!("Error: Number of heap slot must be > 0"), + }; + let mut heap = LruCache::>::new(nb_heap); + // At start heap cache is full of unused slot + for hid in 0..params.heap_size as u16 { + heap.put(asm::MemId::new_heap(hid), None); + } + + Self { + uid: 0, + params: params.clone(), + regs, + heap, + vars: HashMap::new(), + stmts: asm::Program::default(), + } + } +} + +/// Cache handling +impl ProgramInner { + /// Retrieved least-recent-used register entry + /// Return associated register id and evicted variable if any + /// Warn: Keep cache state unchanged ... + pub(crate) fn reg_lru(&mut self) -> (asm::RegId, Option) { + let (rid, rdata) = self + .regs + .peek_lru() + .expect("Error: register cache empty. Check register management"); + + // Handle evicted slot if any + // Convert it in strong reference for later handling + let evicted = if let Some(weak_evicted) = rdata { + weak_evicted.try_into().ok() + } else { + None + }; + + (*rid, evicted) + } + + // Tries to get a range of consecutive aligned free registers and falls back + // to the range starting at the LRU + pub(crate) fn aligned_reg_range(&self, range: usize) -> Option { + let range = range as u8; + let log_size = asm::dop::ceil_ilog2(&range); + let mask = (1 << log_size) - 1; + let aligned = || { + self.regs + .iter() + .rev() + .filter(|(reg, _)| (reg.0 & mask) == 0) + }; + let rid = aligned() + .filter(|(reg, _)| { + let reg = reg.0; + (reg..reg + range).all(|reg| { + self.regs + .peek(&asm::RegId(reg)) + .is_some_and(|r| r.is_none()) + }) + }) + .map(|(reg, _)| *reg) + .next(); + rid.or_else(|| { + aligned() + .filter(|(reg, _)| { + let reg = reg.0; + (reg + 1..reg + range).all(|reg| self.regs.peek(&asm::RegId(reg)).is_some()) + }) + .map(|(i, _)| *i) + .next() + }) + } + + // Retrieves the indicated RID + // The cache state is unchanged + pub(crate) fn reg(&mut self, rid: &asm::RegId) -> Option { + let rdata = self + 
.regs + .peek(rid) + .unwrap_or_else(|| panic!("Error register {rid:} is not available")); + + if let Some(weak_evicted) = rdata { + weak_evicted.try_into().ok() + } else { + None + } + } + + // Insert the MetaVar in the indicated cache slot and return any evicted + // value + pub(crate) fn reg_swap_force( + &mut self, + rid: &asm::RegId, + var: MetaVarCell, + ) -> Option { + // Fetch the current occupant of the requested slot, if any + let evicted = self.reg(rid); + + // Update cache state + *(self.regs.get_mut(rid).expect("Update an `unused` register")) = Some((&var).into()); + + evicted + } + + /// Promote register entry, then demote every empty slot + pub(crate) fn reg_promote(&mut self, rid: asm::RegId) { + // Update cache state + // Put this slot in front of all `empty` slot instead of in lru pos + self.regs.promote(&rid); + let demote_order = self + .regs + .iter() + .filter(|(_, var)| var.is_none()) + .map(|(rid, _)| *rid) + .collect::>(); + demote_order + .into_iter() + .for_each(|rid| self.regs.demote(&rid)); + } + + /// Release register entry + pub(crate) fn reg_release(&mut self, rid: asm::RegId) { + trace!(target: "Program", "Release Reg {rid}"); + + *(self + .regs + .get_mut(&rid) + .expect("Release an `unused` register")) = None; + + self.reg_promote(rid); + } + + /// Notify register access to update LRU state + pub(crate) fn reg_access(&mut self, rid: asm::RegId) { + self.regs.promote(&rid) + } + + /// Retrieve least-recently-used heap entry + /// Return associated heap id and evicted variable if any + /// Warn: Keep cache state unchanged ... + fn heap_lru(&mut self) -> (asm::MemId, Option) { + let (mid, rdata) = self + .heap + .peek_lru() + .expect("Error: heap cache empty. 
Check register management"); + + // Handle evicted slot if any + // Convert it in strong reference for later handling + let evicted = if let Some(weak_evicted) = rdata { + weak_evicted.try_into().ok() + } else { + None + }; + + (*mid, evicted) + } + + /// Release heap entry (no-op for non-heap memory ids) + pub(crate) fn heap_release(&mut self, mid: asm::MemId) { + trace!(target: "Program", "Release Heap {mid}"); + match mid { + asm::MemId::Heap { .. } => { + *(self + .heap + .get_mut(&mid) + .expect("Release an `unused` heap slot")) = None; + // Update cache state + // Put this slot in front of all `empty` slot instead of in lru pos + self.heap.promote(&mid); + let demote_order = self + .heap + .iter() + .filter(|(_mid, var)| var.is_none()) + .map(|(mid, _)| *mid) + .collect::>(); + demote_order + .into_iter() + .for_each(|mid| self.heap.demote(&mid)); + } + _ => { /*Only release Heap slot*/ } + } + } + + /// Notify heap access to update LRU state + pub(crate) fn heap_access(&mut self, mid: asm::MemId) { + match mid { + asm::MemId::Heap { .. 
} => self.heap.promote(&mid), + _ => { /* Do nothing: slot does not belong to heap */ } + } + } + + /// Insert MetaVar in cache and return evicted value if any + pub(crate) fn heap_swap_lru(&mut self, var: MetaVarCell) -> (asm::MemId, Option) { + // Find lru slot + let (mid, evicted) = self.heap_lru(); + + // Update cache state + *(self + .heap + .get_mut(&mid) + .expect("Update an `unused` heap slot")) = Some((&var).into()); + + (mid, evicted) + } + + /// Adds the given register for use + pub(super) fn reg_put(&mut self, rid: asm::RegId, meta: Option) { + assert!(self.regs.peek(&rid).is_none()); + self.regs.put(rid, meta); + } +} + +/// MetaVar handling +impl ProgramInner { + /// Create MetaVar from an optional argument + fn var_from(&mut self, from: Option, ref_to_self: Program) -> MetaVarCell { + // Create MetaVar + let uid = self.uid; + self.uid += 1; + + // Construct tfhe params + let tfhe_params: asm::DigitParameters = self.params.clone().into(); + let var = MetaVarCell::new(ref_to_self, uid, from, tfhe_params); + + // Register in var store + self.vars.insert(uid, (&var).into()); + + var + } + + pub fn new_var(&mut self, ref_to_self: Program) -> MetaVarCell { + self.var_from(None, ref_to_self) + } +} + +#[derive(Clone)] +pub struct Program { + inner: Rc>, +} + +impl std::ops::Deref for Program { + type Target = Rc>; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +#[derive(Clone)] +pub struct StmtLink { + prog: Program, + pos: Vec, +} + +impl std::fmt::Debug for StmtLink { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + f.debug_struct("StmtLink").field("pos", &self.pos).finish() + } +} + +impl StmtLink { + pub fn empty(prog: Program) -> StmtLink { + StmtLink { + prog, + pos: Vec::new(), + } + } + + pub fn to_flush(&mut self) { + if let Some(pos) = self.pos.first() { + let mut borrow = self.prog.borrow_mut(); + let dop = borrow.stmts.get_stmt_mut(*pos); + dop.to_flush(); + } + } +} + +impl Program { + pub fn 
new(params: &FwParameters) -> Self { + Self { + inner: Rc::new(RefCell::new(ProgramInner::new(params))), + } + } + + pub fn params(&self) -> FwParameters { + self.inner.borrow().params.clone() + } + + pub fn op_cfg(&self) -> OpCfg { + self.inner.borrow().params.op_cfg() + } + + pub fn op_name(&self) -> Option { + self.inner.borrow().params.op_name.clone() + } + + pub fn set_op(&mut self, opname: &str) { + self.inner.borrow_mut().params.set_op(opname); + } + + pub fn push_comment(&mut self, comment: String) { + self.inner.borrow_mut().stmts.push_comment(comment) + } + + // pub fn get_stmts(&self) -> Vec { + // self.inner.borrow().stmts.clone() + // } + + pub fn var_from(&mut self, from: Option) -> MetaVarCell { + self.inner.borrow_mut().var_from(from, self.clone()) + } + + pub fn new_var(&mut self) -> MetaVarCell { + self.var_from(None) + } + + /// Easy way to create new imm value + pub fn new_imm(&mut self, imm: usize) -> MetaVarCell { + let arg = Some(VarPos::Imm(asm::ImmId::Cst(imm as u16))); + self.var_from(arg) + } + + /// Easy way to create constant backed in register + pub fn new_cst(&mut self, cst: usize) -> MetaVarCell { + let mut var = self.var_from(None); + var.reg_alloc_mv(); + // Force val to 0 then add cst value + var -= var.clone(); + if cst != 0 { + let imm = self.new_imm(cst); + var += imm; + } + + var + } + + /// Create templated arguments + /// kind is used to specify if it's bind to src/dst or immediat template + /// pos_id is used to bind the template to an IOp operand position + // TODO pass the associated operand or immediat to obtain the inner blk properties instead of + // using the global one + pub fn iop_template_var(&mut self, kind: asm::OperandKind, pos_id: u8) -> Vec { + let nb_blk = self.params().blk_w() as u8; + match kind { + asm::OperandKind::Src => { + // Digit in iop arg are contiguous + (0..nb_blk) + .map(|bid| { + let mid = asm::MemId::new_src(pos_id, bid); + self.var_from(Some(VarPos::Mem(mid))) + }) + .collect::>() + } + 
asm::OperandKind::Dst => { + // Digit in iop arg are contiguous + (0..nb_blk) + .map(|bid| { + let mid = asm::MemId::new_dst(pos_id, bid); + self.var_from(Some(VarPos::Mem(mid))) + }) + .collect::>() + } + asm::OperandKind::Imm => (0..nb_blk) + .map(|bid| { + let iid = asm::ImmId::new_var(pos_id, bid); + self.var_from(Some(VarPos::Imm(iid))) + }) + .collect::>(), + asm::OperandKind::Unknown => panic!("Template var required a known kind"), + } + } + + pub fn push_stmt(&mut self, asm: asm::dop::DOp) -> StmtLink { + let pos = self.borrow_mut().stmts.push_stmt_pos(asm); + StmtLink { + prog: self.clone(), + pos: vec![pos], + } + } +} + +#[derive(PartialEq, Eq, Debug)] +pub enum AtomicRegType { + NewRange(usize), + Existing(asm::RegId), + None, +} + +// Register utilities +impl Program { + /// Bulk reserve + /// Evict value from cache in a bulk manner. This enable to prevent false dependency of bulk + /// operations when cache is almost full Enforce that at least bulk_size register is `free` + pub(crate) fn reg_bulk_reserve(&self, bulk_size: usize) { + // Iter from Lru -> MRu and take bulk_size regs + let to_evict = self + .inner + .borrow() + .regs + .iter() + .rev() + .take(bulk_size) + .filter(|(_, var)| var.is_some()) + .map(|(_, var)| var.as_ref().unwrap().clone()) + .collect::>(); + + // Evict metavar to heap and release + to_evict.into_iter().for_each(|var| { + // Evict in memory if needed + if let Ok(cell) = MetaVarCell::try_from(&var) { + cell.heap_alloc_mv(true); + } + }); + } + + /// Removes the given register from use + pub fn reg_pop(&self, rid: &asm::RegId) -> Option { + self.inner.borrow_mut().regs.pop(rid).unwrap() + } + + /// Adds the given register for use + pub fn reg_put(&self, rid: asm::RegId, meta: Option) { + self.inner.borrow_mut().reg_put(rid, meta); + } + + // Inspects the register cache and yields the requested register ranges, if + // possible. This does not touch the cache state. 
+ pub fn atomic_reg_range(&self, ranges: &[AtomicRegType]) -> Option> { + let mut borrow = self.inner.borrow_mut(); + + // Clone the register cache to restore it at the end + let backup = borrow.regs.clone(); + + // Remove first all already allocated ranges + ranges.iter().for_each(|r| { + if let AtomicRegType::Existing(rid) = r { + borrow.regs.pop(rid); + } + }); + + let result: Option> = ranges + .iter() + .map(|r| { + match r { + AtomicRegType::NewRange(r) => borrow.aligned_reg_range(*r).inspect(|rid| { + borrow.regs.pop(rid); + }), + AtomicRegType::Existing(rid) => Some(*rid), + // To ignore + AtomicRegType::None => Some(asm::RegId::default()), + } + }) + .collect(); + + // Restore the cache state + borrow.regs = backup; + + result + } +} + +impl From for asm::Program { + fn from(value: Program) -> Self { + let inner = value.inner.borrow(); + inner.stmts.clone() + } +} + +/// Syntax sugar to help user wrap PbsLut in MetaVarCell +#[macro_export] +macro_rules! new_pbs { + ( + $prog:ident, $pbs: literal + ) => { + ::paste::paste! { + $prog.var_from(Some(metavar::VarPos::Pbs(asm::dop::[]::default().into()))) + } + }; +} + +/// To get an asm PBS from its name +#[macro_export] +macro_rules! pbs_by_name { + ( + $pbs: literal + ) => { + ::paste::paste! 
{ + asm::Pbs::[<$pbs:camel>](asm::dop::[]::default()) + } + }; +} diff --git a/backends/tfhe-hpu-backend/src/fw/rtl/config.rs b/backends/tfhe-hpu-backend/src/fw/rtl/config.rs new file mode 100644 index 000000000..3e3f82da7 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/rtl/config.rs @@ -0,0 +1,89 @@ +use std::collections::HashMap; + +#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize, Default)] +pub struct OpCfg { + /// Whether to fill the batch fifo when scheduling or not + pub fill_batch_fifo: bool, + /// Uses the minimum batch size in the firmware generation + pub min_batch_size: bool, + /// The current flush behavior if flushing + pub flush_behaviour: FlushBehaviour, + /// Whether to emit flushes or not + pub flush: bool, + /// Whether to use latency tiers when scheduling + pub use_tiers: bool, +} + +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct RtlCfg { + by_op: HashMap, + default: OpCfg, +} + +#[derive(Debug, Clone, Copy, serde::Deserialize, serde::Serialize, Default)] +pub enum FlushBehaviour { + #[default] + Patient, + NoPBS, + Opportunist, + Timeout(usize), +} + +impl From for FlushBehaviour { + fn from(s: String) -> Self { + match s.as_str() { + "Patient" => FlushBehaviour::Patient, + "NoPBS" => FlushBehaviour::NoPBS, + "Opportunist" => FlushBehaviour::Opportunist, + // Try to parse the string as a number for the Timeout variant + _ => { + if let Ok(timeout) = s.parse::() { + FlushBehaviour::Timeout(timeout) + } else { + // Default case if parsing fails, you can handle it as needed + FlushBehaviour::Patient + } + } + } + } +} + +impl std::fmt::Display for FlushBehaviour { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FlushBehaviour::Patient => write!(f, "Patient"), + FlushBehaviour::NoPBS => write!(f, "NoPBS"), + FlushBehaviour::Opportunist => write!(f, "Opportunist"), + FlushBehaviour::Timeout(val) => write!(f, "Timeout({val}_ms)"), + } + } +} + +impl RtlCfg { + 
pub fn new(default: OpCfg) -> Self { + Self { + by_op: HashMap::new(), + default, + } + } + + pub fn insert(&mut self, key: &str, value: OpCfg) { + self.by_op.insert(key.to_string(), value); + } + + pub fn default(&self) -> OpCfg { + self.default + } +} + +impl From for RtlCfg { + fn from(value: OpCfg) -> Self { + Self::new(value) + } +} + +impl RtlCfg { + pub fn get(&self, key: &str) -> OpCfg { + *self.by_op.get(key).unwrap_or(&self.default) + } +} diff --git a/backends/tfhe-hpu-backend/src/fw/rtl/macros.rs b/backends/tfhe-hpu-backend/src/fw/rtl/macros.rs new file mode 100644 index 000000000..b422a4d1a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/rtl/macros.rs @@ -0,0 +1,82 @@ +#[macro_export] +macro_rules! rtl_op { + ( + $name: literal, + $kind: ident, + $data: ty + ) => { + ::paste::paste! { + #[derive(Clone)] + pub struct [<$name:camel Op>] { + src: Vec, + dst: Vec>, + uid: usize, + load_stats: Option, + data: $data, + } + + impl std::hash::Hash for [<$name:camel Op>] { + fn hash(&self, state: &mut H) { + self.uid.hash(state); + } + } + + impl std::cmp::PartialEq for [<$name:camel Op>] { + fn eq(&self, other: &[<$name:camel Op>]) -> bool { + self.uid == other.uid + } + } + + impl std::cmp::Eq for [<$name:camel Op>] { } + + impl std::ops::Drop for [<$name:camel Op>] { + fn drop(&mut self) { + trace!(target: "rtl", "Operation Dropped: {:?}", &self); + } + } + + impl OperationTrait for [<$name:camel Op>] { + fn clone_on(&self, prog: &Program) -> Operation { + Operation::[<$name:upper>](Self { + src: self.src.iter().map(|v| v.clone_on(prog)).collect(), + dst: self.dst.iter().map(|_| None).collect(), + uid: self.uid, + load_stats: self.load_stats.clone(), + data: self.data.clone(), + }) + } + + #[cfg(feature = "rtl_graph")] + fn name(&self) -> &str { + $name + } + fn kind(&self) -> InstructionKind { + InstructionKind::$kind + } + + fn clear_src(&mut self) { self.src.clear() } + fn clear_dst(&mut self) { self.dst.clear() } + + // Ideally this could be derived 
using getset but I don't seem + // to find a way to do it in a trait + + fn dst(&self) -> &Vec> { &self.dst } + fn src(&self) -> &Vec { &self.src } + fn uid(&self) -> &usize { &self.uid } + fn load_stats(&self) -> &Option { &self.load_stats } + fn load_stats_mut(&mut self) -> &mut Option { &mut self.load_stats } + fn dst_mut(&mut self) -> &mut Vec> { &mut self.dst } + } + + impl Debug for [<$name:camel Op>] { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.debug_struct($name) + .field("uid", self.uid()) + .field("dst", &self.dst().len()) + .field("data", &self.data) + .finish() + } + } + } + }; +} diff --git a/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs b/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs new file mode 100644 index 000000000..4a1e5b834 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/fw/rtl/mod.rs @@ -0,0 +1,1837 @@ +//! +//! A firmware abstraction layer in which the operation dependencies are +//! represented in a non acyclical graph. The resulting graph can then be used +//! to dump a series of instructions that maximize the target resources. 
+ +pub mod config; +mod macros; + +use super::isc_sim; +use super::isc_sim::report::PeStoreRpt; +use super::isc_sim::{InstructionKind, PeFlush, PeStore}; +use super::metavar::{MetaVarCell, PosKind, RegLockPtr, VarPos}; +use super::program::{AtomicRegType, Program, StmtLink}; +use crate::asm::{ImmId, Pbs, PbsLut}; +use crate::rtl_op; +use bitflags::bitflags; +use config::{FlushBehaviour, OpCfg}; +use enum_dispatch::enum_dispatch; +use std::cell::{Ref, RefCell, RefMut}; +use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; +use std::fmt; +use std::fmt::Debug; +use std::rc::Rc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use strum_macros::{Display, EnumDiscriminants, EnumString}; +use tracing::{debug, instrument, trace}; + +static COUNTER: AtomicUsize = AtomicUsize::new(1); +fn new_uid() -> usize { + COUNTER.fetch_add(1, Ordering::Relaxed) +} + +#[derive(Clone, Debug, Default)] +pub struct LoadStats { + depth: usize, +} + +// Encodes an operation priority when scheduling +// Derived ordering compares fields in declaration order: latency tier, then depth, then register balance (source count), then uid +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)] +struct Prio { + latency_tier: usize, + depth: usize, + reg_balance: usize, + uid: usize, +} + +impl From<&OperationCell> for Prio { + fn from(value: &OperationCell) -> Self { + let value = value.borrow(); + let stats = value.load_stats().clone().unwrap_or_default(); + Prio { + latency_tier: value.latency_tier(), + depth: stats.depth, + uid: *value.uid(), + reg_balance: value.src().len(), + } + } +} + +#[derive(Clone)] +pub struct Var { + driver: Option<(OperationCell, usize)>, + loads: HashSet, + meta: Option, + load_stats: Option, + uid: usize, +} + +impl Var { + pub fn clone_on(&self, prog: &Program) -> Var { + Var { + driver: self.driver.as_ref().map(|(d, i)| (d.clone_on(prog), *i)), + loads: HashSet::new(), + meta: self.meta.as_ref().map(|m| m.clone_on(prog)), + load_stats: self.load_stats.clone(), + ..*self + } + } +} + +impl std::ops::Drop for Var { + fn drop(&mut 
self) { + trace!("Var Dropped: {:?}", &self); + } +} + +impl std::cmp::PartialEq for Var { + fn eq(&self, other: &Var) -> bool { + self.uid == other.uid + } +} + +impl std::cmp::Eq for Var {} + +impl std::hash::Hash for Var { + fn hash(&self, state: &mut H) { + self.uid.hash(state); + } +} + +#[derive(Clone, PartialEq, Eq)] +pub struct VarCell(Rc>); + +impl VarCell { + // Purposely not public + fn borrow_mut(&self) -> RefMut<'_, Var> { + self.0.borrow_mut() + } + fn borrow(&self) -> Ref<'_, Var> { + self.0.borrow() + } + + pub fn clone_on(&self, prog: &Program) -> Self { + self.borrow().clone_on(prog).into() + } + + pub fn copy_uid(&self) -> usize { + self.borrow().uid + } + + pub fn copy_meta(&self) -> Option { + self.0.borrow().meta.clone() + } + + pub fn copy_driver(&self) -> Option<(OperationCell, usize)> { + self.borrow().driver.clone() + } + + pub fn copy_loads(&self) -> Vec { + self.borrow().loads.iter().cloned().collect() + } + + // If a variable's driver was removed, it is scheduled + pub fn is_ready(&self) -> bool { + !self.has_driver() + } + + pub fn has_driver(&self) -> bool { + self.0.borrow().driver.is_some() + } + + pub fn has_meta(&self) -> bool { + self.0.borrow().meta.is_some() + } + + pub fn set_driver(&self, op: Option<(OperationCell, usize)>) { + self.0.borrow_mut().driver = op; + } + + // The key is not mutable since OperationCell implements an immutable hash + #[allow(clippy::mutable_key_type)] + pub fn set_loads(&self, loads: HashSet) { + self.0.borrow_mut().loads = loads; + } + + pub fn set_load_stats(&self, load_stats: LoadStats) -> LoadStats { + self.borrow_mut().load_stats = Some(load_stats.clone()); + load_stats + } + + pub fn set_meta(&self, var: MetaVarCell) { + self.0.borrow_mut().meta = Some(var); + } + + pub fn add_load(&self, op: &OperationCell) { + self.0.borrow_mut().loads.insert(op.clone()); + } + + pub fn clear_driver(&self) { + self.0.borrow_mut().driver = None; + } + + pub fn remove_load(&self, load: &OperationCell) { + 
self.0.borrow_mut().loads.remove(load); + } + + pub fn copy_load_stats(&self) -> LoadStats { + let load_stats = self.borrow().load_stats.clone(); + load_stats.unwrap_or_else(|| self.set_load_stats(self.compute_load_stats())) + } + + //The load of a variable is the number of variables depending on it + //(excluding itself). + pub fn compute_load_stats(&self) -> LoadStats { + LoadStats { + depth: self + .copy_loads() + .into_iter() + .map(|d| d.copy_load_stats().depth) + .max() + .unwrap_or(0), + } + } + + // Adds references from root to leaf, recursively + pub fn load(&self) { + if let Some((d, i)) = self.copy_driver() { + if d.borrow().dst()[i].is_none() { + d.set_dst(i, self); + d.load(); + } + } + } + + pub fn new() -> VarCell { + VarCell(Rc::new(RefCell::new(Var { + driver: None, + loads: HashSet::new(), + meta: None, + uid: new_uid(), + load_stats: None, + }))) + } + + pub fn pbs(&self, lut: &Pbs) -> Vec { + let var: Vec<_> = (0..lut.lut_nb()).map(|_| VarCell::new()).collect(); + let new_op = PbsOp::new_op(var.as_slice(), lut, self); + var.iter() + .enumerate() + .for_each(|(i, v)| v.set_driver(Some((new_op.clone(), i)))); + var + } + + pub fn single_pbs(&self, lut: &Pbs) -> VarCell { + self.pbs(lut).into_iter().next().unwrap() + } + + pub fn mac(&self, cnst: usize, coeff: &VarCell) -> VarCell { + let var = VarCell::new(); + let new_op = MacOp::new_op(cnst, coeff, self); + var.set_driver(Some((new_op.clone(), 0))); + var + } + + pub fn from_vec(v: Vec) -> Vec { + v.into_iter().map(VarCell::from).collect() + } +} + +impl Default for VarCell { + fn default() -> Self { + Self::new() + } +} + +impl From for VarCell { + fn from(meta: MetaVarCell) -> VarCell { + let var = VarCell::new(); + var.set_meta(meta); + var + } +} + +impl From<&MetaVarCell> for VarCell { + fn from(meta: &MetaVarCell) -> VarCell { + let var = VarCell::new(); + var.set_meta(meta.clone()); + var + } +} + +impl From for VarCell { + fn from(var: Var) -> VarCell { + 
VarCell(Rc::new(RefCell::new(var))) + } +} + +impl std::hash::Hash for VarCell { + fn hash(&self, state: &mut H) { + self.borrow().hash(state) + } +} + +#[enum_dispatch(Operation)] +trait OperationTrait +where + Self: Sized + Debug + std::hash::Hash, +{ + fn dst(&self) -> &Vec>; + fn dst_mut(&mut self) -> &mut Vec>; + fn src(&self) -> &Vec; + fn uid(&self) -> &usize; + fn load_stats(&self) -> &Option; + fn load_stats_mut(&mut self) -> &mut Option; + fn clear_src(&mut self); + fn clear_dst(&mut self); + fn kind(&self) -> InstructionKind; + fn clone_on(&self, prog: &Program) -> Operation; + // {{{1 Debug + // ----------------------------------------------------------------------- + #[cfg(feature = "rtl_graph")] + fn name(&self) -> &str; + // }}} +} + +#[enum_dispatch(Operation)] +trait ToFlush +where + Self: Sized + Debug + std::hash::Hash, +{ + fn to_flush(&mut self) {} +} + +#[enum_dispatch(Operation)] +trait ProgManager +where + Self: Sized + Debug + std::hash::Hash + OperationTrait, +{ + // Analyzes the current program state to know if this operation can be added + // The blanket implementation handles the typical case where an operation + // has many sources and only a single destination + fn peek_prog(&self, prog: &mut Program) -> bool { + if self.dst()[0].is_some() { + let mut range: Vec<_> = self + .src() + .iter() + .map(|src| { + let meta = src.copy_meta().unwrap(); + match meta.get_pos() { + PosKind::REG => AtomicRegType::Existing(meta.as_reg().unwrap()), + PosKind::IMM => match meta.as_imm().unwrap() { + ImmId::Cst(_) => AtomicRegType::None, + ImmId::Var { .. 
} => AtomicRegType::NewRange(1), + }, + PosKind::PBS => AtomicRegType::None, + PosKind::MEM => AtomicRegType::NewRange(1), + PosKind::EMPTY => AtomicRegType::NewRange(1), + // EMPTY variables are a specially case for + // operations that result in a constant + // independently on the variable value itself, such + // as a-a + _ => panic!("Unexpected metavar position"), + } + }) + .collect(); + + if range.iter().any(|rng| !matches!(*rng, AtomicRegType::None)) { + range.push(AtomicRegType::NewRange(1)); + } + + prog.atomic_reg_range(range.as_slice()).is_some() + } else { + // This operation is not needed, just say yes + true + } + } + + // This blanket implementation handles the typical case where an operation + // has two sources and only a single destination + fn alloc1_prog(&mut self, prog: &mut Program) -> OpLock1 { + if let Some(dst) = self.dst()[0].as_ref() { + let mut a = self.src()[0].copy_meta().unwrap(); + let mut b = self.src()[1].copy_meta().unwrap(); + let mut d = prog.new_var(); + dst.set_meta(d.clone()); + + let alock = { + a.reg_alloc_mv(); + a.reg_lock() + }; + let block = { + b.reg_alloc_mv(); + b.reg_lock() + }; + + assert!((a.is_in(PosKind::REG) || a.is_cst()) && (b.is_in(PosKind::REG) || b.is_cst())); + + if !(a.is_cst() && b.is_cst()) { + d.reg_alloc_mv(); + } + + OpLock1 { + rd_lock: Some(vec![alock, block]), + wr_lock: Some(d.reg_lock()), + } + } else { + OpLock1::default() + } + } + + // Allocates program resources for this operation, including locking the + // necessary registers to itself and moving metavariables to registers + fn alloc_prog(&mut self, prog: &mut Program); + // Adds the operation to the program. 
All resources keep locked + fn add_prog(&mut self, prog: &mut Program); + // Free read resources + fn free_rd(&mut self); + // Free write resources + fn free_wr(&mut self); +} + +// Not every DOP is implemented, add more if you need more + +#[derive(Clone, Debug, Default)] +struct OpLock1 { + rd_lock: Option>, + wr_lock: Option, +} + +#[derive(Clone, Debug, Default)] +struct AddsData { + cnst: usize, + rd_lock: Option>, + wr_lock: Option, +} + +#[derive(Clone, Debug, Default)] +struct MacData { + lock: OpLock1, + mult: usize, +} + +#[derive(Clone, Debug)] +struct PbsData { + lut: Pbs, + rd_lock: Option, + wr_lock: Option>, + stmt_link: Option, +} + +rtl_op!("ADDS", Arith, AddsData); +rtl_op!("ADD", Arith, OpLock1); +rtl_op!("SUB", Arith, OpLock1); +rtl_op!("SUBS", Arith, AddsData); +rtl_op!("MAC", Arith, MacData); +rtl_op!("MULS", Arith, MacData); +rtl_op!("PBS", Pbs, PbsData); +rtl_op!("ST", MemSt, Option); + +impl ProgManager for AddsOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data = if let Some(dst) = self.dst()[0].as_ref() { + let mut a = self.src()[0].copy_meta().unwrap(); + let mut d = prog.new_var(); + dst.set_meta(d.clone()); + + let alock = { + a.reg_alloc_mv(); + a.reg_lock() + }; + if !a.is_cst() { + d.reg_alloc_mv(); + } + + AddsData { + cnst: self.data.cnst, + rd_lock: Some(vec![alock]), + wr_lock: Some(d.reg_lock()), + } + } else { + AddsData::default() + } + } + + fn add_prog(&mut self, prog: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let b = prog.new_imm(self.data.cnst); + let d = d.copy_meta().unwrap(); + d.add_raw(&a, &b, false); + } + } + + fn free_rd(&mut self) { + self.data.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.wr_lock = None; + } +} + +impl ProgManager for SubsOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data = if let Some(dst) = self.dst()[0].as_ref() { + let mut a = self.src()[0].copy_meta().unwrap(); + let mut d = 
prog.new_var(); + dst.set_meta(d.clone()); + + let alock = { + a.reg_alloc_mv(); + a.reg_lock() + }; + if !a.is_cst() { + d.reg_alloc_mv(); + } + + AddsData { + cnst: self.data.cnst, + rd_lock: Some(vec![alock]), + wr_lock: Some(d.reg_lock()), + } + } else { + AddsData::default() + } + } + + fn add_prog(&mut self, prog: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let b = prog.new_imm(self.data.cnst); + let d = d.copy_meta().unwrap(); + d.sub_raw(&a, &b, false); + } + } + + fn free_rd(&mut self) { + self.data.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.wr_lock = None; + } +} + +impl ProgManager for AddOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data = self.alloc1_prog(prog) + } + + fn add_prog(&mut self, _: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let b = self.src[1].copy_meta().unwrap(); + let d = d.copy_meta().unwrap(); + d.add_raw(&a, &b, false); + } + } + + fn free_rd(&mut self) { + self.data.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.wr_lock = None; + } +} + +impl ProgManager for SubOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data = self.alloc1_prog(prog) + } + + fn add_prog(&mut self, _: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let b = self.src[1].copy_meta().unwrap(); + let d = d.copy_meta().unwrap(); + d.sub_raw(&a, &b, false); + } + } + + fn free_rd(&mut self) { + self.data.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.wr_lock = None; + } +} + +impl ProgManager for MacOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data.lock = self.alloc1_prog(prog) + } + + fn add_prog(&mut self, _: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let b = self.src[1].copy_meta().unwrap(); + let d = d.copy_meta().unwrap(); + d.mac_raw(&a, 
self.data.mult as u8, &b); + } + } + + fn free_rd(&mut self) { + self.data.lock.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.lock.wr_lock = None; + } +} + +impl ProgManager for MulsOp { + fn alloc_prog(&mut self, prog: &mut Program) { + self.data.lock = if let Some(dst) = self.dst()[0].as_ref() { + let mut a = self.src()[0].copy_meta().unwrap(); + let mut d = prog.new_var(); + dst.set_meta(d.clone()); + + let alock = { + a.reg_alloc_mv(); + a.reg_lock() + }; + + assert!((a.is_in(PosKind::REG) || a.is_cst())); + + if !a.is_cst() { + d.reg_alloc_mv(); + } + + OpLock1 { + rd_lock: Some(vec![alock]), + wr_lock: Some(d.reg_lock()), + } + } else { + OpLock1::default() + }; + } + + fn add_prog(&mut self, prog: &mut Program) { + if let Some(d) = self.dst[0].as_ref() { + let a = self.src[0].copy_meta().unwrap(); + let d = d.copy_meta().unwrap(); + d.mul(&a, &prog.new_imm(self.data.mult)); + } + } + + fn free_rd(&mut self) { + self.data.lock.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.lock.wr_lock = None; + } +} + +impl ProgManager for PbsOp { + fn peek_prog(&self, prog: &mut Program) -> bool { + // Make sure there's at least one used destination + assert!(self.dst().iter().any(|d| d.is_some())); + + let mut range: Vec<_> = self + .src() + .iter() + .map(|src| { + let meta = src.copy_meta().unwrap(); + match meta.get_pos() { + PosKind::REG => AtomicRegType::Existing(meta.as_reg().unwrap()), + PosKind::MEM => AtomicRegType::NewRange(1), + PosKind::EMPTY => panic!("Cannot operate on an empty variable"), + _ => panic!("Unexpected metavar position"), + } + }) + .collect(); + range.push(AtomicRegType::NewRange(self.dst().len())); + + prog.atomic_reg_range(range.as_slice()).is_some() + } + + fn alloc_prog(&mut self, prog: &mut Program) { + let reg_start = prog + .atomic_reg_range(&[AtomicRegType::NewRange(self.dst.len())]) + .unwrap()[0]; + + let d = self + .dst() + .iter() + .enumerate() + .map(|(i, d)| { + let meta = prog.new_var(); + 
meta.force_reg_alloc(reg_start + i); + d.as_ref().inspect(|d| d.set_meta(meta.clone())); + meta + }) + .collect::>(); + + self.data.wr_lock = Some(d.into_iter().map(|mut d| d.reg_lock()).collect()); + // Assume at least one destination is needed + let mut a = self.src()[0].copy_meta().unwrap(); + a.reg_alloc_mv(); + + assert!( + a.is_in(PosKind::REG), + "Cannot do a PBS from something other than a register" + ); + self.data.rd_lock = Some(a.reg_lock()); + } + + fn add_prog(&mut self, prog: &mut Program) { + let pbs = prog.var_from(Some(VarPos::Pbs(self.data.lut.clone()))); + let tfhe_params = prog.params().clone().into(); + let src = self.src[0].copy_meta().unwrap(); + let dst = self + .data + .wr_lock + .as_ref() + .unwrap() + .iter() + .map(MetaVarCell::from) + .collect::>(); + + self.data.stmt_link = Some(MetaVarCell::pbs_raw( + &dst.iter().collect::>(), + &src, + &pbs, + false, + &tfhe_params, + )); + } + + fn free_rd(&mut self) { + self.data.rd_lock = None; + } + + fn free_wr(&mut self) { + self.data.wr_lock = None; + } +} + +impl ProgManager for StOp { + fn peek_prog(&self, prog: &mut Program) -> bool { + // If this is not needed or there's no meta to go to, it's definitely + // possible to add this operation to the program + if self.dst[0].is_none() || !self.dst[0].as_ref().unwrap().has_meta() { + return true; + } + + let meta = self.src[0].copy_meta().unwrap(); + let range = [match meta.get_pos() { + PosKind::REG => AtomicRegType::Existing(meta.as_reg().unwrap()), + PosKind::MEM => AtomicRegType::NewRange(1), + PosKind::EMPTY => panic!("Cannot operate on an empty variable"), + _ => panic!("Unexpected metavar position"), + }]; + prog.atomic_reg_range(&range).is_some() + } + + fn alloc_prog(&mut self, _: &mut Program) { + // There's no need to allocate anything if there's no destination or the + // destination has no meta yet, in which case we can simply copy the + // source meta to it + if self.dst[0].is_some() && self.dst[0].as_ref().unwrap().has_meta() { + 
let mut a = self.src()[0].copy_meta().unwrap(); + a.reg_alloc_mv(); + assert!( + a.is_in(PosKind::REG), + "Cannot move to a destination from a location other than a register" + ); + self.data = Some(a.reg_lock()); + } + } + + fn add_prog(&mut self, _: &mut Program) { + let rhs = self.src[0].copy_meta().unwrap(); + if let Some(dst) = self.dst[0].as_ref() { + if let Some(mut lhs) = dst.copy_meta() { + lhs <<= rhs.clone(); + } else { + dst.set_meta(rhs.clone()); + } + } + } + + fn free_rd(&mut self) { + self.data = None; + } + + fn free_wr(&mut self) {} +} + +impl AddsOp { + fn new_op(var: &VarCell, cnst: usize) -> OperationCell { + let op = AddsOp { + src: vec![var.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: AddsData { + cnst, + rd_lock: None, + wr_lock: None, + }, + }; + OperationCell(Rc::new(RefCell::new(Operation::ADDS(op)))) + } +} + +impl SubsOp { + fn new_op(var: &VarCell, cnst: usize) -> OperationCell { + let op = SubsOp { + src: vec![var.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: AddsData { + cnst, + rd_lock: None, + wr_lock: None, + }, + }; + OperationCell(Rc::new(RefCell::new(Operation::SUBS(op)))) + } +} + +impl AddOp { + fn new_op(lhs: &VarCell, rhs: &VarCell) -> OperationCell { + let op = AddOp { + src: vec![lhs.clone(), rhs.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: OpLock1::default(), + }; + OperationCell(Rc::new(RefCell::new(Operation::ADD(op)))) + } +} + +impl SubOp { + fn new_op(lhs: &VarCell, rhs: &VarCell) -> OperationCell { + let op = SubOp { + src: vec![lhs.clone(), rhs.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: OpLock1::default(), + }; + OperationCell(Rc::new(RefCell::new(Operation::SUB(op)))) + } +} + +impl MacOp { + fn new_op(mult: usize, coeff: &VarCell, acc: &VarCell) -> OperationCell { + let op = MacOp { + src: vec![coeff.clone(), acc.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: 
MacData { + mult, + lock: OpLock1::default(), + }, + }; + OperationCell(Rc::new(RefCell::new(Operation::MAC(op)))) + } +} + +impl MulsOp { + fn new_op(var: &VarCell, mult: usize) -> OperationCell { + let op = MulsOp { + src: vec![var.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: MacData { + mult, + lock: OpLock1::default(), + }, + }; + OperationCell(Rc::new(RefCell::new(Operation::MULS(op)))) + } +} + +impl PbsOp { + fn new_op(dst: &[VarCell], lut: &Pbs, lhs: &VarCell) -> OperationCell { + let op = PbsOp { + src: vec![lhs.clone()], + dst: dst.iter().map(|_| None).collect(), + data: PbsData { + lut: lut.clone(), + rd_lock: None, + wr_lock: None, + stmt_link: None, + }, + uid: new_uid(), + load_stats: None, + }; + OperationCell(Rc::new(RefCell::new(Operation::PBS(op)))) + } +} + +impl StOp { + fn new_op(src: &VarCell) -> OperationCell { + let op = StOp { + src: vec![src.clone()], + dst: vec![None], + uid: new_uid(), + load_stats: None, + data: None, + }; + OperationCell(Rc::new(RefCell::new(Operation::ST(op)))) + } +} + +impl ToFlush for AddsOp {} +impl ToFlush for SubsOp {} +impl ToFlush for AddOp {} +impl ToFlush for SubOp {} +impl ToFlush for MacOp {} +impl ToFlush for MulsOp {} +impl ToFlush for PbsOp { + fn to_flush(&mut self) { + if let Some(asm) = &mut self.data.stmt_link { + asm.to_flush(); + } + } +} +impl ToFlush for StOp {} + +#[enum_dispatch] +#[derive(EnumDiscriminants, Debug, Hash, PartialEq, Eq, Clone)] +#[strum_discriminants(name(OperationNames))] +#[strum_discriminants(derive(EnumString, Display))] +pub enum Operation { + ADDS(AddsOp), + SUBS(SubsOp), + ADD(AddOp), + SUB(SubOp), + MAC(MacOp), + MULS(MulsOp), + PBS(PbsOp), + ST(StOp), +} + +// Divide the operations into latency tiers to help the scheduler decide the +// best order. 
+impl Operation { + pub fn latency_tier(&self) -> usize { + match self { + Operation::PBS(_) => 0, + _ => 1, + } + } + + fn is_pbs(&self) -> bool { + matches!(self, Operation::PBS(_)) + } +} + +// All pointers are reference counted pointers in the tree, both drivers and +// loads. This is because the FW when building the tree will hold only end +// variables, while when scheduling we'll hold source variables. While +// scheduling the tree needs to be de-constructed carefully so that it can be +// fully dropped. +#[derive(Clone, Eq, Debug)] +pub struct OperationCell(Rc>); + +impl OperationCell { + fn borrow(&self) -> Ref<'_, Operation> { + self.0.borrow() + } + fn is_ready(&self) -> bool { + self.borrow().src().iter().all(|x| x.is_ready()) + } + fn copy_dst(&self) -> Vec> { + self.0.borrow().dst().clone() + } + fn copy_src(&self) -> Vec { + self.0.borrow().src().clone() + } + fn kind(&self) -> InstructionKind { + self.0.borrow().kind() + } + fn latency_tier(&self) -> usize { + self.0.borrow().latency_tier() + } + fn is_pbs(&self) -> bool { + self.0.borrow().is_pbs() + } + + pub fn set_load_stats(&self, stats: LoadStats) -> LoadStats { + *self.0.borrow_mut().load_stats_mut() = Some(stats.clone()); + stats + } + + fn set_dst(&self, idx: usize, dst: &VarCell) { + self.0.borrow_mut().dst_mut()[idx] = Some(dst.clone()); + } + + // Adds references from root to leaf, recursively + pub fn load(&self) { + self.copy_src().into_iter().for_each(|v| { + v.add_load(self); + v.load(); + }); + } + + // Removes all links from roots to leaves + fn unload(&self) { + self.0 + .borrow() + .src() + .iter() + .for_each(|s| s.remove_load(self)); + self.0.borrow_mut().dst_mut().iter_mut().for_each(|s| { + *s = None; + }); + } + + // Removes ourselves from the load list of any variable by following the + // source list and clears the source list + fn clear_src(&self) { + self.0 + .borrow() + .src() + .iter() + .for_each(|s| s.remove_load(self)); + self.0.borrow_mut().clear_src() + } + + // 
Removes ourselves from the driver pointer of any variable by following the + // destination list and clears the destination list + fn clear_dst(&self) { + self.0 + .borrow() + .dst() + .iter() + .filter(|s| s.is_some()) + .for_each(|s| s.as_ref().unwrap().clear_driver()); + self.0.borrow_mut().clear_dst() + } + + // The load of an operation is the amount of variables directly and + // indirectly driven by it. The load of a variable is the number of + // variables depending on it (excluding itself). + fn compute_load_stats(&self) -> LoadStats { + LoadStats { + depth: self + .copy_dst() + .into_iter() + .flatten() + .map(|d| d.copy_load_stats().depth + 1) + .max() + .unwrap_or(0), + } + } + + pub fn copy_load_stats(&self) -> LoadStats { + let load_stats = self.borrow().load_stats().clone(); + load_stats.unwrap_or_else(|| self.set_load_stats(self.compute_load_stats())) + } + + // Removes self from all sources and destinations so that the program can + // evict this variable and returns a list of operations that depends on this + // one + // You should drop the OperationCell holding this too after + fn remove(&self) -> Vec { + let loads = self + .borrow() + .dst() + .iter() + .filter_map(|dst| dst.as_ref().map(|d| d.copy_loads().into_iter())) + .flatten() + .collect::>(); + + // Remove and mark this operation as ready + self.clear_src(); + self.clear_dst(); + + loads.into_iter().filter(|op| op.is_ready()).collect() + } + + fn get_all_ops(&self) -> Vec { + let mut ret = vec![self.clone()]; + let mut other: Vec<_> = self + .borrow() + .src() + .iter() + .filter_map(|s| s.copy_driver()) + .flat_map(|d| d.0.get_all_ops()) + .collect(); + ret.append(&mut other); + ret + } + + fn to_flush(&self) { + self.0.borrow_mut().to_flush() + } + + fn peek_prog(&self, prog: Option<&mut Program>) -> bool { + prog.map(|prog| self.0.borrow_mut().peek_prog(prog)) + .unwrap_or(true) + } + + fn alloc_prog(&mut self, prog: Option<&mut Program>) { + if let Some(prog) = prog { + 
self.0.borrow_mut().alloc_prog(prog); + } + } + + fn add_prog(&self, prog: Option<&mut Program>) { + if let Some(prog) = prog { + self.0.borrow_mut().add_prog(prog) + } + } + + fn free_rd(&mut self) { + self.0.borrow_mut().free_rd(); + } + + // Free write resources + fn free_wr(&mut self) { + self.0.borrow_mut().free_wr(); + } + + fn clone_on(&self, prog: &Program) -> Self { + self.0.borrow().clone_on(prog).into() + } + + // {{{1 Debug + // ----------------------------------------------------------------------- + #[cfg(feature = "rtl_graph")] + fn copy_uid(&self) -> usize { + *self.0.borrow().uid() + } + #[cfg(feature = "rtl_graph")] + fn copy_name(&self) -> String { + String::from(self.borrow().name()) + } + #[cfg(feature = "rtl_graph")] + fn get_heads(&self) -> HashSet { + let heads: HashSet<_> = self + .borrow() + .src() + .iter() + .filter_map(|s| s.copy_driver()) + .flat_map(|d| d.0.get_heads()) + .collect(); + if heads.len() > 0 { + heads + } else { + [self.clone()].into_iter().collect() + } + } + // ----------------------------------------------------------------------- + // }}} +} + +impl std::hash::Hash for OperationCell { + fn hash(&self, state: &mut H) { + self.borrow().hash(state) + } +} + +impl From for OperationCell { + fn from(value: Operation) -> Self { + OperationCell(Rc::new(RefCell::new(value))) + } +} + +// These are implemented to order operations in the BinaryHeap of pending +// operations +impl Ord for OperationCell { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + Prio::from(other).cmp(&Prio::from(self)) + } +} + +impl PartialOrd for OperationCell { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for OperationCell { + fn eq(&self, other: &Self) -> bool { + matches!(self.cmp(other), std::cmp::Ordering::Equal) + } +} + +impl std::ops::Add for &VarCell { + type Output = VarCell; + + fn add(self, other: &VarCell) -> VarCell { + let var = VarCell::new(); + let new_op = AddOp::new_op(self, 
other); + var.set_driver(Some((new_op.clone(), 0))); + var + } +} + +impl std::ops::Add for &VarCell { + type Output = VarCell; + + fn add(self, other: usize) -> VarCell { + let var = VarCell::new(); + let new_op = AddsOp::new_op(self, other); + var.set_driver(Some((new_op.clone(), 0))); + var + } +} + +impl std::ops::Sub for &VarCell { + type Output = VarCell; + + fn sub(self, other: &VarCell) -> VarCell { + let var = VarCell::new(); + let new_op = SubOp::new_op(self, other); + var.set_driver(Some((new_op.clone(), 0))); + var + } +} + +impl std::ops::Sub for &VarCell { + type Output = VarCell; + + fn sub(self, other: usize) -> VarCell { + let var = VarCell::new(); + let new_op = SubsOp::new_op(self, other); + var.set_driver(Some((new_op.clone(), 0))); + var + } +} + +impl std::ops::Mul for &VarCell { + type Output = VarCell; + + fn mul(self, coeff: usize) -> VarCell { + let var = VarCell::new(); + let new_op = MulsOp::new_op(self, coeff); + var.set_driver(Some((new_op.clone(), 0))); + var + } +} + +impl std::ops::ShlAssign<&VarCell> for VarCell { + fn shl_assign(&mut self, rhs: &VarCell) { + let new_op = StOp::new_op(rhs); + self.set_driver(Some((new_op.clone(), 0))); + } +} + +// I was expecting more events to be waited for... +bitflags! 
{ + #[derive(Clone)] + struct WaitEvents: u8 { + const RdUnlock = 0x1; + } +} + +// Used to emulate the ALU store add instructions to the program and manipulate +// the register file +struct Arch { + pe_store: PeStore, + program: Option, + cycle: usize, + events: BinaryHeap, + queued: HashMap>, + rd_pdg: HashMap>, + wr_pdg: HashMap>, + use_ipip: bool, + cfg: OpCfg, + timeout: Option, + waiting_for: WaitEvents, +} + +// An interface to the target architecture +// Responsible for simulating the architecture and inserting operations into the +// program +// TODO: The whole Arch could be a trait, so that this whole infrastructure +// could be re-used in other contexts outside our HPU firmware generation +impl Arch { + // interface + pub fn try_dispatch(&mut self, op: BinaryHeap) -> BinaryHeap { + // Postpone scheduling high latency operations until there's no other + // option to keep everything going. This is very heuristic, so this + // behavior could be turned off on an iop basis. + let mut max_tier = if self.cfg.use_tiers { + self.max_tier().unwrap_or(0) + } else { + 0 + }; + + self.waiting_for = WaitEvents::empty(); + + let ret = op + .into_sorted_vec() + .into_iter() + .filter_map(|mut op| { + if op.latency_tier() >= max_tier { + if let Some(id) = { + // Shortcut peeking the program if the PE won't + // accept our kind. Peeking the program is very + // heavy. 
+ self.pe_store + .avail_kind() + .intersects(op.kind()) + .then_some(true) + .and_then(|_| { + let prog_ok = op.peek_prog(self.program.as_mut()); + self.waiting_for.set(WaitEvents::RdUnlock, !prog_ok); + prog_ok.then_some(true) + }) + .and_then(|_| self.pe_store.try_push(op.kind(), false)) + } { + max_tier = if self.cfg.use_tiers { + max_tier.max(op.latency_tier()) + } else { + 0 + }; + + op.alloc_prog(self.program.as_mut()); + + self.rd_pdg.entry(id).or_default().push_front(op); + trace!("rd_pdg: {:?}", self.rd_pdg); + + None + } else { + Some(op) + } + } else { + Some(op) + } + }) + .collect::>(); + + // Select the flush behavior + match (self.use_ipip, self.cfg.flush_behaviour) { + (true, _) | (false, FlushBehaviour::Opportunist) => { + self.probe_for_exec(Some(PeFlush::Force)); + } + (false, FlushBehaviour::NoPBS) => { + let flush = (!ret.iter().any(|i| i.is_pbs())).then_some(PeFlush::Force); + self.probe_for_exec(flush); + } + (false, FlushBehaviour::Patient) => { + self.probe_for_exec(None); + let flush = (ret.is_empty() && self.events.is_empty()).then_some(PeFlush::Force); + self.probe_for_exec(flush); + } + (false, FlushBehaviour::Timeout(_)) => { + self.probe_for_exec(None); + } + }; + + ret + } + + pub fn max_tier(&self) -> Option { + self.rd_pdg + .values() + .chain(self.queued.values()) + .chain(self.wr_pdg.values()) + .flatten() + .map(|op| op.latency_tier()) + .max() + } + + pub fn done(&mut self) -> Option { + assert!(!self.events.is_empty()); + + let waiting_for = self.waiting_for.clone(); + let mut waiting = (true, None); + + while let (true, _) = waiting { + trace!("---------- Processing Loop ------------"); + trace!("Events: {:?}", self.events); + trace!("rd_pdg: {:?}", self.rd_pdg); + trace!("queued: {:?}", self.queued); + trace!("wr_pdg: {:?}", self.wr_pdg); + trace!("---------------------------------------"); + + let event = { + let mut event = self.events.pop(); + if self.timeout.is_some() + && self.timeout.unwrap() + < 
event.as_ref().map(|x| x.at_cycle).unwrap_or(usize::MAX) + { + self.probe_for_exec(Some(PeFlush::Timeout)); + self.timeout = None; + if let Some(event) = event { + self.events.push(event); + } + event = self.events.pop(); + } + event + }; + + waiting = if let Some(isc_sim::Event { + at_cycle, + event_type, + }) = event + { + self.cycle = at_cycle; + + match event_type { + isc_sim::EventType::RdUnlock(_, id) => { + // update associated pe state + self.pe_store.rd_unlock(id); + self.probe_for_exec(None); + + let mut op = self.rd_pdg.get_mut(&id).unwrap().pop_back().unwrap(); + op.add_prog(self.program.as_mut()); + op.free_rd(); + self.queued.entry(id).or_default().push_front(op); + (!(waiting_for.intersects(WaitEvents::RdUnlock)), None) + } + isc_sim::EventType::BatchStart { pe_id, issued } => { + self.queued.entry(pe_id).and_modify(|fifo| { + let mut batch = fifo.split_off(fifo.len() - issued); + if self.cfg.flush { + batch.front_mut().unwrap().to_flush(); + } + let fifo = self.wr_pdg.entry(pe_id).or_default(); + batch.into_iter().for_each(|e| fifo.push_front(e)); + }); + (true, None) + } + isc_sim::EventType::WrUnlock(_, id) => { + // update associated pe state + self.pe_store.wr_unlock(id); + self.probe_for_exec(None); + + let mut op = self.wr_pdg.get_mut(&id).unwrap().pop_back().unwrap(); + op.free_wr(); + (false, Some(op)) + } + _ => panic!("Received an unexpected event: {event_type:?}"), + } + } else { + (false, None) + }; + } + waiting.1 + } + + pub fn busy(&self) -> bool { + (!self.events.is_empty()) + || (self.pe_store.pending() != 0) + || (self.rd_pdg.iter().any(|x| !x.1.is_empty())) + } + + pub fn cycle(&self) -> usize { + self.cycle + } + + fn report_usage(&self) -> PeStoreRpt { + PeStoreRpt::from(&self.pe_store) + } + + fn probe_for_exec(&mut self, flush: Option) { + self.events.extend( + self.pe_store + .probe_for_exec(self.cycle, flush) + .into_iter() + .filter( + |isc_sim::Event { + at_cycle: _, + event_type: ev, + }| { + match 
(self.cfg.flush_behaviour, ev) { + ( + FlushBehaviour::Timeout(timeout), + isc_sim::EventType::ReqTimeout(_, _), + ) => { + self.timeout = Some(self.cycle + timeout); + false + } + (FlushBehaviour::Timeout(_), isc_sim::EventType::DelTimeout(_, _)) => { + self.timeout = None; + false + } + (_, isc_sim::EventType::ReqTimeout(_, _)) + | (_, isc_sim::EventType::DelTimeout(_, _)) => false, + _ => true, + } + }, + ), + ); + } +} + +impl Arch { + fn from(program: &Program) -> Self { + let params = program.params(); + let op_cfg = program.op_cfg(); + let mut pe_store = PeStore::from(params.pe_cfg.clone()); + + if op_cfg.min_batch_size { + pe_store.set_min_batch_limit(); + } + + if !op_cfg.fill_batch_fifo { + pe_store.set_fifo_to_batch_limit(); + } + + Arch { + pe_store, + program: Some(program.clone()), + cycle: 0, + use_ipip: params.use_ipip, + events: BinaryHeap::new(), + queued: HashMap::new(), + rd_pdg: HashMap::new(), + wr_pdg: HashMap::new(), + cfg: op_cfg, + timeout: None, + waiting_for: WaitEvents::empty(), + } + } +} + +#[derive(Clone, Debug)] +pub struct Rtl(Vec); + +impl Rtl { + pub fn iter(&self) -> core::slice::Iter<'_, VarCell> { + self.0.iter() + } + + // Adds references from the root to leaf direction recursively + pub fn load(&mut self) { + self.iter().for_each(|v| v.load()); + } + + // Remove all loads either to move self into an iterator or to drop the + // whole tree + pub fn unload(&mut self) { + self.iter() + .filter_map(|v| v.copy_driver()) + .flat_map(|(d, _)| d.get_all_ops().into_iter()) + .for_each(|op| { + op.unload(); + }); + } + + #[allow(clippy::mutable_key_type)] + #[instrument(level = "trace")] + fn find_roots(from: &mut [VarCell]) -> HashSet { + let mut not_ready: HashSet = HashSet::new(); + let mut ready: HashSet = HashSet::new(); + let mut to_check: VecDeque = from + .iter() + .filter_map(|v| v.copy_driver().map(|(d, _)| d)) + .collect(); + let mut all: HashSet = HashSet::new(); + + while !to_check.is_empty() { + let op = 
to_check.pop_front().unwrap(); + + if !all.contains(&op) { + if op.is_ready() { + ready.insert(op.clone()); + } else { + not_ready.insert(op.clone()); + to_check.extend( + op.copy_src() + .into_iter() + .flat_map(|v| v.copy_driver().map(|(d, _)| d)), + ); + } + all.insert(op); + } + } + + ready.iter().for_each(|op| { + op.set_load_stats(op.compute_load_stats()); + }); + + ready + } + + #[instrument(level = "trace", skip(self, prog))] + pub fn raw_add(mut self, prog: &Program) -> (usize, Vec) { + self.load(); + + let mut arch = Arch::from(prog); + let mut todo: BinaryHeap<_> = Rtl::find_roots(&mut self.0).into_iter().collect(); + + self.write_dot(prog, 0); + + debug!( + "Running simulation for {:?}@{}bits", + prog.borrow().params.op_name, + prog.borrow().params.integer_w + ); + + trace!("todo: {:?}", &todo); + + while (!todo.is_empty()) || arch.busy() { + // Try to dispatch everything that is ready to be done + todo = arch.try_dispatch(todo); + trace!("todo: {:?}", &todo); + + if let Some(op) = arch.done() { + trace!("Removing {:?}", &op); + // Done is consumed here + let new = op.remove(); + trace!("new ready op {:?}", &new); + todo.extend(new.into_iter()); + self.write_dot(prog, arch.cycle()); + } + } + + debug!( + "arch report for {:?}@{}: {}, cycles estimate: {}", + prog.borrow().params.op_name, + prog.borrow().params.integer_w, + arch.report_usage(), + arch.cycle() + ); + + ( + arch.cycle(), + self.into_iter().map(|x| x.copy_meta().unwrap()).collect(), + ) + } + + #[instrument(level = "trace", skip(self, prog))] + pub fn add_to_prog(self, prog: &Program) -> Vec { + self.raw_add(prog).1 + } + + #[instrument(level = "trace", skip(self, prog))] + pub fn estimate(self, prog: &Program) -> usize { + self.raw_add(prog).0 + } +} + +impl std::ops::Add for Rtl { + type Output = Rtl; + fn add(self, rhs: Rtl) -> Self::Output { + self.into_iter().chain(rhs).collect::>().into() + } +} + +impl Drop for Rtl { + fn drop(&mut self) { + self.unload(); + } +} + +impl IntoIterator 
for Rtl { + type Item = VarCell; + type IntoIter = as IntoIterator>::IntoIter; + fn into_iter(mut self) -> Self::IntoIter { + self.unload(); + let mut vec = Vec::new(); + std::mem::swap(&mut self.0, &mut vec); + vec.into_iter() + } +} + +impl From> for Rtl { + fn from(value: Vec) -> Self { + Rtl(value) + } +} + +// {{{1 Debugging stuff +// ---------------------------------------------------------------------------- + +#[cfg(feature = "rtl_graph")] +use dot2; +#[cfg(feature = "rtl_graph")] +use std::borrow::Cow; + +impl Rtl { + #[cfg(feature = "rtl_graph")] + fn write_dot(&self, prog: &Program, cycle: usize) { + Graph::new( + prog.op_name().unwrap_or("default".into()), + prog.params().blk_w(), + cycle, + self.0 + .iter() + .filter_map(|v| v.copy_driver().and_then(|(d, _)| Some(d))) + .collect::>() + .as_slice(), + ) + .write(); + } + #[cfg(not(feature = "rtl_graph"))] + fn write_dot(&self, _prog: &Program, _: usize) {} +} + +#[cfg(feature = "rtl_graph")] +struct Graph { + name: String, + width: usize, + cycle: usize, + heads: HashSet, + nodes: HashSet, +} + +#[cfg(feature = "rtl_graph")] +use itertools::Itertools; +#[cfg(feature = "rtl_graph")] +use std::io::{Seek, Write}; +#[cfg(feature = "rtl_graph")] +impl Graph { + pub fn write(&self) { + let dir = format!("graph/{}/{}", self.width, self.name); + std::fs::DirBuilder::new() + .recursive(true) + .create(&dir) + .unwrap(); + let mut fid = std::fs::File::create(format!("{}/cycle{}.dot", &dir, self.cycle)).unwrap(); + dot2::render(self, &mut fid).unwrap(); + // Append rank information + fid.seek_relative(-2).expect("Seek failed"); + let head_str = self + .heads + .iter() + .map(|x| format!("N{}", x.copy_uid())) + .join(";"); + writeln!(fid, "{{ rank=same; {} }}\n}}", head_str).expect("Write failed"); + } + + pub fn get_nodes(roots: &[OperationCell]) -> HashSet { + roots + .iter() + .map(|g| g.get_all_ops().into_iter()) + .flatten() + .collect() + } + + pub fn get_heads(roots: &[OperationCell]) -> HashSet { + roots + 
.iter() + .flat_map(|g| g.get_heads().into_iter()) + .collect() + } + + pub fn new(name: String, width: usize, cycle: usize, roots: &[OperationCell]) -> Graph { + Graph { + name, + width, + cycle, + heads: Graph::get_heads(roots), + nodes: Graph::get_nodes(roots), + } + } +} + +#[cfg(feature = "rtl_graph")] +#[derive(Debug, Hash, Clone, PartialEq, Eq)] +struct GraphEdge { + from: OperationCell, + to: OperationCell, + port_id: usize, + loads: usize, + uid: usize, +} + +// Dot2 implementation for Operation +#[cfg(feature = "rtl_graph")] +impl<'a> dot2::Labeller<'a> for Graph { + type Node = OperationCell; + type Edge = GraphEdge; + type Subgraph = (); + + fn graph_id(&'a self) -> dot2::Result> { + dot2::Id::new(format!("RTL{}", self.cycle)) + } + + fn node_id(&'a self, n: &Self::Node) -> dot2::Result> { + dot2::Id::new(format!("N{}", n.borrow().uid())) + } + + fn node_label<'b>(&'b self, n: &Self::Node) -> dot2::Result> { + Ok(dot2::label::Text::LabelStr(Cow::from(String::from( + format!( + "{}(load={},uid={},dst_used={})", + n.copy_name(), + n.borrow() + .load_stats() + .and_then(|l| Some(format!("{:?}", l))) + .unwrap_or(String::from("None")), + n.copy_uid(), + n.borrow().dst().iter().filter_map(|d| Some(d)).count() + ), + )))) + } + + fn edge_label<'b>(&'b self, e: &Self::Edge) -> dot2::label::Text<'b> { + dot2::label::Text::LabelStr(format!("{}[{},{}]", e.uid, e.port_id, e.loads).into()) + } +} + +#[cfg(feature = "rtl_graph")] +impl<'a> dot2::GraphWalk<'a> for Graph { + type Node = OperationCell; + type Edge = GraphEdge; + type Subgraph = (); + + fn nodes(&self) -> dot2::Nodes<'a, Self::Node> { + self.nodes.iter().map(|x| x.clone()).collect() + } + + fn edges(&'a self) -> dot2::Edges<'a, Self::Edge> { + let hash_set: HashSet = self + .nodes + .iter() + .map(|g| { + g.copy_src() + .into_iter() + .filter_map(move |v| { + v.copy_driver().and_then(|(d, port_id)| { + Some((v.copy_uid(), g, d, port_id, v.copy_loads().len())) + }) + }) + .map(|(uid, g, d, port_id, 
loads)| GraphEdge { + from: d.clone(), + to: g.clone(), + uid, + port_id, + loads, + }) + }) + .flatten() + .filter(|x| self.nodes.contains(&x.from) && self.nodes.contains(&x.to)) + .collect(); + hash_set.into_iter().collect() + } + + fn source(&self, e: &Self::Edge) -> Self::Node { + e.from.clone() + } + + fn target(&self, e: &Self::Edge) -> Self::Node { + e.to.clone() + } +} + +impl Debug for Var { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.debug_struct("Var") + .field("uid", &self.uid) + .field("meta", &self.meta.as_ref()) + .field("loads", &self.loads.len()) + .finish() + } +} + +impl Debug for VarCell { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + self.borrow().fmt(f) + } +} + +// ---------------------------------------------------------------------------- +// }}} + +// vim: foldmethod=marker diff --git a/backends/tfhe-hpu-backend/src/interface/backend.rs b/backends/tfhe-hpu-backend/src/interface/backend.rs new file mode 100644 index 000000000..ad38a6ec7 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/backend.rs @@ -0,0 +1,932 @@ +/// Implement inner-view of Hpu backend +use super::*; +use crate::asm::PbsLut; +use crate::entities::*; +use crate::fw::isc_sim::PeConfigStore; +use crate::fw::{Fw, FwParameters}; +use crate::{asm, ffi}; +use rtl::FromRtl; + +use itertools::Itertools; +use std::collections::VecDeque; +use std::str::FromStr; +use std::sync::{mpsc, Arc, Mutex}; +use strum::VariantNames; + +use tracing::{debug, info, trace}; + +use rayon::prelude::*; + +pub struct HpuBackend { + // Low-level hardware handling + hpu_hw: ffi::HpuHw, + regmap: hw_regmap::FlatRegmap, + + // Extracted parameters + pub(crate) params: HpuParameters, + // Prevent to parse regmap at each polling iteration + #[cfg(not(feature = "hw-v80"))] + workq_addr: u64, + #[cfg(not(feature = "hw-v80"))] + ackq_addr: u64, + + // Key memory + bsk_key: memory::HugeMemory, + ksk_key: memory::HugeMemory, + + // Lut and Fw 
memory + lut_mem: memory::HugeMemory, + fw_mem: memory::HugeMemory, + init_fw_width: Vec, + + // Memory management + // Board memory is abstract as a bunch of ciphertext slot + // Used a dedicaed manager to handle lifetime of used slot + pub(crate) ct_mem: memory::CiphertextMemory, + + // HW Trace cut + trace_mem: memory::HugeMemory, + + // Work management + // Keep track of issued IOp and associated variables + cmd_q: VecDeque, + cmd_rx: mpsc::Receiver, +} + +pub struct HpuBackendLock(Mutex); + +impl HpuBackendLock { + fn new(inner: HpuBackend) -> Self { + Self(Mutex::new(inner)) + } +} +impl std::ops::Deref for HpuBackendLock { + type Target = Mutex; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} +unsafe impl Send for HpuBackendLock {} +unsafe impl Sync for HpuBackendLock {} + +#[derive(Clone)] +pub struct HpuBackendWrapped(Arc); + +impl HpuBackendWrapped { + pub fn new_wrapped(config: &config::HpuConfig) -> (Self, mpsc::Sender) { + let (be, cmd_api) = HpuBackend::new(config); + (Self(Arc::new(HpuBackendLock::new(be))), cmd_api) + } +} +impl std::ops::Deref for HpuBackendWrapped { + type Target = Arc; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} +unsafe impl Send for HpuBackendWrapped {} +unsafe impl Sync for HpuBackendWrapped {} + +/// Handle HpuBackend construction and initialisation +impl HpuBackend { + pub fn new(config: &config::HpuConfig) -> (Self, mpsc::Sender) { + let mut hpu_hw = ffi::HpuHw::new_hpu_hw( + &config.fpga.ffi, + std::time::Duration::from_micros(config.fpga.polling_us), + ); + let regmap_expanded = config + .fpga + .regmap + .iter() + .map(|f| f.expand()) + .collect::>(); + let regmap_str = regmap_expanded + .iter() + .map(|f| f.as_str()) + .collect::>(); + let regmap = hw_regmap::FlatRegmap::from_file(®map_str); + let mut params = HpuParameters::from_rtl(&mut hpu_hw, ®map); + + // In case this is not filled by from_rtl() + if params.ntt_params.min_pbs_nb.is_none() { + params.ntt_params.min_pbs_nb = 
Some(config.firmware.min_batch_size); + } + + // Init on-board memory + hpu_hw.init_mem(config, ¶ms); + + // Flush ack_q + // Ensure that no residue from previous execution were stall in the pipe + #[cfg(feature = "hw-v80")] + { + // TODO add ack flush to prevent error with previous stall execution + } + #[cfg(not(feature = "hw-v80"))] + { + let ackq_addr = (*regmap + .register() + .get("WorkAck::ackq") + .expect("Unknown register, check regmap definition") + .offset()) as u64; + loop { + let ack_code = hpu_hw.read_reg(ackq_addr); + if ack_code == ACKQ_EMPTY { + break; + } + } + } + + // Apply Rtl configuration + // Bpip use + let bpip_use_reg = regmap + .register() + .get("bpip::use") + .expect("Unknown register, check regmap definition"); + hpu_hw.write_reg( + *bpip_use_reg.offset() as u64, + bpip_use_reg.from_field( + [ + ("use_bpip", config.rtl.bpip_use as u32), + ("use_opportunism", config.rtl.bpip_use_opportunism as u32), + ] + .into(), + ), + ); + + // Bpip timeout + hpu_hw.write_reg( + *regmap + .register() + .get("bpip::timeout") + .expect("Unknown register, check regmap definition") + .offset() as u64, + config.rtl.bpip_timeout, + ); + + info!("{params:?}"); + debug!( + "Isc registers {:?}", + rtl::runtime::InfoIsc::from_rtl(&mut hpu_hw, ®map) + ); + debug!( + "PeMem registers {:?}", + rtl::runtime::InfoPeMem::from_rtl(&mut hpu_hw, ®map) + ); + debug!( + "PeAlu registers {:?}", + rtl::runtime::InfoPeAlu::from_rtl(&mut hpu_hw, ®map) + ); + debug!( + "PePbs registers {:?}", + rtl::runtime::InfoPePbs::from_rtl(&mut hpu_hw, ®map) + ); + + #[cfg(not(feature = "hw-v80"))] + let workq_addr = (*regmap + .register() + .get("WorkAck::workq") + .expect("Unknown register, check regmap definition") + .offset()) as u64; + #[cfg(not(feature = "hw-v80"))] + let ackq_addr = (*regmap + .register() + .get("WorkAck::ackq") + .expect("Unknown register, check regmap definition") + .offset()) as u64; + + // Allocate memory for Bsk + let bsk_props = { + let bsk_pc = 
¶ms.pc_params.bsk_pc; + let bsk_size = hpu_lwe_bootstrap_key_size(¶ms); + + let cut_coefs = bsk_size.div_ceil(*bsk_pc); + let mem_cut = config + .board + .bsk_pc + .clone() + .into_iter() + .take(*bsk_pc) + .collect::>(); + memory::HugeMemoryProperties { mem_cut, cut_coefs } + }; + let bsk_key = memory::HugeMemory::alloc(&mut hpu_hw, bsk_props); + + // Allocate memory for Ksk + let ksk_props = { + let ksk_pc = ¶ms.pc_params.ksk_pc; + let ksk_size = hpu_lwe_keyswitch_key_size(¶ms); + + let cut_coefs = ksk_size.div_ceil(*ksk_pc); + let mem_cut = config + .board + .ksk_pc + .clone() + .into_iter() + .take(*ksk_pc) + .collect::>(); + + memory::HugeMemoryProperties { mem_cut, cut_coefs } + }; + let ksk_key = memory::HugeMemory::alloc(&mut hpu_hw, ksk_props); + + // Allocate memory for GlweLut + let lut_props = memory::HugeMemoryProperties { + mem_cut: vec![config.board.lut_pc], + cut_coefs: config.board.lut_mem * params.pbs_params.polynomial_size, + }; + let lut_mem = memory::HugeMemory::alloc(&mut hpu_hw, lut_props); + + // Allocate memory for Fw translation table + let fw_props = memory::HugeMemoryProperties { + mem_cut: vec![config.board.fw_pc], + cut_coefs: config.board.fw_size, // NB: here `size` is used as raw size (!= slot nb) + }; + let fw_mem = memory::HugeMemory::alloc(&mut hpu_hw, fw_props); + + // Allocate memory pool for Ct + // NB: Compute size of each cut. + // Cut are 4k aligned -> One cut match with page boundary but the second one (with body + // extra coefs) crossed it => Use an extra page in both to have same addr incr (and + // match Rtl behavior) + let cut_size_b = memory::page_align( + hpu_big_lwe_ciphertext_size(¶ms).div_ceil(params.pc_params.pem_pc) + * std::mem::size_of::(), + ); + let ct_props = memory::CiphertextMemoryProperties { + mem_cut: config.board.ct_pc.clone(), + // NB: Xrt only support page align memory allocation. 
Thus we round cut coefs to + // match the next 4k page boundary + cut_size_b, + slot_nb: config.board.ct_mem, + used_as_heap: config.board.heap_size, + retry_rate_us: config.fpga.polling_us, + }; + debug!("Ct_mem properties -> {:?}", ct_props); + let ct_mem = memory::CiphertextMemory::alloc(&mut hpu_hw, ®map, &ct_props); + + // load trace ptr from config (size does not matter so putting 256) + let trace_props = memory::HugeMemoryProperties { + mem_cut: vec![config.board.trace_pc], + cut_coefs: 256, + }; + let trace_mem = memory::HugeMemory::alloc(&mut hpu_hw, trace_props); + + // Construct channel for mt API + // Keep track of the sender for clone it later on + let (cmd_tx, cmd_rx) = mpsc::channel(); + + ( + Self { + hpu_hw, + regmap, + params, + #[cfg(not(feature = "hw-v80"))] + workq_addr, + #[cfg(not(feature = "hw-v80"))] + ackq_addr, + bsk_key, + ksk_key, + lut_mem, + fw_mem, + init_fw_width: Vec::new(), + ct_mem, + trace_mem, + cmd_q: VecDeque::new(), + cmd_rx, + }, + cmd_tx, + ) + } +} + +/// Bootstrapping Key handling +/// Only here to expose function to the user. Associated logic is handled by the backend +impl HpuBackend { + #[tracing::instrument(level = "debug", skip(self), ret)] + pub fn bsk_unset(&mut self) { + let Self { + ref mut hpu_hw, + regmap, + .. + } = self; + + // Extract register from regmap + let bsk_avail = regmap + .register() + .get("bsk_avail::avail") + .expect("Unknown register, check regmap definition"); + let bsk_reset = regmap + .register() + .get("bsk_avail::reset") + .expect("Unknown register, check regmap definition"); + + // Cache reset procedure + // 1. Wait for end of batch process (WARN: Not handled by this function) + // 2. Set bit reset_cache = 1 + // 3. Set bit key_avail = 0 + // 4. Wait for reset_cache_done = 1 + // 5. 
Set bit reset_cache = 0, set reset_cache_done = 0 + // -> Design is ready to receive a new key + hpu_hw.write_reg(*bsk_reset.offset() as u64, 0x1); + hpu_hw.write_reg(*bsk_avail.offset() as u64, 0x0); + loop { + let done = { + let val = hpu_hw.read_reg(*bsk_reset.offset() as u64); + let fields = bsk_reset.as_field(val); + + *fields.get("done").expect("Unknown field") != 0 + }; + if done { + break; + } + } + + hpu_hw.write_reg(*bsk_reset.offset() as u64, 0x0); + } + + #[tracing::instrument(level = "debug", skip(self, bsk), ret)] + pub fn bsk_set(&mut self, bsk: HpuLweBootstrapKeyOwned) { + let Self { + ref mut hpu_hw, + regmap, + params, + bsk_key, + .. + } = self; + + // Extract register from regmap + let bsk_avail = regmap + .register() + .get("bsk_avail::avail") + .expect("Unknown register, check regmap definition"); + let bsk_addr_pc = (0..params.pc_params.bsk_pc) + .map(|idx| { + let lsb_name = format!("hbm_axi4_addr_3in3::bsk_pc{idx}_lsb"); + let msb_name = format!("hbm_axi4_addr_3in3::bsk_pc{idx}_msb"); + let lsb = regmap + .register() + .get(&lsb_name) + .expect("Unknown register, check regmap definition"); + let msb = regmap + .register() + .get(&msb_name) + .expect("Unknown register, check regmap definition"); + (lsb, msb) + }) + .collect::>(); + + // Write key in associated buffer + for (id, bsk_cut) in bsk.as_view().into_container().into_iter().enumerate() { + bsk_key.write_cut_at(id, 0, bsk_cut); + #[cfg(feature = "io-dump")] + io_dump::dump( + bsk_cut, + params, + io_dump::DumpKind::Bsk, + io_dump::DumpId::Key(id), + ); + } + + // Write pc_addr in memory + for (addr, (lsb, msb)) in std::iter::zip(bsk_key.cut_paddr().iter(), bsk_addr_pc.iter()) { + hpu_hw.write_reg( + *msb.offset() as u64, + ((addr >> u32::BITS) & (u32::MAX) as u64) as u32, + ); + hpu_hw.write_reg(*lsb.offset() as u64, (addr & (u32::MAX as u64)) as u32); + } + + // Toggle avail bit + hpu_hw.write_reg(*bsk_avail.offset() as u64, 0x1); + } + + #[tracing::instrument(level = "debug", 
skip(self), ret)] + pub fn bsk_is_set(&self) -> bool { + let Self { hpu_hw, regmap, .. } = self; + + // Extract register from regmap + let bsk_avail = regmap + .register() + .get("bsk_avail::avail") + .expect("Unknown register, check regmap definition"); + + let val = hpu_hw.read_reg(*bsk_avail.offset() as u64); + let fields = bsk_avail.as_field(val); + + *fields.get("avail").expect("Unknown field") != 0 + } +} + +/// KeyswitchKey handling +/// Only here to expose function to the user. Associated logic is handled by the backend +impl HpuBackend { + #[tracing::instrument(level = "debug", skip(self), ret)] + pub fn ksk_unset(&mut self) { + let Self { + ref mut hpu_hw, + regmap, + .. + } = self; + + // Extract register from regmap + let ksk_avail = regmap + .register() + .get("ksk_avail::avail") + .expect("Unknown register, check regmap definition"); + let ksk_reset = regmap + .register() + .get("ksk_avail::reset") + .expect("Unknown register, check regmap definition"); + + // Cache reset procedure + // 1. Wait for end of batch process (WARN: Not handled by this function) + // 2. Set bit reset_cache = 1 + // 3. Set bit key_avail = 0 + // 4. Wait for reset_cache_done = 1 + // 5. Set bit reset_cache = 0, set reset_cache_done = 0 + // -> Design is ready to receive a new key + hpu_hw.write_reg(*ksk_reset.offset() as u64, 0x1); + hpu_hw.write_reg(*ksk_avail.offset() as u64, 0x0); + loop { + let done = { + let val = hpu_hw.read_reg(*ksk_reset.offset() as u64); + let fields = ksk_reset.as_field(val); + + *fields.get("done").expect("Unknown field") != 0 + }; + if done { + break; + } + } + + hpu_hw.write_reg(*ksk_reset.offset() as u64, 0x0); + } + #[tracing::instrument(level = "debug", skip(self, ksk), ret)] + pub fn ksk_set(&mut self, ksk: HpuLweKeyswitchKeyOwned) { + let Self { + ref mut hpu_hw, + regmap, + params, + ksk_key, + .. 
+ } = self; + + // Extract register from regmap + let ksk_avail = regmap + .register() + .get("ksk_avail::avail") + .expect("Unknown register, check regmap definition"); + let ksk_addr_pc = (0..params.pc_params.ksk_pc) + .map(|idx| { + let lsb_name = format!("hbm_axi4_addr_1in3::ksk_pc{idx}_lsb"); + let msb_name = format!("hbm_axi4_addr_1in3::ksk_pc{idx}_msb"); + let lsb = regmap + .register() + .get(&lsb_name) + .expect("Unknown register, check regmap definition"); + let msb = regmap + .register() + .get(&msb_name) + .expect("Unknown register, check regmap definition"); + (lsb, msb) + }) + .collect::>(); + + // Write key in associated buffer + for (id, ksk_cut) in ksk.as_view().into_container().into_iter().enumerate() { + ksk_key.write_cut_at(id, 0, ksk_cut); + #[cfg(feature = "io-dump")] + io_dump::dump( + ksk_cut, + params, + io_dump::DumpKind::Ksk, + io_dump::DumpId::Key(id), + ); + } + + // Write pc_addr in memory + for (addr, (lsb, msb)) in std::iter::zip(ksk_key.cut_paddr().iter(), ksk_addr_pc.iter()) { + hpu_hw.write_reg( + *msb.offset() as u64, + ((addr >> u32::BITS) & (u32::MAX) as u64) as u32, + ); + hpu_hw.write_reg(*lsb.offset() as u64, (addr & (u32::MAX as u64)) as u32); + } + + // Toggle avail bit + hpu_hw.write_reg(*ksk_avail.offset() as u64, 0x1); + } + + #[tracing::instrument(level = "debug", skip(self), ret)] + pub fn ksk_is_set(&self) -> bool { + let Self { hpu_hw, regmap, .. 
} = self; + + // Extract register from regmap + let ksk_avail = regmap + .register() + .get("ksk_avail::avail") + .expect("Unknown register, check regmap definition"); + + let val = hpu_hw.read_reg(*ksk_avail.offset() as u64); + let fields = ksk_avail.as_field(val); + + *fields.get("avail").expect("Unknown field") != 0 + } +} + +/// Handle Glwe Lut initialisation +/// Lut and Fw are merged since +impl HpuBackend { + #[tracing::instrument(level = "debug", skip(self, gen_lut), ret)] + pub(crate) fn lut_init(&mut self, gen_lut: F) + where + F: Fn(HpuParameters, &asm::Pbs) -> HpuGlweLookuptableOwned, + { + let Self { + ref mut hpu_hw, + regmap, + params, + lut_mem, + .. + } = self; + + // Iterate over HwHpu::PbsLut + // Construct them with associated parameters set + // And upload them in memory + for lut_impl in asm::Pbs::list_all() { + let lut_gid = lut_impl.gid().0 as usize; + + // Write it in on-board memory + // Lut are encoded as trivial ciphertext. + // Thus to prevent useless memory xfer, only the Body polynomial is uploaded on Hw + let hpu_lut = gen_lut(params.clone(), &lut_impl); + + // NB: lut_mem are always on 1cut + let ofst = lut_gid * params.pbs_params.polynomial_size; + lut_mem.write_cut_at(0, ofst, hpu_lut.as_view().into_container()); + #[cfg(feature = "io-dump")] + io_dump::dump( + hpu_lut.as_ref(), + params, + io_dump::DumpKind::Glwe, + io_dump::DumpId::Lut(lut_gid), + ); + } + + // Configure Hpu register accordingly + // Extract register from regmap + let reg_lsb = regmap + .register() + .get("hbm_axi4_addr_1in3::glwe_pc0_lsb") + .expect("Unknown register, check regmap definition"); + let reg_msb = regmap + .register() + .get("hbm_axi4_addr_1in3::glwe_pc0_msb") + .expect("Unknown register, check regmap definition"); + + let lut_addr = lut_mem.cut_paddr()[0]; + hpu_hw.write_reg( + *reg_msb.offset() as u64, + ((lut_addr >> u32::BITS) & (u32::MAX) as u64) as u32, + ); + hpu_hw.write_reg( + *reg_lsb.offset() as u64, + (lut_addr & (u32::MAX as u64)) as 
u32, + ); + } +} + +/// HW trace initialisation +impl HpuBackend { + #[tracing::instrument(level = "debug", skip(self), ret)] + pub(crate) fn trace_init(&mut self) { + let Self { + ref mut hpu_hw, + regmap, + trace_mem, + .. + } = self; + + // Configure Hpu register accordingly + // Extract register from regmap + let reg_lsb = regmap + .register() + .get("hbm_axi4_addr_1in3::trc_pc0_lsb") + .expect("Unknown register, check regmap definition"); + let reg_msb = regmap + .register() + .get("hbm_axi4_addr_1in3::trc_pc0_msb") + .expect("Unknown register, check regmap definition"); + + let trace_addr = trace_mem.cut_paddr()[0]; + hpu_hw.write_reg( + *reg_msb.offset() as u64, + ((trace_addr >> u32::BITS) & (u32::MAX) as u64) as u32, + ); + hpu_hw.write_reg( + *reg_lsb.offset() as u64, + (trace_addr & (u32::MAX as u64)) as u32, + ); + } +} + +/// Handle Fw Lut and translation table init +impl HpuBackend { + #[tracing::instrument(skip(self, config))] + pub(crate) fn fw_init(&mut self, config: &config::HpuConfig) { + // Create Asm architecture properties and Fw instantiation + let pe_cfg = PeConfigStore::from((&self.params, config)); + let fw_name = + crate::fw::FwName::from_str(&config.firmware.implementation).unwrap_or_else(|_| { + panic!( + "Unknown firmware name {}, list of possible firmware names: {}", + config.firmware.implementation, + crate::fw::AvlblFw::VARIANTS.iter().join(",") + ); + }); + let fw = crate::fw::AvlblFw::new(&fw_name); + + // TODO Add RTL register for the nu value + let mut fw_params = FwParameters { + register: self.params.regf_params.reg_nb, + isc_depth: self.params.isc_params.depth, + heap_size: config.board.heap_size, + min_iop_size: self.params.isc_params.min_iop_size, + min_pbs_batch_w: self + .params + .ntt_params + .min_pbs_nb + .unwrap_or(self.params.ntt_params.batch_pbs_nb), + pbs_batch_w: self.params.ntt_params.batch_pbs_nb, + total_pbs_nb: self.params.ntt_params.total_pbs_nb, + msg_w: self.params.pbs_params.message_width, + carry_w: 
self.params.pbs_params.carry_width, + nu: 5, + integer_w: 0, + use_ipip: !config.rtl.bpip_use, + kogge_cfg: config.firmware.kogge_cfg.expand(), + op_cfg: config.firmware.op_cfg.clone(), + cur_op_cfg: config.firmware.op_cfg.default(), + pe_cfg, + op_name: None, + }; + + // Check that required number of integer_w don't overflow the lookup table space + let integer_w_max = config.firmware.integer_w.iter().max().unwrap_or(&0); + let blk_w_max = integer_w_max / fw_params.msg_w; + assert!( + blk_w_max < FW_TABLE_ENTRY, + "ERROR: requested {} fw configuration but current implementation only support {} entries", + config.firmware.integer_w.len(), + FW_TABLE_ENTRY + ); + let mut tr_table_ofst = FW_TABLE_ENTRY * 0x100; // Opcode is 8bit -> 256 words entry + let cut_ofst = self.fw_mem.cut_paddr()[0] as usize; + + for integer_w in config.firmware.integer_w.iter() { + // Update fw parameters with concrete integer_width + assert_eq!( + integer_w % fw_params.msg_w, + 0, + "ERROR: requested integer_w {integer_w} isn't compliant with MSG_W {}", + fw_params.msg_w + ); + let blk_w = integer_w / fw_params.msg_w; + fw_params.integer_w = *integer_w; + + // Generate Fw for standard operation + // -> All operation with an associated alias + let mut id_fw = asm::iop::IOP_LIST + .par_iter() + .map(|iop| { + let opcode = iop.opcode(); + let prog = fw.expand(&fw_params, iop); + (opcode.0 as usize, prog.tr_table()) + }) + .collect::>(); + + // Load custom IOp from file + for (name, asm_file) in config.firmware.custom_iop.iter() { + let iop = asm::AsmIOpcode::from_str(name) + .unwrap_or_else(|_| panic!("Invalid Custom Iop name {name}")); + let opcode = iop.opcode(); + let prog = asm::Program::::read_asm(&asm_file.expand()) + .unwrap_or_else(|_| panic!("Invalid custom_iop file {}", asm_file.expand())); + id_fw.push((opcode.0 as usize, prog.tr_table())); + } + + // Sanity check + let sync_opcode = asm::dop::DOpSync::opcode(); + for (id, fw_bytes) in id_fw.iter() { + // All IOp entry must be gte 
(MIN_IOP_SIZE-1) + // NB fw_bytes contain size + DOps -> gte MIN_IOP_SIZE + assert!( + fw_bytes.len() >= self.params.isc_params.min_iop_size, + "Error: IOp {id} is too short and could lead to sync_id overflow" + ); + // All IOp mustn't contain SYNC token + let mut sync_dop = fw_bytes + .iter() + .filter(|w| (((*w >> 24) & 0xff) as u8) == sync_opcode) + .peekable(); + assert!( + sync_dop.peek().is_none(), + "Error: IOp[0x{id:x}] contain SYNC. This break the min_iop_size requirement and + could lead to sync_id overflow" + ); + } + + // Sort by opcode and write Lut and translation table into memory + // NB: ucore is a 32b cpu => addr-lut/ translation word must be 32b word + id_fw.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + let blk_ofst = (blk_w - 1) * 0x100; // Opcode is 8bit -> 256 words per blk_w + + // Default tr_lut with fallback entry + // Uninit entries point to fist tr-table entry + let mut tr_lut = + vec![(cut_ofst + (tr_table_ofst * std::mem::size_of::())) as u32; 256]; + + for (id, fw_bytes) in id_fw.into_iter() { + // Store lookup addr + // NB: ucore expect addr with physical memory offset + // NB': ucore understand lut entry as ofst from PHYS_MEM => don't add cut_ofst in + // the entry + let byte_ofst = /* cut_ofst + */(tr_table_ofst * std::mem::size_of::()) as u32; + tr_lut[id] = byte_ofst; + + // Write tr-table + let fw_words = bytemuck::cast_slice::<_, u32>(fw_bytes.as_slice()); + self.fw_mem.write_cut_at(0, tr_table_ofst, fw_words); + tracing::debug!( + "Opcode::{id:x}[{} dops] @{tr_table_ofst:x} [{byte_ofst:x}]", + fw_words.len() + ); + tracing::trace!("TrTable::{fw_words:x?}"); + tr_table_ofst += fw_words.len(); + } + // Write lookup table all at once + self.fw_mem.write_cut_at(0, blk_ofst, tr_lut.as_slice()); + tracing::debug!( + "Fw[{blk_w}]:: lut entry @{blk_ofst:x} [{:x}]", + blk_ofst * std::mem::size_of::() + ); + tracing::trace!(" LutTable=> {tr_lut:x?}"); + + // Update init_fw_width list enable to runtime check + 
self.init_fw_width.push(*integer_w); + } + } +} + +impl HpuBackend { + #[tracing::instrument(skip(self, cmd))] + fn workq_push(&mut self, cmd: cmd::HpuCmd) -> Result<(), HpuInternalError> { + let Self { + ref mut hpu_hw, + #[cfg(not(feature = "hw-v80"))] + workq_addr, + cmd_q, + .. + } = self; + + // Check if issued command + // NB: fw_blk_width is 0 encoded => 0 ~ 1 block ciphertext + assert!( + self.init_fw_width.contains(&((cmd.op.fw_blk_width()+1)*self.params.pbs_params.message_width) + ), + "Requested integer width {:?} isn't configured in [Hpu: {:?}] and could lead to Undefined Behavior. Please check Hpu configuration file.", + (cmd.op.fw_blk_width()+1) * self.params.pbs_params.message_width, + self.init_fw_width + ); + + // Steps are as follow + // 1. Enforce that source ops are synced on Hw + cmd.src + .iter() + .map(|src| src.inner.lock().unwrap().try_hpu_sync()) + .collect::, _>>()?; + + // 2. Issue work to Hpu through workq + // Convert Iop in a stream of bytes + let op_words = cmd.op.to_words(); + tracing::debug!("Op Asm {}", cmd.op); + tracing::trace!("Op Words {:x?}", op_words); + + // Write them in workq entry + // NB: No queue full check was done ... + #[cfg(feature = "hw-v80")] + { + hpu_hw.iop_push(op_words.as_slice()); + } + #[cfg(not(feature = "hw-v80"))] + { + for w in op_words.iter() { + hpu_hw.write_reg(*workq_addr, *w); + } + } + + // Keep track of op in cmd_q for lifetime tracking + cmd_q.push_back(cmd); + + Ok(()) + } + + /// Poll ack_q + /// When ack received pop entry in cmd_q and update variable accordingly + #[tracing::instrument(level = "debug", skip(self))] + pub fn poll_ack_q(&mut self) -> Result { + let Self { + ref mut hpu_hw, + #[cfg(not(feature = "hw-v80"))] + ackq_addr, + cmd_q, + regmap, + .. 
+ } = self; + + trace!( + "Isc registers {:?}", + rtl::runtime::InfoIsc::from_rtl(hpu_hw, regmap) + ); + trace!( + "PeMem registers {:?}", + rtl::runtime::InfoPeMem::from_rtl(hpu_hw, regmap) + ); + trace!( + "PeAlu registers {:?}", + rtl::runtime::InfoPeAlu::from_rtl(hpu_hw, regmap) + ); + trace!( + "PePbs registers {:?}", + rtl::runtime::InfoPePbs::from_rtl(hpu_hw, regmap) + ); + + #[cfg(feature = "hw-v80")] + { + let ack_nb = hpu_hw.iop_ack_rd(); + if ack_nb == 0 { + Ok(false) + } else { + tracing::debug!("Received ack {ack_nb} IOp ack. Pending cmd {}", cmd_q.len()); + for _ack in 0..ack_nb { + let ack_cmd = cmd_q.pop_front().unwrap(); + // TODO check that ack_code match with expected op msb + tracing::debug!("Received ack for IOp {}", ack_cmd.op); + // update dst state and drop srcs ref + ack_cmd + .dst + .iter() + .for_each(|dst| dst.inner.lock().unwrap().operation_done()); + } + Ok(true) + } + } + #[cfg(not(feature = "hw-v80"))] + { + let ack_code = hpu_hw.read_reg(*ackq_addr); + if ack_code != ACKQ_EMPTY { + let ack_cmd = cmd_q.pop_front().unwrap(); + // TODO check that ack_code match with expected op msb + tracing::debug!("Received ack {:x} for IOp {}", ack_code, ack_cmd.op); + // update dst state and drop srcs ref + ack_cmd + .dst + .iter() + .for_each(|dst| dst.inner.lock().unwrap().operation_done()); + Ok(true) + } else { + Ok(false) + } + } + } +} + +impl HpuBackend { + /// This function flush all pending cmd + pub(crate) fn flush_workq(&mut self) -> Result<(), HpuInternalError> { + while let Ok(cmd) = self.cmd_rx.try_recv() { + self.workq_push(cmd)?; + } + Ok(()) + } + /// This function flush all pending ack + pub(crate) fn flush_ackq(&mut self) -> Result<(), HpuInternalError> { + while self.poll_ack_q()? 
{} + Ok(()) + } +} + +impl Drop for HpuBackend { + fn drop(&mut self) { + // Release ffi allocated memory + // Couldn't rely on Drop trait of inner objects since it required reference to associated + // ffi backend + self.bsk_key.release(&mut self.hpu_hw); + self.ksk_key.release(&mut self.hpu_hw); + self.lut_mem.release(&mut self.hpu_hw); + self.fw_mem.release(&mut self.hpu_hw); + self.ct_mem.release(&mut self.hpu_hw); + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/cmd.rs b/backends/tfhe-hpu-backend/src/interface/cmd.rs new file mode 100644 index 000000000..7be7ee499 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/cmd.rs @@ -0,0 +1,153 @@ +/// +/// Help with IOp management over HPU +/// Track IOp status and handle backward update of associated HpuVariable +use super::*; +use crate::asm::iop::{Immediat, Operand, OperandKind}; +use crate::asm::{IOp, IOpcode}; +use variable::HpuVarWrapped; + +/// Underlying type used for Immediat value; +pub type HpuImm = u128; + +/// Structure that hold an IOp with there associated operands +/// Wrap operands memory with the IOp for proper lifetime management +pub struct HpuCmd { + pub(crate) op: IOp, + pub(crate) dst: Vec, + pub(crate) src: Vec, + // NB: No need to track Immediat lifetime. 
It's simply constant completely held by the IOp + // definition +} + +impl HpuCmd { + pub fn new( + opcode: IOpcode, + dst: &[HpuVarWrapped], + src: &[HpuVarWrapped], + imm: &[HpuImm], + ) -> Self { + // TODO Check that dst/rhs_x backend match + // Check arguments compliance with IOp prototype if any + #[cfg(debug_assertions)] + if let Some(format) = crate::asm::iop::IOP_LUT.hex.get(&opcode) { + assert_eq!( + dst.len(), + format.proto.dst.len(), + "Error {}: Invalid number of dst arguments", + format.name + ); + assert_eq!( + src.len(), + format.proto.src.len(), + "Error {}: Invalid number of dst arguments", + format.name + ); + assert_eq!( + imm.len(), + format.proto.imm, + "Error {}: Invalid number of dst arguments", + format.name + ); + } + + // Extract Operands definition from HpuVar + let dst_op = dst + .iter() + .map(|var| { + Operand::new( + var.width as u8, + var.id.0 as u16, + 1, /* TODO handle vec source !? */ + Some(OperandKind::Dst), + ) + }) + .collect::>(); + let src_op = src + .iter() + .map(|var| { + Operand::new( + var.width as u8, + var.id.0 as u16, + 1, /* TODO handle vec source !? 
*/ + Some(OperandKind::Src), + ) + }) + .collect::>(); + let imm_op = imm + .iter() + .map(|var| Immediat::from_cst(*var)) + .collect::>(); + + let op = IOp::new(opcode, dst_op, src_op, imm_op); + // TODO set op_width + + let dst = dst + .iter() + .map(|var| { + // Update dst state to OpPending + var.inner.lock().unwrap().operation_pending(); + (*var).clone() + }) + .collect::>(); + let src = src.iter().map(|var| (*var).clone()).collect::>(); + Self { op, dst, src } + } + + pub fn op(&self) -> &IOp { + &self.op + } +} + +/// Generic interface +impl HpuCmd { + pub fn exec_raw( + opcode: crate::asm::IOpcode, + dst: &[HpuVarWrapped], + rhs_ct: &[HpuVarWrapped], + rhs_imm: &[HpuImm], + ) { + // Create associated command + let cmd = Self::new(opcode, dst, rhs_ct, rhs_imm); + // Issue it on Hpubackend + dst.first() + .expect("Try to generate an IOp without any destination") + .cmd_api + .send(cmd) + .expect("Issue with cmd_api"); + } + + // TODO add more runtime check on prototype ? + pub fn exec( + proto: &crate::asm::iop::IOpProto, + opcode: crate::asm::IOpcode, + rhs_ct: &[HpuVarWrapped], + rhs_imm: &[HpuImm], + ) -> Vec { + let dst = proto + .dst + .iter() + .map(|m| rhs_ct[0].fork(*m)) + .collect::>(); + Self::exec_raw(opcode, &dst, rhs_ct, rhs_imm); + dst + } + + pub fn exec_assign( + proto: &crate::asm::iop::IOpProto, + opcode: crate::asm::IOpcode, + rhs_ct: &[HpuVarWrapped], + rhs_imm: &[HpuImm], + ) { + // Clone dst sub-array from srcs + let dst = std::iter::zip(proto.dst.iter(), rhs_ct.iter()) + .map(|(p, v)| { + debug_assert_eq!( + *p, v.mode, + "Assign with invalid prototype, rhs mode don't match" + ); + v.clone() + }) + .collect::>(); + Self::exec_raw(opcode, &dst, rhs_ct, rhs_imm); + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/config.rs b/backends/tfhe-hpu-backend/src/interface/config.rs new file mode 100644 index 000000000..6c3d5975d --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/config.rs @@ -0,0 +1,163 @@ +//! 
Define Hpu configuration +//! Provide mechanism to load it from Toml-file + +use crate::ffi; +use crate::fw::rtl::config::RtlCfg; +use std::collections::{HashMap, HashSet}; + +/// ShellString +/// Thin wrapper around String that provide a method to interpolate it's content with environment +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct ShellString(String); +impl ShellString { + pub fn new(from: String) -> Self { + Self(from) + } + pub fn expand(&self) -> String { + // Regex that match on $MY_VAR or ${MY_VAR} + let shell_regex = regex::Regex::new(r"\$\{?([A-Za-z_][A-Za-z0-9_]*)\}?").unwrap(); + + // Replace each bash var occurrence with the associated environment variable value + let cow = shell_regex.replace_all(&self.0, |caps: ®ex::Captures| { + let shell_var = &caps[1]; + std::env::var(shell_var).unwrap_or_else(|_| { + panic!("Error: ShellString used env_var <{shell_var}> not found") + }) + }); + cow.to_string() + } +} + +/// Custom FromStr implementation to enable usage with clap CLI +impl std::str::FromStr for ShellString { + type Err = String; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + +/// Configuration of targeted FFI bridge with the Hw +/// Enable to select targeted ffi interface with specific properties +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub enum FFIMode { + V80 { + ami_id: usize, + qdma_h2c: ShellString, + qdma_c2h: ShellString, + }, + Xrt { + id: u32, + kernel: ShellString, + xclbin: ShellString, + }, + Sim { + ipc_name: ShellString, + }, +} + +/// Configuration of targeted Fpga +/// Define Bitstream and kernel properties +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct FpgaConfig { + pub regmap: Vec, + pub polling_us: u64, + pub ffi: FFIMode, +} + +/// Configuration of Rtl +/// Rtl has some internal knobs that could be configured +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct RtlConfig { + /// Select pe-pbs 
configuration Ipip or Bpip + pub bpip_use: bool, + /// When Bpip is used, select to use opportunistic behavior + pub bpip_use_opportunism: bool, + /// Timeout value to start Bpip even if batch isn't full + pub bpip_timeout: u32, +} + +/// On-board memory configuration +/// Define the Hbm pc properties and required memory size +/// NB: Hbm pc must match with `fpga/xr/kernel/${board}/cfg/${config}.cfg` +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct BoardConfig { + /// Ciphertext memory + /// Expressed the number of ciphertext slot to allocate + pub ct_mem: usize, + /// Depict the list of memories connected to ct master_axi + pub ct_pc: Vec, + /// Expressed the number of ct_mem slot used for heap + /// Heap is then used downward + pub heap_size: usize, + + /// Expressed the number of PbsLut slot to allocate + pub lut_mem: usize, + /// Depict the memory connected to glwe master_axi + pub lut_pc: ffi::MemKind, + + /// Expressed the size in u32 word allocated to Fw table + pub fw_size: usize, + /// Depict the memory connected to ucore fw master_axi + pub fw_pc: ffi::MemKind, + /// Depict the memory connected to trace manager + pub trace_pc: ffi::MemKind, + /// The trace memory depth in MB + pub trace_depth: usize, + + /// Depict the hbm_pc connected to bsk master_axi + pub bsk_pc: Vec, + /// Depict the hbm_pc connected to bsk master_axi + pub ksk_pc: Vec, +} + +/// Embedded Fw properties +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct FwConfig { + /// List of supported integer width + /// NB: Currently only one width is supported at a time + pub integer_w: HashSet, + + /// Kogge config filename + /// Used to depicts best tradeoff for kogge Add/Sub algorithm + pub kogge_cfg: ShellString, + + /// List of custom iop to load + /// IopName -> Iop asm file + pub custom_iop: HashMap, + + /// A per IOP configuration + pub op_cfg: RtlCfg, + + /// Defines the firmware implementation to use + pub implementation: String, + + /// 
Defines the minimum batch size for an accurate FW simulation (use this + /// while this information is not available as a register in the hardware) + pub min_batch_size: usize, +} + +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct HpuConfig { + pub fpga: FpgaConfig, + pub rtl: RtlConfig, + pub board: BoardConfig, + pub firmware: FwConfig, +} + +impl HpuConfig { + /// Provide Serde mechanisms from TOML file + pub fn from_toml(file: &str) -> Self { + let file_str = match std::fs::read_to_string(file) { + Ok(str) => str, + Err(err) => { + panic!("Error: `{file}`:: {err}"); + } + }; + + match toml::from_str(&file_str) { + Ok(cfg) => cfg, + Err(err) => panic!("Toml error in `{file}`: {err}"), + } + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/device.rs b/backends/tfhe-hpu-backend/src/interface/device.rs new file mode 100644 index 000000000..893059e69 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/device.rs @@ -0,0 +1,237 @@ +//! Implement User-view of Hpu backend +//! 
Through this interface user is able to instantiate and configure a Hpu Backend +use super::config::HpuConfig; +use super::*; +use crate::entities::*; + +use std::sync::{atomic, mpsc, Arc}; + +pub struct HpuDevice { + config: HpuConfig, + pub(crate) backend: backend::HpuBackendWrapped, + pub(crate) ct_mem: memory::CiphertextMemory, + pub(crate) cmd_api: mpsc::Sender, + pub(crate) params: Arc, + bg_poll: Arc, + bg_handles: Option<(std::thread::JoinHandle<()>, std::thread::JoinHandle<()>)>, +} + +/// Provide constructor +/// Use a toml configuration file to properly construct HpuDevice +/// This configuration file contain xclbin/kernel information and associated register map +/// definition +impl HpuDevice { + pub fn from_config(config_toml: &str) -> Self { + let config = HpuConfig::from_toml(config_toml); + + Self::new(config) + } + + pub fn new(config: HpuConfig) -> Self { + // Create backend + let (backend, cmd_api) = backend::HpuBackendWrapped::new_wrapped(&config); + + // Get ref to ct_memory and associated params + let (ct_mem, params) = { + let be = backend.lock().unwrap(); + (be.ct_mem.clone(), be.params.clone()) + }; + let mut device = Self { + config, + backend, + ct_mem, + cmd_api, + params: Arc::new(params), + bg_poll: Arc::new(atomic::AtomicBool::new(false)), + bg_handles: None, + }; + + // Start polling thread in the background + device.run_polling(); + device + } +} + +impl Drop for HpuDevice { + fn drop(&mut self) { + // Required background polling thread to stop + // This enable proper release of the associated HpuBackend + self.bg_poll.store(false, atomic::Ordering::SeqCst); + + if let Some((workq_handle, ackq_handle)) = self.bg_handles.take() { + workq_handle + .join() + .expect("Work_queue Background thread failed to stop properly"); + ackq_handle + .join() + .expect("Ack_queue Background thread failed to stop properly"); + } + } +} + +/// Retrieved Hw parameters & configuration +impl HpuDevice { + pub fn params(&self) -> &HpuParameters { + 
&self.params + } + pub fn config(&self) -> &HpuConfig { + &self.config + } +} + +/// Global Key setup +impl HpuDevice { + /// Convert keys (i.e. Ksk/Bsk) in the correct format + /// Upload them in on-board memory and configure associated register entries + /// Also use the given server key to generate required set of GlweLut + /// Upload them in on-board memory and configure associated register entries + // TODO fixdeps + pub fn init( + &self, + bsk: HpuLweBootstrapKeyOwned, + ksk: HpuLweKeyswitchKeyOwned, + gen_lut: F, + ) where + F: Fn(HpuParameters, &crate::asm::Pbs) -> HpuGlweLookuptableOwned, + { + // Properly reset keys + self.bsk_unset(); + self.ksk_unset(); + + self.bsk_set(bsk); + self.ksk_set(ksk); + + // Init GlweLut ciphertext + self.lut_init(gen_lut); + + // Init Fw Lut and Translation table + self.fw_init(); + + // Init HW trace offset + self.trace_init(); + } + + /// Enforce a clean state of the HPU before workload execution + /// Currently only enforce proper state of the Ciphertext pool + /// i.e. No already allocated Ciphertext and no fragmentation + pub fn mem_sanitizer(&self) { + // Lock underlying backend (the guard is held while reorder_pool runs) + let backend = self.backend.lock().unwrap(); + + // Trigger Ciphertext pool defragmentation + backend.ct_mem.reorder_pool(); + } +} +/// Bootstrapping Key handling +/// Only here to expose function to the user. Associated logic is handled by the backend +impl HpuDevice { + pub fn bsk_unset(&self) { + let mut backend = self.backend.lock().unwrap(); + backend.bsk_unset(); + } + pub fn bsk_set(&self, bsk: HpuLweBootstrapKeyOwned) { + let mut backend = self.backend.lock().unwrap(); + backend.bsk_set(bsk); + } + pub fn bsk_is_set(&self) -> bool { + let backend = self.backend.lock().unwrap(); + backend.bsk_is_set() + } +} + +/// KeyswitchKey handling +/// Only here to expose function to the user. 
Associated logic is handled by the backend +impl HpuDevice { + pub fn ksk_unset(&self) { + let mut backend = self.backend.lock().unwrap(); + backend.ksk_unset(); + } + pub fn ksk_set(&self, ksk: HpuLweKeyswitchKeyOwned) { + let mut backend = self.backend.lock().unwrap(); + backend.ksk_set(ksk); + } + pub fn ksk_is_set(&self) -> bool { + let backend = self.backend.lock().unwrap(); + backend.ksk_is_set() + } +} + +/// GlweLut/ Fw handling +/// Only here to expose function to the user. Associated logic is handled by the backend +impl HpuDevice { + pub(crate) fn lut_init(&self, gen_lut: F) + where + F: Fn(HpuParameters, &crate::asm::Pbs) -> HpuGlweLookuptableOwned, + { + let mut backend = self.backend.lock().unwrap(); + backend.lut_init(gen_lut) + } + pub fn fw_init(&self) { + let mut backend = self.backend.lock().unwrap(); + backend.fw_init(&self.config); + } + pub fn trace_init(&self) { + let mut backend = self.backend.lock().unwrap(); + backend.trace_init(); + } +} + +/// Allocate new Hpu variable to hold ciphertext +/// Only here to expose function to the user. Associated logic is handled by the backend +impl HpuDevice { + /// Construct an Hpu variable from a vector of HpuLweCiphertext + pub fn new_var_from( + &self, + ct: Vec>, + mode: crate::asm::iop::VarMode, + ) -> HpuVarWrapped { + HpuVarWrapped::new_from( + self.ct_mem.clone(), + self.cmd_api.clone(), + self.params.clone(), + ct, + mode, + ) + } +} + +/// Spawn a background thread that handle periodically update HW state +/// WARN: Variable still required lock on HpuBackend for allocation. 
Thus ensure to release the lock +/// periodically NB: This should be replaced by Irq when available +impl HpuDevice { + fn run_polling(&mut self) { + let backend = self.backend.clone(); + let bg_poll = self.bg_poll.clone(); + let tick = std::time::Duration::from_micros(self.config.fpga.polling_us); + + if bg_poll.load(atomic::Ordering::SeqCst) { + // background threads already running + // -> nothing to do + return; + }; + + bg_poll.store(true, atomic::Ordering::SeqCst); + let bg_workq = (bg_poll.clone(), backend.clone()); + let bg_ackq = (bg_poll.clone(), backend.clone()); + self.bg_handles = Some(( + std::thread::spawn(move || { + while bg_workq.0.load(atomic::Ordering::SeqCst) { + std::thread::sleep(tick); + { + let mut be = bg_workq.1.lock().unwrap(); + be.flush_workq().expect("Hpu encounter internal error"); + } + } + }), + std::thread::spawn(move || { + while bg_ackq.0.load(atomic::Ordering::SeqCst) { + std::thread::sleep(tick); + { + let mut be = bg_ackq.1.lock().unwrap(); + be.flush_ackq().expect("Hpu encounter internal error"); + } + } + }), + )); + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/io_dump.rs b/backends/tfhe-hpu-backend/src/interface/io_dump.rs new file mode 100644 index 000000000..aad719035 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/io_dump.rs @@ -0,0 +1,188 @@ +use std::fs::{File, OpenOptions}; +use std::io::{BufWriter, Write}; +/// Feature io_dump +/// Enable to log hpu object in hex file for debug purpose +use std::path::PathBuf; + +use crate::interface::memory::ciphertext::SlotId; +use crate::prelude::HpuParameters; + +thread_local! 
{ + static HPU_IO_DUMP: std::cell::RefCell> = const { std::cell::RefCell::new(None) }; +} + +#[derive(Debug)] +pub enum DumpKind { + Bsk, + Ksk, + Glwe, + BlweIn, + BlweOut, +} + +impl DumpKind { + const FOLDER: [&'static str; 4] = ["key", "blwe/input", "blwe/output", "glwe"]; +} + +pub fn set_hpu_io_dump(dir_path: &str) { + // Enforce that given file_path exist + let path = PathBuf::from(dir_path); + if path.exists() { + if path.is_file() { + panic!("HPU_IO_DUMP: given file_path is a file. Directory expected"); + } + } else { + // Create it + std::fs::create_dir_all(&path).unwrap(); + } + // Create all subpath + for f in DumpKind::FOLDER.iter() { + let sub_path = path.join(f); + std::fs::create_dir_all(sub_path).unwrap(); + } + HPU_IO_DUMP.replace(Some(path)); +} + +#[derive(Debug)] +pub enum DumpId { + Slot(SlotId, usize), + Key(usize), + Lut(usize), +} + +pub fn dump>( + value: &[T], + params: &HpuParameters, + kind: DumpKind, + id: DumpId, +) { + HPU_IO_DUMP.with_borrow(|inner| { + if let Some(path) = inner { + // Open file + let file_path = match id { + DumpId::Slot(sid, cut) => match kind { + DumpKind::BlweIn => format!( + "{}/blwe/input/blwe_{:0>4x}_{cut:0>1x}.hex", + path.display(), + sid.0, + ), + DumpKind::BlweOut => format!( + "{}/blwe/output/blwe_{:0>4x}_{cut:0>1x}.hex", + path.display(), + sid.0, + ), + _ => panic!("Unexpected DumpId {id:?} with kind {kind:?}"), + }, + + DumpId::Key(cut) => match kind { + DumpKind::Bsk => format!("{}/key/bsk_{cut:0>1x}.hex", path.display(),), + DumpKind::Ksk => format!("{}/key/ksk_{cut:0>1x}.hex", path.display()), + _ => panic!("Unexpected DumpId {id:?} with kind {kind:?}"), + }, + DumpId::Lut(cut) => match kind { + DumpKind::Glwe => format!("{}/glwe/glwe_{cut:0>2x}.hex", path.display()), + _ => panic!("Unexpected DumpId {id:?} with kind {kind:?}"), + }, + }; + + // Open file + let mut wr_f = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(file_path) + .unwrap(); + + // Dump + // Based on 
configuration dump value must be shrunk to 32b (i.e. when contained + // information is <= 32) + let (word_bits, line_bytes) = match kind { + DumpKind::Bsk => (params.ntt_params.ct_width, params.pc_params.bsk_bytes_w), + DumpKind::Ksk => ( + (params.ks_params.lbz * params.ks_params.width) as u32, + params.pc_params.ksk_bytes_w, + ), + DumpKind::Glwe => (params.ntt_params.ct_width, params.pc_params.glwe_bytes_w), + DumpKind::BlweIn => (params.ntt_params.ct_width, params.pc_params.pem_bytes_w), + DumpKind::BlweOut => (params.ntt_params.ct_width, params.pc_params.pem_bytes_w), + }; + + // Shrink value to 32b when possible + if word_bits <= u32::BITS { + let value_32b = value + .into_iter() + .map(|x| { + let x_u32: u32 = x.as_(); + x_u32 + }) + .collect::>(); + value_32b + .as_slice() + .write_hex(&mut wr_f, line_bytes, Some("XX")); + } else { + value.write_hex(&mut wr_f, line_bytes, Some("XX")); + } + } + }) +} + +/// HexMem dump trait. +/// +/// Enable to generate a .mem in hex format from a rust structure. +/// `.mem` are used by RTL to load constants or stimulus +pub trait HexMem { + fn as_bytes(&self) -> &[u8]; + + fn write_hex(&self, into: &mut File, line_w: usize, pad_with: Option<&str>) { + // Use write buffer for performances purpose + let mut into_wrbfr = BufWriter::new(into); + + let bytes = self.as_bytes(); + + let lines = bytes.len() / line_w; + let residual = bytes.len() % line_w; + + // Write full lines + for l in 0..lines { + let cur_slice = &bytes[l * line_w..(l + 1) * line_w]; + for c in cur_slice.iter().rev() { + write!(into_wrbfr, "{:02x}", c).unwrap(); + } + writeln!(into_wrbfr).unwrap(); + } + + // Add padding if requested + if let Some(padder) = pad_with { + assert_eq!( + padder.len(), + 2, + "Padding str length must be 2 (u8 written in hex)." 
+ ); + let pad_len = if 0 != residual { line_w - residual } else { 0 }; + for _ in 0..pad_len { + write!(into_wrbfr, "{padder}").unwrap(); + } + } + // Write residual line + let res_slice = &bytes[lines * line_w..]; + for c in res_slice.iter().rev() { + write!(into_wrbfr, "{:02x}", c).unwrap(); + } + writeln!(into_wrbfr).unwrap(); + } +} + +// Blanket implementation for primitive integer slice +impl HexMem for &[T] +where + T: num_traits::PrimInt, +{ + #[cfg(target_endian = "little")] + fn as_bytes(&self) -> &[u8] { + let len = std::mem::size_of_val(*self); + let ptr = self.as_ptr() as *const u8; + unsafe { std::slice::from_raw_parts(ptr, len) } + } + #[cfg(target_endian = "big")] + compile_error!("Macro implementation of HexMem trait only supported on Little-endian machine"); +} diff --git a/backends/tfhe-hpu-backend/src/interface/memory/ciphertext.rs b/backends/tfhe-hpu-backend/src/interface/memory/ciphertext.rs new file mode 100644 index 000000000..08a728544 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/memory/ciphertext.rs @@ -0,0 +1,250 @@ +//! +//! Memory manager for HPU +//! Memory is allocatod upfront and abstract as a set of slot +use crate::ffi; +use crossbeam::queue::ArrayQueue; + +/// Define the rate of WARNING on allocation retry +pub const ALLOC_RETRY_WARN_RATE: std::time::Duration = std::time::Duration::from_secs(1); + +/// Describe Slot position +/// Abstract from internal ASM type to help with future +#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +pub struct SlotId(pub(crate) usize); + +/// Ciphertext could be spread over multiple HbmPc. 
+/// A Slot is describe as a position and a set of associated MemZone +pub struct CiphertextSlot { + pub(crate) id: SlotId, + pub(crate) mz: Vec, +} + +impl std::fmt::Debug for CiphertextSlot { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.id) + } +} + +impl CiphertextSlot { + fn alloc(ffi_hw: &mut ffi::HpuHw, id: SlotId, props: &CiphertextMemoryProperties) -> Self { + let mz = props + .mem_cut + .iter() + .map(|kind| { + let cut_props = ffi::MemZoneProperties { + mem_kind: *kind, + size_b: props.cut_size_b, + }; + ffi_hw.alloc(cut_props) + }) + .collect::>(); + CiphertextSlot { id, mz } + } + + fn release(&mut self, ffi_hw: &mut ffi::HpuHw) { + self.mz.iter_mut().for_each(|mz| ffi_hw.release(mz)); + } +} + +#[derive(Debug, Clone)] +pub struct CiphertextMemoryProperties { + pub mem_cut: Vec, + pub cut_size_b: usize, + pub slot_nb: usize, + pub used_as_heap: usize, + pub retry_rate_us: u64, +} + +#[derive(Debug, Clone)] +pub struct CiphertextMemory { + pub(crate) pool: std::sync::Arc>, + retry_rate_us: u64, +} + +impl std::ops::Deref for CiphertextMemory { + type Target = std::sync::Arc>; + + fn deref(&self) -> &Self::Target { + &self.pool + } +} + +/// Structure to keep track of Slot alongside pool +/// CiphertextSlot are automatically return back to pool on drop +#[derive(Debug)] +pub struct CiphertextBundle { + slots: Vec, + pool: CiphertextMemory, +} + +impl Drop for CiphertextBundle { + fn drop(&mut self) { + let Self { slots, pool, .. 
} = self; + while let Some(slot) = slots.pop() { + pool.push(slot) + .expect("Error: Release a slot in already full pool"); + } + } +} + +impl CiphertextBundle { + /// Bundle is characterized by its first slot + pub fn id(&self) -> &SlotId { + &self.slots[0].id + } + pub fn iter(&mut self) -> std::slice::Iter<'_, CiphertextSlot> { + self.slots.iter() + } + pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, CiphertextSlot> { + self.slots.iter_mut() + } +} + +impl CiphertextMemory { + #[tracing::instrument(level = "trace", skip(ffi_hw, regmap), ret)] + pub fn alloc( + ffi_hw: &mut ffi::HpuHw, + regmap: &hw_regmap::FlatRegmap, + props: &CiphertextMemoryProperties, + ) -> Self { + let pool = (0..props.slot_nb) + .map(|cid| { + let id = SlotId(cid); + CiphertextSlot::alloc(ffi_hw, id, props) + }) + .collect::>(); + + let mut paddr = Vec::with_capacity(props.mem_cut.len()); + if !pool.is_empty() { + // Sanity check + // Slot must be contiguous in each cut + + for cut_nb in 0..props.mem_cut.len() { + let base_addr = pool[0].mz[cut_nb].paddr(); + paddr.push(base_addr); + + pool.iter().enumerate().for_each(|(i, slot)| { + let cont_addr = base_addr + (i * props.cut_size_b) as u64; + let real_addr = slot.mz[cut_nb].paddr(); + assert_eq!( + cont_addr, real_addr, + "Ct slot@{i} weren't contiguous in memory" + ); + }); + } + + // Extract LdSt_addr_pc register addr + let ldst_addr_pc = (0..props.mem_cut.len()) + .map(|idx| { + let lsb_name = format!("hbm_axi4_addr_1in3::ct_pc{idx}_lsb"); + let msb_name = format!("hbm_axi4_addr_1in3::ct_pc{idx}_msb"); + let lsb = regmap + .register() + .get(&lsb_name) + .expect("Unknown register, check regmap definition"); + let msb = regmap + .register() + .get(&msb_name) + .expect("Unknown register, check regmap definition"); + (lsb, msb) + }) + .collect::>(); + + // Write pc_addr in registers + for (addr, (lsb, msb)) in std::iter::zip(paddr.iter(), ldst_addr_pc.iter()) { + ffi_hw.write_reg( + *msb.offset() as u64, + ((addr >> u32::BITS) & 
(u32::MAX) as u64) as u32, + ); + ffi_hw.write_reg(*lsb.offset() as u64, (addr & (u32::MAX as u64)) as u32); + } + } + + // Store slot in ArrayQueue for MpMc access + let array_queue = ArrayQueue::new(props.slot_nb - props.used_as_heap); + for (idx, slot) in pool.into_iter().enumerate() { + if idx < (props.slot_nb - props.used_as_heap) { + array_queue.push(slot).expect("Check ArrayQueue allocation"); + } + // else slot is used by heap and shouldn't be handled by the ct pool + } + Self { + pool: std::sync::Arc::new(array_queue), + retry_rate_us: props.retry_rate_us, + } + } + + #[tracing::instrument(level = "trace", skip(ffi_hw), ret)] + pub fn release(&mut self, ffi_hw: &mut ffi::HpuHw) { + while let Some(mut slot) = self.pool.pop() { + slot.release(ffi_hw) + } + } +} + +impl CiphertextMemory { + /// Extract `bundle_size` contiguous slots from the pool; blocks until they are available + #[tracing::instrument(level = "trace", skip(self), ret)] + pub fn get_bundle(&self, bundle_size: usize) -> CiphertextBundle { + // Implement sliding windows search for contiguous block + // TODO enhance this algorithm. Currently it's a naive implementation + let mut win_slots = Vec::with_capacity(self.pool.capacity()); + + // Check for contiguousness and extend the window if necessary + loop { + let mut retry = std::time::Duration::from_micros(0); + let retry_rate = std::time::Duration::from_micros(self.retry_rate_us); + let slot = loop { + if let Some(slot) = self.pool.pop() { + break slot; + } else { + std::thread::sleep(retry_rate); + retry += retry_rate; + if retry >= ALLOC_RETRY_WARN_RATE { + tracing::warn!("Allocation struggle more than {retry:?} to get ciphertext from pool. 
Check that your algorithm memory allocation and associated Hpu configuration"); + retry = std::time::Duration::from_micros(0) + } + } + }; + win_slots.push(slot); + if win_slots.len() < bundle_size { + continue; + } + win_slots.sort_by(|a, b| a.id.partial_cmp(&b.id).unwrap()); + + // Check contiguous: scan the sorted window for bundle_size consecutive SlotId + for i in 0..=(win_slots.len() - bundle_size) { + let is_contiguous = + (0..bundle_size).all(|j| win_slots[i + j].id == SlotId(win_slots[i].id.0 + j)); + if is_contiguous { + let mut slots = Vec::with_capacity(bundle_size); + for (p, slot) in win_slots.into_iter().enumerate() { + if (p < i) || p >= (i + bundle_size) { + // Slot is outside the window [i, i + bundle_size): return it to pool + self.pool + .push(slot) + .expect("Error: Release a slot in already full pool"); + } else { + slots.push(slot) + } + } + return CiphertextBundle { + slots, + pool: self.clone(), + }; + } + } + } + } + + /// Enforce CiphertextMemory completeness and ordering + /// Use to prevent fragmentation between various workload + /// + /// Warn: This function could block in case of un-released ciphertext slots + #[tracing::instrument(level = "debug", skip(self), ret)] + pub fn reorder_pool(&self) { + let all_in_one_bundle = self.get_bundle(self.pool.capacity()); + std::hint::black_box(&all_in_one_bundle); + drop(all_in_one_bundle); + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/memory/huge.rs b/backends/tfhe-hpu-backend/src/interface/memory/huge.rs new file mode 100644 index 000000000..aa9d7bcb4 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/memory/huge.rs @@ -0,0 +1,188 @@ +//! +//! Structure used to handle memory associated with Bsk/Ksk +//! Huge memory are composed of multiple cut. Furthermore, each cut is allocated on a set of +//! fix-size buffer. This is to mitigate a limitation of XRT memory allocation. 
+ +use crate::ffi; + +// Some XRT constants +// Use to circumvent current XRT limitation with huge buffer +// Any buffer is sliced into chunk of at max MEM_CHUNK_SIZE to prevent issue with XRT allocator +#[allow(unused)] +const MEM_BANK_SIZE_MB: usize = 512; +const MEM_CHUNK_SIZE_B: usize = 16 * 1024 * 1024; + +#[derive(Debug)] +pub struct HugeMemoryProperties { + pub mem_cut: Vec, + pub cut_coefs: usize, +} + +pub struct HugeMemory { + cut_coefs: usize, + cut_mem: Vec>, + phantom: std::marker::PhantomData, +} +impl std::fmt::Debug for HugeMemory { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "HugeMemory{{cut_coefs: {}}}", + std::mem::size_of::(), + self.cut_coefs + ) + } +} + +impl HugeMemory { + /// This function allocate a set of memzone to store HugeMemory block + /// HugeMemory block is spread over multiple Hbm cut. Furthermore, due to size and XRT + /// limitation each cut is split on multiple buffer of 16MiB. + /// We allocate 16MiB buffer only ( the last one isn't shrunk to fit the required memory size) + #[tracing::instrument(level = "trace", skip(ffi_hw), ret)] + pub fn alloc(ffi_hw: &mut ffi::HpuHw, props: HugeMemoryProperties) -> Self { + assert_eq!( + 0, + MEM_CHUNK_SIZE_B % std::mem::size_of::(), + "Word width must divide MEM_CHUNK_SIZE_B" + ); + + let all_chunks = + usize::div_ceil(props.cut_coefs * std::mem::size_of::(), MEM_CHUNK_SIZE_B); + + let mut cut_mem = Vec::new(); + for mem_kind in props.mem_cut.into_iter() { + let mut cut_mz = Vec::new(); + let mut cur_mem_kind = mem_kind; + + for _chunk in 0..all_chunks { + let chunk_props = ffi::MemZoneProperties { + mem_kind: cur_mem_kind, + size_b: MEM_CHUNK_SIZE_B, + }; + // Update mem_kind if needed (i.e. update DDR offset for next chunk + cur_mem_kind = match cur_mem_kind { + ffi::MemKind::Ddr { offset } => ffi::MemKind::Ddr { + offset: offset + MEM_CHUNK_SIZE_B, + }, + ffi::MemKind::Hbm { .. 
} => cur_mem_kind, + }; + let mz = ffi_hw.alloc(chunk_props); + cut_mz.push(mz); + } + + // Sanity check + // cut buffer must be contiguous in memory + let base_addr = cut_mz[0].paddr(); + for (i, mz) in cut_mz[1..].iter().enumerate() { + let cont_addr = base_addr + ((i + 1) * MEM_CHUNK_SIZE_B) as u64; + let real_addr = mz.paddr(); + assert_eq!( + cont_addr, real_addr, + "HugeMemory chunk weren't contiguous in memory" + ); + } + + cut_mem.push(cut_mz); + } + Self { + cut_mem, + cut_coefs: props.cut_coefs, + phantom: std::marker::PhantomData::, + } + } + + /// This function release associated memzone + #[tracing::instrument(level = "trace", skip(ffi_hw), ret)] + pub fn release(&mut self, ffi_hw: &mut ffi::HpuHw) { + self.cut_mem + .iter_mut() + .flatten() + .for_each(|mz| ffi_hw.release(mz)); + } + + /// Write data slice into memory cut_id + /// NB: User specify offset in unit of data. + #[tracing::instrument(level = "trace", skip(data), ret)] + pub fn write_cut_at(&mut self, cut_id: usize, ofst: usize, data: &[T]) { + assert!( + ofst + data.len() <= self.cut_coefs, + "Invalid write size. 
Write stop beyond the HugeMemory boundaries" + ); + let cut = self + .cut_mem + .get_mut(cut_id) + .unwrap_or_else(|| panic!("Invalid cut_id: {cut_id}")); + + // Underlying memory is viewed as byte memory + // Extract byte ofst and byte length + // NB: Don't use generic write method to prevent misunderstanding of ofst meaning + // Indeed, we must use a byte offset to compute the sub-bfr id and thus keep a + // byte approach everywhere to prevent mismatch + let ofst_b = ofst * std::mem::size_of::(); + let len_b = std::mem::size_of_val(data); + + let bid_start = ofst_b / MEM_CHUNK_SIZE_B; + let bid_stop = (ofst_b + len_b).div_ceil(MEM_CHUNK_SIZE_B); // exclusive end chunk + let mut bid_ofst = ofst_b % MEM_CHUNK_SIZE_B; + + let mut rmn_data = len_b; + let mut data_ofst = 0; + + let data_bytes = bytemuck::cast_slice::(data); + for bfr in cut[bid_start..bid_stop].iter_mut() { + let size_b = std::cmp::min(rmn_data, MEM_CHUNK_SIZE_B - bid_ofst); + bfr.write_bytes(bid_ofst, &data_bytes[data_ofst..data_ofst + size_b]); + bfr.sync(ffi::SyncMode::Host2Device); + data_ofst += size_b; + rmn_data -= size_b; + bid_ofst = 0; + } + } + + /// Read data slice from memory cut_id + /// NB: User specify offset in unit of data. + #[tracing::instrument(level = "trace", skip(data), ret)] + pub fn read_cut_at(&mut self, cut_id: usize, ofst: usize, data: &mut [T]) { + assert!( + ofst + data.len() <= self.cut_coefs, + "Invalid read size. 
Read stop beyond the HugeMemory boundaries" + ); + let cut = self.cut_mem.get_mut(cut_id).expect("Invalid cut_id"); + + // Underlying memory is viewed as byte memory + // Extract byte ofst and byte length + // NB: Don't use generic read method to prevent misunderstanding of ofst meaning + // Indeed, we must use a byte offset to compute the sub-bfr id and thus keep a + // byte approach everywhere to prevent mismatch + let ofst_b = ofst * std::mem::size_of::(); + let len_b = std::mem::size_of_val(data); + + let bid_start = ofst_b / MEM_CHUNK_SIZE_B; + let bid_stop = (ofst_b + len_b).div_ceil(MEM_CHUNK_SIZE_B); // exclusive end chunk + let mut bid_ofst = ofst_b % MEM_CHUNK_SIZE_B; + + let mut rmn_data = len_b; + let mut data_ofst = 0; + + let data_bytes = bytemuck::cast_slice_mut::(data); + for bfr in cut[bid_start..bid_stop].iter_mut() { + let size_b = std::cmp::min(rmn_data, MEM_CHUNK_SIZE_B - bid_ofst); + bfr.sync(ffi::SyncMode::Device2Host); + bfr.read_bytes(bid_ofst, &mut data_bytes[data_ofst..data_ofst + size_b]); + data_ofst += size_b; + rmn_data -= size_b; + bid_ofst = 0; + } + } + + /// Return paddr of cuts + /// Use paddr of first buffer for Hw configuration + #[tracing::instrument(level = "trace", ret)] + pub fn cut_paddr(&self) -> Vec { + self.cut_mem + .iter() + .map(|cut| cut[0].paddr()) + .collect::>() + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/memory/mod.rs b/backends/tfhe-hpu-backend/src/interface/memory/mod.rs new file mode 100644 index 000000000..41b204d22 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/memory/mod.rs @@ -0,0 +1,12 @@ +// Xrt required memory allocation to be page aligned +pub(crate) const MEM_PAGE_SIZE_B: usize = 4096; + +/// Compute the minimal size to keep page alignment +pub fn page_align(size_b: usize) -> usize { + size_b.div_ceil(MEM_PAGE_SIZE_B) * MEM_PAGE_SIZE_B +} + +pub(crate) mod ciphertext; +pub(crate) mod huge; +pub use ciphertext::{CiphertextBundle, CiphertextMemory, CiphertextMemoryProperties}; +pub use 
huge::{HugeMemory, HugeMemoryProperties}; diff --git a/backends/tfhe-hpu-backend/src/interface/mod.rs b/backends/tfhe-hpu-backend/src/interface/mod.rs new file mode 100644 index 000000000..38aec2ad7 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/mod.rs @@ -0,0 +1,40 @@ +mod backend; +mod cmd; +pub use cmd::{HpuCmd, HpuImm}; +mod config; +mod device; +mod memory; +pub mod rtl; +mod variable; + +#[cfg(feature = "io-dump")] +pub mod io_dump; + +use thiserror::Error; + +// Publicly export some types +pub const ACKQ_EMPTY: u32 = 0xdeadc0de; +pub const FW_TABLE_ENTRY: usize = 128; +pub use config::{BoardConfig, FFIMode, HpuConfig, ShellString}; +pub use device::HpuDevice; +pub use memory::page_align; +pub use variable::HpuVarWrapped; + +/// Common error type reported by Hpu +#[derive(Error, Debug, Clone, PartialEq, Eq)] +pub(crate) enum HpuInternalError { + #[error("Couldn't sync uninitialized variable.")] + UninitData, + + // Recoverable errors + #[error("Couldn't sync yet. Operation is pending")] + OperationPending, +} + +/// Common error type exposed to user +#[derive(Error, Clone, Debug)] +pub enum HpuError { + // Recoverable errors + #[error("Couldn't sync yet. 
Operation is pending")] + SyncPending(variable::HpuVarWrapped), +} diff --git a/backends/tfhe-hpu-backend/src/interface/rtl/mod.rs b/backends/tfhe-hpu-backend/src/interface/rtl/mod.rs new file mode 100644 index 000000000..da36ffa56 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/rtl/mod.rs @@ -0,0 +1,9 @@ +pub mod params; +pub mod runtime; + +use crate::ffi; +use hw_regmap::FlatRegmap; +/// Trait used to extract/parse information from Rtl registers +pub trait FromRtl { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self; +} diff --git a/backends/tfhe-hpu-backend/src/interface/rtl/params.rs b/backends/tfhe-hpu-backend/src/interface/rtl/params.rs new file mode 100644 index 000000000..4cf45db69 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/rtl/params.rs @@ -0,0 +1,462 @@ +//! +//! Extract architecture properties from RTL registers +//! Read Rtl parameters from registers. +//! NB: Some registers contains encoded value that must be converted to concrete one (i.e. +//! apps/ntt_moduls) +use parameters::HpuNttPrime; + +use super::*; +use crate::entities::*; + +// Set of constant defined in RTL and associated rust definition +// -> Cf. 
fpga/hw/common_lib/common_package/rtl/common_definition_pkg.sv +pub const NTT_CORE_ARCH_OFS: u32 = 5 << 8; +pub const MOD_NTT_NAME_OFS: u32 = 6 << 8; +pub const APPLICATION_NAME_OFS: u32 = 7 << 8; +pub const SIMULATION_CODE: u32 = 1; + +impl FromRtl for HpuParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let pbs_params = HpuPBSParameters::from_rtl(ffi_hw, regmap); + let ntt_params = HpuNttParameters::from_rtl(ffi_hw, regmap); + let ks_params = HpuKeyswitchParameters::from_rtl(ffi_hw, regmap); + let pc_params = HpuPcParameters::from_rtl(ffi_hw, regmap); + let regf_params = HpuRegfileParameters::from_rtl(ffi_hw, regmap); + let isc_params = HpuIscParameters::from_rtl(ffi_hw, regmap); + Self { + pbs_params, + ntt_params, + ks_params, + pc_params, + regf_params, + isc_params, + } + } +} + +impl FromRtl for HpuKeyswitchParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let ks_shape = regmap + .register() + .get("info::ks_structure") + .expect("Unknown register, check regmap definition"); + let shape_val = ffi_hw.read_reg(*ks_shape.offset() as u64); + let shape_fields = ks_shape.as_field(shape_val); + + let ks_info = regmap + .register() + .get("info::ks_crypto_param") + .expect("Unknown register, check regmap definition"); + let info_val = ffi_hw.read_reg(*ks_info.offset() as u64); + let info_fields = ks_info.as_field(info_val); + + Self { + width: *info_fields.get("mod_ksk_w").expect("Unknown field") as usize, + lbx: *shape_fields.get("x").expect("Unknown field") as usize, + lby: *shape_fields.get("y").expect("Unknown field") as usize, + lbz: *shape_fields.get("z").expect("Unknown field") as usize, + } + } +} +impl FromRtl for HpuNttParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let core_arch = HpuNttCoreArch::from_rtl(ffi_hw, regmap); + + // Values extracted from NttInternal register + let ntt_internal = regmap + .register() + .get("info::ntt_structure") + 
.expect("Unknown register, check regmap definition"); + let internal_val = ffi_hw.read_reg(*ntt_internal.offset() as u64); + let internal_fields = ntt_internal.as_field(internal_val); + + let radix = *internal_fields.get("radix").expect("Unknown field") as usize; + let psi = *internal_fields.get("psi").expect("Unknown field") as usize; + let delta = *internal_fields.get("delta").expect("Unknown field") as usize; + + // Values extracted from NttInternal register + let ntt_pbs_nb = regmap + .register() + .get("info::ntt_pbs") + .expect("Unknown register, check regmap definition"); + let pbs_nb_val = ffi_hw.read_reg(*ntt_pbs_nb.offset() as u64); + let pbs_nb_fields = ntt_pbs_nb.as_field(pbs_nb_val); + + let batch_pbs_nb = *pbs_nb_fields.get("batch_pbs_nb").expect("Unknown field") as usize; + let total_pbs_nb = *pbs_nb_fields.get("total_pbs_nb").expect("Unknown field") as usize; + + // Values extracted from NttModulo register + // Modulus isn't directly expressed, instead used custom encoding + let ntt_modulo = regmap + .register() + .get("info::ntt_modulo") + .expect("Unknown register, check regmap definition"); + let ntt_modulo_val = ffi_hw.read_reg(*ntt_modulo.offset() as u64); + + let prime_modulus = { + // Check register encoding + let field_code = ntt_modulo_val & (!0xFF_u32); + assert_eq!( + field_code, MOD_NTT_NAME_OFS, + "Invalid register encoding. 
Check register map definition" + ); + match (ntt_modulo_val & 0xFF) as u8 { + enum_id if enum_id == HpuNttPrime::GF64 as u8 => HpuNttPrime::GF64, + enum_id if enum_id == HpuNttPrime::Solinas3_32_17_13 as u8 => { + HpuNttPrime::Solinas3_32_17_13 + } + enum_id if enum_id == HpuNttPrime::Solinas2_44_14 as u8 => { + HpuNttPrime::Solinas2_44_14 + } + _ => panic!("Unknown NttModName encoding"), + } + }; + + // Values extracted from Application + // Not the cleanest way but some required ntt information are only available in the + // parameters set Thus parse extract HpuPBSParameters inside HpuNttParameters + let pbs_params = HpuPBSParameters::from_rtl(ffi_hw, regmap); + let stg_nb = pbs_params.polynomial_size.ilog(radix) as usize; + + Self { + core_arch, + min_pbs_nb: None, // TODO: Get this from a register + batch_pbs_nb, + total_pbs_nb, + ct_width: pbs_params.ciphertext_width as u32, + radix, + stg_nb, + prime_modulus, + psi, + delta, + } + } +} + +impl FromRtl for HpuNttCoreArch { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Values extracted from NttModulo register + // Modulus isn't directly expressed, instead used custom encoding + let ntt_core_arch = regmap + .register() + .get("info::ntt_architecture") + .expect("Unknown register, check regmap definition"); + let ntt_core_arch_val = ffi_hw.read_reg(*ntt_core_arch.offset() as u64); + + // Check register encoding + let field_code = ntt_core_arch_val & (!0xFF_u32); + assert_eq!( + field_code, NTT_CORE_ARCH_OFS, + "Invalid register encoding. 
Check register map definition" + ); + + match ntt_core_arch_val & 0xFF { + // NB: Previous arch aren't supported anymore + 3 => Self::WmmCompactPcg, + 4 => Self::WmmUnfoldPcg, + 5 => { + // Extract associated radix split + + let radix_cut = regmap + .register() + .get("info::ntt_rdx_cut") + .expect("Unknown register, check regmap definition"); + let radix_cut_val = ffi_hw.read_reg(*radix_cut.offset() as u64); + let cut_l = (0..(u32::BITS / 4)) + .map(|ofst| ((radix_cut_val >> (ofst * 4)) & 0xf) as u8) + .filter(|x| *x != 0) + .collect::>(); + Self::GF64(cut_l) + } + _ => panic!("Unknown NttCoreArch encoding"), + } + } +} + +impl FromRtl for HpuPcParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Extract number of Pc for each channel + let hbm_pc = regmap + .register() + .get("info::hbm_axi4_nb") + .expect("Unknown register, check regmap definition"); + let hbm_pc_val = ffi_hw.read_reg(*hbm_pc.offset() as u64); + let hbm_pc_fields = hbm_pc.as_field(hbm_pc_val); + + let ksk_pc = *hbm_pc_fields.get("ksk_pc").expect("Unknown field") as usize; + let bsk_pc = *hbm_pc_fields.get("bsk_pc").expect("Unknown field") as usize; + let pem_pc = *hbm_pc_fields.get("pem_pc").expect("Unknown field") as usize; + + // Extract bus width for each channel + let ksk_bytes_w = { + let ksk_axi4_data_w = regmap + .register() + .get("info::hbm_axi4_dataw_ksk") + .expect("Unknown register, check regmap definition"); + let ksk_axi4_data_w_val = ffi_hw.read_reg(*ksk_axi4_data_w.offset() as u64); + // Value is in bit in rtl and SW expect bytes + ksk_axi4_data_w_val.div_ceil(u8::BITS) as usize + }; + let bsk_bytes_w = { + let bsk_axi4_data_w = regmap + .register() + .get("info::hbm_axi4_dataw_bsk") + .expect("Unknown register, check regmap definition"); + let bsk_axi4_data_w_val = ffi_hw.read_reg(*bsk_axi4_data_w.offset() as u64); + // Value is in bit in rtl and SW expect bytes + bsk_axi4_data_w_val.div_ceil(u8::BITS) as usize + }; + let pem_bytes_w = { + let 
pem_axi4_data_w = regmap + .register() + .get("info::hbm_axi4_dataw_pem") + .expect("Unknown register, check regmap definition"); + let pem_axi4_data_w_val = ffi_hw.read_reg(*pem_axi4_data_w.offset() as u64); + // Value is in bit in rtl and SW expect bytes + pem_axi4_data_w_val.div_ceil(u8::BITS) as usize + }; + let glwe_bytes_w = { + let glwe_axi4_data_w = regmap + .register() + .get("info::hbm_axi4_dataw_glwe") + .expect("Unknown register, check regmap definition"); + let glwe_axi4_data_w_val = ffi_hw.read_reg(*glwe_axi4_data_w.offset() as u64); + // Value is in bit in rtl and SW expect bytes + glwe_axi4_data_w_val.div_ceil(u8::BITS) as usize + }; + + Self { + ksk_pc, + bsk_pc, + pem_pc, + ksk_bytes_w, + bsk_bytes_w, + pem_bytes_w, + glwe_bytes_w, + } + } +} + +impl FromRtl for HpuRegfileParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let regf = regmap + .register() + .get("info::regf_structure") + .expect("Unknown register, check regmap definition"); + let regf_val = ffi_hw.read_reg(*regf.offset() as u64); + let regf_fields = regf.as_field(regf_val); + + Self { + reg_nb: *regf_fields.get("reg_nb").expect("Unknown field") as usize, + coef_nb: *regf_fields.get("coef_nb").expect("Unknown field") as usize, + } + } +} + +impl FromRtl for HpuIscParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let isc = regmap + .register() + .get("info::isc_structure") + .expect("Unknown register, check regmap definition"); + let isc_val = ffi_hw.read_reg(*isc.offset() as u64); + let isc_fields = isc.as_field(isc_val); + + Self { + min_iop_size: *isc_fields.get("min_iop_size").expect("Unknown field") as usize, + depth: *isc_fields.get("depth").expect("Unknown field") as usize, + } + } +} + +// Define parameters set as constants +// Used to easily derived IoMeasure version without duplication +pub const CONCRETE_BOOLEAN: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 586, + glwe_dimension: 2, + 
polynomial_size: 512, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(0.00000000007069849454709433), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev( + 0.0000000000000000000029403601535432533, + ), + pbs_base_log: 8, + pbs_level: 2, + ks_base_log: 5, + ks_level: 4, + message_width: 1, + carry_width: 0, + ciphertext_width: 32, +}; + +pub const MSG2_CARRY2: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 742, + glwe_dimension: 1, + polynomial_size: 2048, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(9.039_924_320_497_611e-6_f64), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(3.1529314934984704e-16_f64), + pbs_base_log: 19, + pbs_level: 1, + ks_base_log: 3, + ks_level: 5, + message_width: 2, + carry_width: 2, + ciphertext_width: u64::BITS as usize, +}; + +pub const MSG2_CARRY2_64B: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 710, + glwe_dimension: 2, + polynomial_size: 1024, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(1.630_783_646_854_603e-5_f64), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(3.1529314934984704e-16_f64), + pbs_base_log: 25, + pbs_level: 1, + ks_base_log: 2, + ks_level: 7, + message_width: 2, + carry_width: 2, + ciphertext_width: u64::BITS as usize, +}; + +pub const MSG2_CARRY2_44B: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 724, + glwe_dimension: 2, + polynomial_size: 1024, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev( + 1.259_780_968_897_627_7e-5_f64, + ), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(2.2737367544323206e-13_f64), + pbs_base_log: 20, + pbs_level: 1, + ks_base_log: 2, + ks_level: 7, + message_width: 2, + carry_width: 2, + ciphertext_width: 44, +}; + +pub const MSG2_CARRY2_64B_FAKE: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 724, + glwe_dimension: 2, + polynomial_size: 1024, + lwe_noise_distribution: 
HpuNoiseDistributionInput::GaussianStdDev( + 1.259_780_968_897_627_7e-5_f64, + ), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(2.2737367544323206e-13_f64), + pbs_base_log: 20, + pbs_level: 1, + ks_base_log: 2, + ks_level: 7, + message_width: 2, + carry_width: 2, + ciphertext_width: 64, +}; + +pub const MSG2_CARRY2_GAUSSIAN: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 834, + glwe_dimension: 1, + polynomial_size: 2048, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev( + 3.553_990_235_944_282_5e-6_f64, + ), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(2.845267479601915e-15_f64), + pbs_base_log: 23, + pbs_level: 1, + ks_base_log: 3, + ks_level: 5, + message_width: 2, + carry_width: 2, + ciphertext_width: 64, +}; + +pub const MSG2_CARRY2_TUNIFORM: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 887, + glwe_dimension: 1, + polynomial_size: 2048, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev( + 3.553_990_235_944_282_5e-6_f64, + ), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(2.845267479601915e-15_f64), + pbs_base_log: 22, + pbs_level: 1, + ks_base_log: 3, + ks_level: 5, + message_width: 2, + carry_width: 2, + ciphertext_width: 64, +}; + +pub const MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 804, + glwe_dimension: 1, + polynomial_size: 2048, + lwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(5.963_599_673_924_788e-6_f64), + glwe_noise_distribution: HpuNoiseDistributionInput::GaussianStdDev(2.8452674713391114e-15_f64), + pbs_base_log: 23, + pbs_level: 1, + ks_base_log: 2, + ks_level: 8, + message_width: 2, + carry_width: 2, + ciphertext_width: 64, +}; + +pub const MSG2_CARRY2_PFAIL64_132B_TUNIFORM_7E47D8C: HpuPBSParameters = HpuPBSParameters { + lwe_dimension: 839, + glwe_dimension: 1, + polynomial_size: 2048, + lwe_noise_distribution: 
HpuNoiseDistributionInput::TUniformBound(4), + glwe_noise_distribution: HpuNoiseDistributionInput::TUniformBound(17), + pbs_base_log: 23, + pbs_level: 1, + ks_base_log: 2, + ks_level: 7, + message_width: 2, + carry_width: 2, + ciphertext_width: 64, +}; + +impl FromRtl for HpuPBSParameters { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + let pbs_app = regmap + .register() + .get("info::application") + .expect("Unknown register, check regmap definition"); + let pbs_app_val = ffi_hw.read_reg(*pbs_app.offset() as u64); + + // Check register encoding + let field_code = pbs_app_val & (!0xFF_u32); + #[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] + { + if (field_code == 0) && (pbs_app_val == SIMULATION_CODE) { + tracing::warn!("Run an simulation backend with custom SIMU parameters set"); + return ffi_hw.get_pbs_parameters(); + } + } + #[cfg(any(feature = "hw-xrt", feature = "hw-v80"))] + { + assert_eq!( + field_code, APPLICATION_NAME_OFS, + "Invalid register encoding. Check register map definition" + ); + } + + match pbs_app_val & 0xFF { + 0 => CONCRETE_BOOLEAN, + 1 => MSG2_CARRY2, + 2 => { + let mut params = MSG2_CARRY2; + params.lwe_dimension = 2; + params + } + 3 => MSG2_CARRY2_64B, + 4 => MSG2_CARRY2_44B, + 9 => MSG2_CARRY2_64B_FAKE, + 10 => MSG2_CARRY2_GAUSSIAN, + 11 => MSG2_CARRY2_TUNIFORM, + 12 => MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA, + 13 => MSG2_CARRY2_PFAIL64_132B_TUNIFORM_7E47D8C, + _ => panic!("Unknown TfheAppName encoding"), + } + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/rtl/runtime.rs b/backends/tfhe-hpu-backend/src/interface/rtl/runtime.rs new file mode 100644 index 000000000..3e54ebaeb --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/rtl/runtime.rs @@ -0,0 +1,934 @@ +//! +//! Define structure and way to read them from register for all the +//! 
Hpu runtime information +use super::*; + +#[derive(Debug, Default)] +pub struct InfoPePbs { + /// Bpip used + bpip_use: bool, + /// Bpip use opportunism + bpip_use_opportunism: bool, + /// Bpip timeout + bpip_timeout: u32, + + /// PBS current BR-loop + br_loop: u16, + /// PBS current BR-loop parity + br_loop_c: u8, + + /// KS current KS-loop + ks_loop: u16, + /// KS current KS-loop parity + ks_loop_c: u8, + + /// pe_pbs pool_rp + pool_rp: u8, + /// pe_pbs pool_wp + pool_wp: u8, + /// pe_pbs ldg_pt + ldg_pt: u8, + /// pe_pbs ldb_pt + ldb_pt: u8, + + /// pe_pbs ks_in_rp + ks_in_rp: u8, + /// pe_pbs ks_in_wp + ks_in_wp: u8, + /// pe_pbs ks_out_rp + ks_out_rp: u8, + /// pe_pbs ks_out_wp + ks_out_wp: u8, + /// pe_pbs pbs_in_rp + pbs_in_rp: u8, + /// pe_pbs pbs_in_wp + pbs_in_wp: u8, + + /// pe_pbs IPIP flush last pbs_in_loop + ipip_flush_last_pbs_in_loop: u16, + /// pe_pbs BPIP batch that waits the trigger counter (Could be reset by user) + seq_bpip_waiting_batch_cnt: u32, + /// pe_pbs Count of batches filled with a given number of CT (Could be reset by user) + seq_bpip_batch_filling_cnt: [u32; 16], + + /// pe_pbs ack counter (Could be reset by user) + seq_ld_ack_cnt: u32, + + /// pe_pbs not full batch CMUX counter (Could be reset by user) + seq_cmux_not_full_batch_cnt: u32, + + /// pe_pbs BPIP batch counter (Could be reset by user) + seq_bpip_batch_cnt: u32, + /// pe_pbs BPIP batch triggered with a flush counter (Could be reset by user) + seq_bpip_batch_flush_cnt: u32, + /// pe_pbs BPIP batch triggered with a timeout counter (Could be reset by user) + seq_bpip_batch_timeout_cnt: u32, + + /// pe_pbs IPIP flush CMUX counter (Could be reset by user) + seq_ipip_flush_cnt: u32, + /// pe_pbs load BLWE reception max duration (Could be reset by user) + ldb_rcp_dur: u32, + /// pe_pbs load GLWE request max duration (Could be reset by user) + ldg_req_dur: u32, + /// pe_pbs load GLWE reception max duration (Could be reset by user) + ldg_rcp_dur: u32, + /// pe_pbs MMACC SXT 
reception duration (Could be reset by user) + mmacc_sxt_rcp_dur: u32, + + /// pe_pbs MMACC SXT request duration (Could be reset by user) + mmacc_sxt_req_dur: u32, + /// pe_pbs MMACC SXT command without b duration (Could be reset by user) + mmacc_sxt_cmd_wait_b_dur: u32, + + /// PEP input instruction counter (Could be reset by user) + pep_inst_cnt: u32, + /// PEP instruction acknowledge counter (Could be reset by user) + pep_ack_cnt: u32, + + /// pe_pbs load BSK slice reception max duration (Could be reset by user) + load_bsk_rcp_dur: [u32; 16], + /// pe_pbs load KSK slice reception max duration (Could be reset by user) + load_ksk_rcp_dur: [u32; 16], + + /// pe_pbs bsk_if req_br_loop_rp + bskif_req_br_loop_rp: u16, + /// pe_pbs bsk_if req_br_loop_wp + bskif_req_br_loop_wp: u16, + /// pe_pbs bsk_if req_prf_br_loop + bskif_req_prf_br_loop: u16, + /// pe_pbs bsk_if req_parity + bskif_req_parity: u8, + /// pe_pbs bsk_if req_assigned + bskif_req_assigned: u8, +} + +impl FromRtl for InfoPePbs { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Info structure have method to update + // Instead of redefine parsing here, use a default construct and update methods + let mut infos = Self::default(); + infos.update(ffi_hw, regmap); + infos + } +} + +/// Add facilities once created to update/reset some fields +impl InfoPePbs { + pub fn update(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.update_bpip(ffi_hw, regmap); + self.update_loop(ffi_hw, regmap); + self.update_pointer0(ffi_hw, regmap); + self.update_pointer1(ffi_hw, regmap); + self.update_pointer2(ffi_hw, regmap); + self.update_seq_bpip_waiting_batch_cnt(ffi_hw, regmap); + self.update_seq_bpip_batch_filling_cnt(ffi_hw, regmap); + self.update_seq_ld_ack_cnt(ffi_hw, regmap); + self.update_seq_cmux_not_full_batch_cnt(ffi_hw, regmap); + self.update_seq_bpip_batch_cnt(ffi_hw, regmap); + self.update_seq_bpip_batch_flush_cnt(ffi_hw, regmap); + 
self.update_seq_bpip_batch_timeout_cnt(ffi_hw, regmap); + self.update_seq_ipip_flush_cnt(ffi_hw, regmap); + self.update_ldb_rcp_dur(ffi_hw, regmap); + self.update_ldg_req_dur(ffi_hw, regmap); + self.update_ldg_rcp_dur(ffi_hw, regmap); + self.update_mmacc_sxt_rcp_dur(ffi_hw, regmap); + self.update_mmacc_sxt_req_dur(ffi_hw, regmap); + self.update_mmacc_sxt_cmd_wait_b_dur(ffi_hw, regmap); + self.update_pep_inst_cnt(ffi_hw, regmap); + self.update_pep_ack_cnt(ffi_hw, regmap); + self.update_load_bsk_rcp_dur(ffi_hw, regmap); + self.update_load_ksk_rcp_dur(ffi_hw, regmap); + self.update_pep_bskif_req_info_0(ffi_hw, regmap); + self.update_pep_bskif_req_info_1(ffi_hw, regmap); + } + + pub fn update_bpip(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg_use = regmap + .register() + .get("bpip::use") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg_use.offset() as u64); + let fields = reg_use.as_field(val); + self.bpip_use = *fields.get("use_bpip").expect("Unknown field") == 1; + self.bpip_use_opportunism = *fields + .get("use_opportunism") + .expect("Unknown field opportunism") + == 1; + let reg_timeout = regmap + .register() + .get("bpip::timeout") + .expect("Unknown register, check regmap definition"); + self.bpip_timeout = ffi_hw.read_reg(*reg_timeout.offset() as u64); + } + + pub fn update_loop(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_cmux_loop") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg.offset() as u64); + let fields = reg.as_field(val); + self.br_loop = *fields.get("br_loop").expect("Unknown field") as u16; + self.br_loop_c = *fields.get("br_loop_c").expect("Unknown field") as u8; + self.ks_loop = *fields.get("ks_loop").expect("Unknown field") as u16; + self.ks_loop_c = *fields.get("ks_loop_c").expect("Unknown field") as u8; + } + pub fn update_pointer0(&mut self, ffi_hw: &mut ffi::HpuHw, 
regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_pointer_0") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg.offset() as u64); + let fields = reg.as_field(val); + self.pool_rp = *fields.get("pool_rp").expect("Unknown field") as u8; + self.pool_wp = *fields.get("pool_wp").expect("Unknown field") as u8; + self.ldg_pt = *fields.get("ldg_pt").expect("Unknown field") as u8; + self.ldb_pt = *fields.get("ldb_pt").expect("Unknown field") as u8; + } + + pub fn update_pointer1(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_pointer_1") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg.offset() as u64); + let fields = reg.as_field(val); + self.ks_in_rp = *fields.get("ks_in_rp").expect("Unknown field") as u8; + self.ks_in_wp = *fields.get("ks_in_wp").expect("Unknown field") as u8; + self.ks_out_rp = *fields.get("ks_out_rp").expect("Unknown field") as u8; + self.ks_out_wp = *fields.get("ks_out_wp").expect("Unknown field") as u8; + } + + pub fn update_pointer2(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_pointer_2") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg.offset() as u64); + let fields = reg.as_field(val); + self.pbs_in_rp = *fields.get("pbs_in_rp").expect("Unknown field") as u8; + self.pbs_in_wp = *fields.get("pbs_in_wp").expect("Unknown field") as u8; + self.ipip_flush_last_pbs_in_loop = *fields + .get("ipip_flush_last_pbs_in_loop") + .expect("Unknown field") as u16; + } + + pub fn update_seq_bpip_waiting_batch_cnt( + &mut self, + ffi_hw: &mut ffi::HpuHw, + regmap: &FlatRegmap, + ) { + let reg = regmap + .register() + .get("runtime_1in3::pep_seq_bpip_waiting_batch_cnt") + .expect("Unknown register, check regmap definition"); + self.seq_bpip_waiting_batch_cnt = 
ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `seq_bpip_batch_filling_cnt[i]` for `i` in `1..16` from the
    /// `runtime_1in3::pep_seq_bpip_batch_filling_cnt_{i}` registers.
    ///
    /// Panics if one of these registers is missing from `regmap`.
    pub fn update_seq_bpip_batch_filling_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        // NOTE(review): the range starts at 1, so slot 0 of the array is never
        // refreshed -- presumably no `_0` register exists; confirm against the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_1in3::pep_seq_bpip_batch_filling_cnt_{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            self.seq_bpip_batch_filling_cnt[i] = ffi_hw.read_reg(*reg.offset() as u64)
        });
    }

    /// Refresh `seq_ld_ack_cnt` from `runtime_1in3::pep_seq_ld_ack_cnt`.
    pub fn update_seq_ld_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_ld_ack_cnt")
            .expect("Unknown register, check regmap definition");
        self.seq_ld_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `seq_cmux_not_full_batch_cnt` from
    /// `runtime_1in3::pep_seq_cmux_not_full_batch_cnt`.
    pub fn update_seq_cmux_not_full_batch_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_cmux_not_full_batch_cnt")
            .expect("Unknown register, check regmap definition");
        self.seq_cmux_not_full_batch_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `seq_bpip_batch_cnt` from `runtime_1in3::pep_seq_bpip_batch_cnt`.
    pub fn update_seq_bpip_batch_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_cnt")
            .expect("Unknown register, check regmap definition");
        self.seq_bpip_batch_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }
    /// Refresh `seq_bpip_batch_flush_cnt` from
    /// `runtime_1in3::pep_seq_bpip_batch_flush_cnt`.
    pub fn update_seq_bpip_batch_flush_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_flush_cnt")
            .expect("Unknown register, check regmap definition");
        self.seq_bpip_batch_flush_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }
    /// Refresh `seq_bpip_batch_timeout_cnt` from
    /// `runtime_1in3::pep_seq_bpip_batch_timeout_cnt`.
    pub fn update_seq_bpip_batch_timeout_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_timeout_cnt")
            .expect("Unknown register, check regmap definition");
self.seq_bpip_batch_timeout_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + + pub fn update_seq_ipip_flush_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_seq_ipip_flush_cnt") + .expect("Unknown register, check regmap definition"); + self.seq_ipip_flush_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + + pub fn update_ldb_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldb_rcp_dur") + .expect("Unknown register, check regmap definition"); + self.ldb_rcp_dur = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_ldg_req_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldg_req_dur") + .expect("Unknown register, check regmap definition"); + self.ldg_req_dur = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_ldg_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldg_rcp_dur") + .expect("Unknown register, check regmap definition"); + self.ldg_rcp_dur = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_mmacc_sxt_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_mmacc_sxt_rcp_dur") + .expect("Unknown register, check regmap definition"); + self.mmacc_sxt_rcp_dur = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_mmacc_sxt_req_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_mmacc_sxt_req_dur") + .expect("Unknown register, check regmap definition"); + self.mmacc_sxt_req_dur = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_mmacc_sxt_cmd_wait_b_dur( + &mut self, + ffi_hw: &mut ffi::HpuHw, + regmap: &FlatRegmap, + ) { + let reg = regmap + .register() + 
.get("runtime_1in3::pep_mmacc_sxt_cmd_wait_b_dur")
            .expect("Unknown register, check regmap definition");
        self.mmacc_sxt_cmd_wait_b_dur = ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `pep_inst_cnt` from `runtime_1in3::pep_inst_cnt`.
    pub fn update_pep_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_inst_cnt")
            .expect("Unknown register, check regmap definition");
        self.pep_inst_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `load_bsk_rcp_dur[i]` for `i` in `1..16` from the
    /// `runtime_3in3::pep_load_bsk_rcp_dur_pc{i}` registers.
    ///
    /// Panics if one of these registers is missing from `regmap`.
    pub fn update_load_bsk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        // NOTE(review): the range starts at 1, so `load_bsk_rcp_dur[0]` (pc0) is never
        // refreshed -- confirm whether a `_pc0` register exists in the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_3in3::pep_load_bsk_rcp_dur_pc{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            self.load_bsk_rcp_dur[i] = ffi_hw.read_reg(*reg.offset() as u64)
        });
    }
    /// Refresh `load_ksk_rcp_dur[i]` for `i` in `1..16` from the
    /// `runtime_1in3::pep_load_ksk_rcp_dur_pc{i}` registers.
    ///
    /// Panics if one of these registers is missing from `regmap`.
    pub fn update_load_ksk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        // NOTE(review): the range starts at 1, so `load_ksk_rcp_dur[0]` (pc0) is never
        // refreshed -- confirm whether a `_pc0` register exists in the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_1in3::pep_load_ksk_rcp_dur_pc{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            self.load_ksk_rcp_dur[i] = ffi_hw.read_reg(*reg.offset() as u64)
        });
    }

    /// Refresh `pep_ack_cnt` from `runtime_1in3::pep_ack_cnt`.
    pub fn update_pep_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_ack_cnt")
            .expect("Unknown register, check regmap definition");
        self.pep_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64);
    }

    /// Refresh `bskif_req_br_loop_rp` / `bskif_req_br_loop_wp` from the fields of
    /// `runtime_3in3::pep_bskif_req_info_0`.
    pub fn update_pep_bskif_req_info_0(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_3in3::pep_bskif_req_info_0")
            .expect("Unknown register, check regmap definition");
        let val = ffi_hw.read_reg(*reg.offset() as u64);
        let fields = reg.as_field(val);
        self.bskif_req_br_loop_rp = *fields.get("req_br_loop_rp").expect("Unknown field") as u16;
        self.bskif_req_br_loop_wp = *fields.get("req_br_loop_wp").expect("Unknown field") as u16;
    }

    pub fn
update_pep_bskif_req_info_1(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_3in3::pep_bskif_req_info_1") + .expect("Unknown register, check regmap definition"); + let val = ffi_hw.read_reg(*reg.offset() as u64); + let fields = reg.as_field(val); + self.bskif_req_prf_br_loop = *fields.get("req_prf_br_loop").expect("Unknown field") as u16; + self.bskif_req_parity = *fields.get("req_parity").expect("Unknown field") as u8; + self.bskif_req_assigned = *fields.get("req_assigned").expect("Unknown field") as u8; + } + + #[allow(unused)] + pub fn reset(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.reset_seq_bpip_waiting_batch_cnt(ffi_hw, regmap); + self.reset_seq_bpip_batch_filling_cnt(ffi_hw, regmap); + self.reset_seq_ld_ack_cnt(ffi_hw, regmap); + self.reset_seq_cmux_not_full_batch_cnt(ffi_hw, regmap); + self.reset_seq_bpip_batch_cnt(ffi_hw, regmap); + self.reset_seq_bpip_batch_flush_cnt(ffi_hw, regmap); + self.reset_seq_bpip_batch_timeout_cnt(ffi_hw, regmap); + self.reset_seq_ipip_flush_cnt(ffi_hw, regmap); + self.reset_ldb_rcp_dur(ffi_hw, regmap); + self.reset_ldg_req_dur(ffi_hw, regmap); + self.reset_ldg_rcp_dur(ffi_hw, regmap); + self.reset_mmacc_sxt_rcp_dur(ffi_hw, regmap); + self.reset_mmacc_sxt_req_dur(ffi_hw, regmap); + self.reset_mmacc_sxt_cmd_wait_b_dur(ffi_hw, regmap); + self.reset_pep_inst_cnt(ffi_hw, regmap); + self.reset_pep_ack_cnt(ffi_hw, regmap); + self.reset_load_bsk_rcp_dur(ffi_hw, regmap); + self.reset_load_ksk_rcp_dur(ffi_hw, regmap); + } + #[allow(unused)] + pub fn reset_seq_bpip_waiting_batch_cnt( + &mut self, + ffi_hw: &mut ffi::HpuHw, + regmap: &FlatRegmap, + ) { + let reg = regmap + .register() + .get("runtime_1in3::pep_seq_bpip_waiting_batch_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_seq_bpip_batch_filling_cnt( + &mut self, + ffi_hw: &mut ffi::HpuHw, + regmap: 
&FlatRegmap,
    ) {
        // Write 0 to each `runtime_1in3::pep_seq_bpip_batch_filling_cnt_{i}` register.
        // NOTE(review): the range starts at 1, mirroring the matching update method;
        // confirm that no `_0` register exists in the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_1in3::pep_seq_bpip_batch_filling_cnt_{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            ffi_hw.write_reg(*reg.offset() as u64, 0)
        });
    }
    /// Write 0 to `runtime_1in3::pep_seq_ld_ack_cnt`.
    #[allow(unused)]
    pub fn reset_seq_ld_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_ld_ack_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }

    /// Write 0 to `runtime_1in3::pep_seq_cmux_not_full_batch_cnt`.
    #[allow(unused)]
    pub fn reset_seq_cmux_not_full_batch_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_cmux_not_full_batch_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }

    /// Write 0 to `runtime_1in3::pep_seq_bpip_batch_cnt`.
    #[allow(unused)]
    pub fn reset_seq_bpip_batch_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }
    /// Write 0 to `runtime_1in3::pep_seq_bpip_batch_flush_cnt`.
    #[allow(unused)]
    pub fn reset_seq_bpip_batch_flush_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_flush_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }
    /// Write 0 to `runtime_1in3::pep_seq_bpip_batch_timeout_cnt`.
    #[allow(unused)]
    pub fn reset_seq_bpip_batch_timeout_cnt(
        &mut self,
        ffi_hw: &mut ffi::HpuHw,
        regmap: &FlatRegmap,
    ) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_seq_bpip_batch_timeout_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }
    /// Write 0 to `runtime_1in3::pep_seq_ipip_flush_cnt`.
    #[allow(unused)]
    pub fn reset_seq_ipip_flush_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
.get("runtime_1in3::pep_seq_ipip_flush_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_ldb_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldb_rcp_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_ldg_req_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldg_req_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_ldg_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_ldg_rcp_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_mmacc_sxt_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_mmacc_sxt_rcp_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_mmacc_sxt_req_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_mmacc_sxt_req_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_mmacc_sxt_cmd_wait_b_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pep_mmacc_sxt_cmd_wait_b_dur") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + + #[allow(unused)] + pub fn reset_pep_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: 
&FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_inst_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }
    /// Write 0 to `runtime_1in3::pep_ack_cnt`.
    #[allow(unused)]
    pub fn reset_pep_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        let reg = regmap
            .register()
            .get("runtime_1in3::pep_ack_cnt")
            .expect("Unknown register, check regmap definition");
        ffi_hw.write_reg(*reg.offset() as u64, 0);
    }

    /// Write 0 to each `runtime_3in3::pep_load_bsk_rcp_dur_pc{i}` register, `i` in `1..16`.
    ///
    /// Panics if one of these registers is missing from `regmap`.
    #[allow(unused)]
    pub fn reset_load_bsk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        // NOTE(review): the range starts at 1, so a `_pc0` register (if any) is never
        // reset -- confirm against the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_3in3::pep_load_bsk_rcp_dur_pc{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            ffi_hw.write_reg(*reg.offset() as u64, 0);
        });
    }
    /// Write 0 to each `runtime_1in3::pep_load_ksk_rcp_dur_pc{i}` register, `i` in `1..16`.
    ///
    /// Panics if one of these registers is missing from `regmap`.
    #[allow(unused)]
    pub fn reset_load_ksk_rcp_dur(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) {
        // NOTE(review): the range starts at 1, so a `_pc0` register (if any) is never
        // reset -- confirm against the regmap.
        (1..16).for_each(|i| {
            let reg_name = format!("runtime_1in3::pep_load_ksk_rcp_dur_pc{i}");
            let reg = regmap
                .register()
                .get(&reg_name)
                .expect("Unknown register, check regmap definition");
            ffi_hw.write_reg(*reg.offset() as u64, 0);
        });
    }
}

/// One address / data capture: a 64-bit address and four 32-bit data words.
#[derive(Default)]
pub struct PeMemInfo {
    addr: u64,
    data: [u32; 4],
}
impl std::fmt::Debug for PeMemInfo {
    /// Print the address in hex and each data word as zero-padded 8-digit hex.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{{addr: {:x}, data: {:0>8x?}}}", self.addr, self.data)
    }
}

#[derive(Debug, Default)]
pub struct InfoPeMem {
    /// PEM load input instruction counter (Could be reset by user)
    pem_load_inst_cnt: u32,
    /// PEM load instruction acknowledge counter (Could be reset by user)
    pem_load_ack_cnt: u32,
    /// PEM store input instruction counter (Could be reset by user)
    pem_store_inst_cnt: u32,
    /// PEM store instruction acknowledge counter (Could be reset by user)
    pem_store_ack_cnt: u32,
    /// PEM load first addr/data
    pem_ld_info: [PeMemInfo; 2],
}
impl FromRtl for InfoPeMem {
fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Info structure have method to update + // Instead of redefine parsing here, use a default construct and update methods + let mut infos = Self::default(); + infos.update(ffi_hw, regmap); + infos + } +} + +/// Add facilities once created to update/reset some fields +impl InfoPeMem { + pub fn update(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.update_pem_load_inst_cnt(ffi_hw, regmap); + self.update_pem_load_ack_cnt(ffi_hw, regmap); + self.update_pem_store_inst_cnt(ffi_hw, regmap); + self.update_pem_store_ack_cnt(ffi_hw, regmap); + self.update_pem_ld_info(ffi_hw, regmap, 0); + self.update_pem_ld_info(ffi_hw, regmap, 1); + } + + pub fn update_pem_load_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_load_inst_cnt") + .expect("Unknown register, check regmap definition"); + self.pem_load_inst_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_pem_load_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_load_ack_cnt") + .expect("Unknown register, check regmap definition"); + self.pem_load_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_pem_store_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_store_inst_cnt") + .expect("Unknown register, check regmap definition"); + self.pem_store_inst_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_pem_store_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_store_ack_cnt") + .expect("Unknown register, check regmap definition"); + self.pem_store_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_pem_ld_info( + &mut self, + ffi_hw: &mut ffi::HpuHw, + regmap: &FlatRegmap, + 
pc_idx: usize, + ) { + // Update addr field + self.pem_ld_info[pc_idx].addr = ["msb", "lsb"] + .iter() + .map(|n| { + let reg_name = format!("runtime_1in3::pem_load_info_1_pc{pc_idx}_{n}"); + let reg = regmap + .register() + .get(®_name) + .expect("Unknown register, check regmap definition"); + ffi_hw.read_reg(*reg.offset() as u64) + }) + .fold(0_u64, |acc, v| (acc << u32::BITS) + v as u64); + + // Update value field + (0..4).for_each(|i| { + let reg_name = format!("runtime_1in3::pem_load_info_0_pc{pc_idx}_{i}"); + let reg = regmap + .register() + .get(®_name) + .expect("Unknown register, check regmap definition"); + self.pem_ld_info[pc_idx].data[i] = ffi_hw.read_reg(*reg.offset() as u64); + }); + } + + #[allow(unused)] + pub fn reset(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.reset_pem_load_inst_cnt(ffi_hw, regmap); + self.reset_pem_load_ack_cnt(ffi_hw, regmap); + self.reset_pem_store_inst_cnt(ffi_hw, regmap); + self.reset_pem_store_ack_cnt(ffi_hw, regmap); + } + #[allow(unused)] + pub fn reset_pem_load_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_load_inst_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_pem_load_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_load_ack_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_pem_store_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pem_store_inst_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn reset_pem_store_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = 
regmap + .register() + .get("runtime_1in3::pem_store_ack_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } +} +#[derive(Debug, Default)] +pub struct InfoPeAlu { + /// PEA input instruction counter (Could be reset by user) + pea_inst_cnt: u32, + /// PEA instruction acknowledge counter (Could be reset by user) + pea_ack_cnt: u32, +} +impl FromRtl for InfoPeAlu { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Info structure have method to update + // Instead of redefine parsing here, use a default construct and update methods + let mut infos = Self::default(); + infos.update(ffi_hw, regmap); + infos + } +} + +/// Add facilities once created to update/reset some fields +impl InfoPeAlu { + pub fn update(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.update_pea_inst_cnt(ffi_hw, regmap); + self.update_pea_ack_cnt(ffi_hw, regmap); + } + + pub fn update_pea_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pea_inst_cnt") + .expect("Unknown register, check regmap definition"); + self.pea_inst_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_pea_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pea_ack_cnt") + .expect("Unknown register, check regmap definition"); + self.pea_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + #[allow(unused)] + pub fn reset(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.reset_pea_inst_cnt(ffi_hw, regmap); + self.reset_pea_ack_cnt(ffi_hw, regmap); + } + #[allow(unused)] + pub fn reset_pea_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pea_inst_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + #[allow(unused)] + pub fn 
reset_pea_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::pea_ack_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } +} + +#[derive(Default)] +pub struct InfoIsc { + /// ISC input instruction counter (Could be reset by user) + isc_inst_cnt: u32, + /// ISC instruction acknowledge sample counter (Could be reset by user) + isc_ack_cnt: u32, + + /// ISC 4 latest instructions received ([0] is the most recent) + isc_info: [u32; 4], +} + +impl FromRtl for InfoIsc { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Info structure have method to update + // Instead of redefine parsing here, use a default construct and update methods + let mut infos = Self::default(); + infos.update(ffi_hw, regmap); + infos + } +} + +impl std::fmt::Debug for InfoIsc { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{{isc_inst_cnt: {}, isc_ack_cnt: {}, isc_info: {:x?}}}", + self.isc_inst_cnt, self.isc_ack_cnt, self.isc_info + ) + } +} + +/// Add facilities once created to update/reset some fields +impl InfoIsc { + pub fn update(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.update_isc_inst_cnt(ffi_hw, regmap); + self.update_isc_ack_cnt(ffi_hw, regmap); + self.update_isc_info(ffi_hw, regmap); + } + + pub fn update_isc_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::isc_inst_cnt") + .expect("Unknown register, check regmap definition"); + self.isc_inst_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + pub fn update_isc_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::isc_ack_cnt") + .expect("Unknown register, check regmap definition"); + self.isc_ack_cnt = ffi_hw.read_reg(*reg.offset() as u64); + } + + pub fn update_isc_info(&mut 
self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + for idx in 0..4 { + let name = format!("runtime_1in3::isc_latest_instruction_{idx}"); + let reg = regmap + .register() + .get(&name) + .expect("Unknown register, check regmap definition"); + self.isc_info[idx] = ffi_hw.read_reg(*reg.offset() as u64); + } + } + + #[allow(unused)] + pub fn reset(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.reset_isc_inst_cnt(ffi_hw, regmap); + self.reset_isc_ack_cnt(ffi_hw, regmap); + } + + #[allow(unused)] + pub fn reset_isc_inst_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::isc_inst_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } + + #[allow(unused)] + pub fn reset_isc_ack_cnt(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("runtime_1in3::isc_ack_cnt") + .expect("Unknown register, check regmap definition"); + ffi_hw.write_reg(*reg.offset() as u64, 0); + } +} + +#[derive(Debug, Default)] +pub struct ErrorHpu { + error_1in3: u32, + error_3in3: u32, +} +impl FromRtl for ErrorHpu { + fn from_rtl(ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) -> Self { + // Info structure have method to update + // Instead of redefine parsing here, use a default construct and update methods + let mut infos = Self::default(); + infos.update(ffi_hw, regmap); + infos + } +} + +impl ErrorHpu { + pub fn update(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + self.update_error_1in3(ffi_hw, regmap); + self.update_error_3in3(ffi_hw, regmap); + } + + pub fn update_error_1in3(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: &FlatRegmap) { + let reg = regmap + .register() + .get("status_1in3::error") + .expect("Unknown register, check regmap definition"); + self.error_1in3 = ffi_hw.read_reg(*reg.offset() as u64); + } + + pub fn update_error_3in3(&mut self, ffi_hw: &mut ffi::HpuHw, regmap: 
&FlatRegmap) { + let reg = regmap + .register() + .get("status_3in3::error") + .expect("Unknown register, check regmap definition"); + self.error_3in3 = ffi_hw.read_reg(*reg.offset() as u64); + } +} diff --git a/backends/tfhe-hpu-backend/src/interface/variable.rs b/backends/tfhe-hpu-backend/src/interface/variable.rs new file mode 100644 index 000000000..bc23ce4dd --- /dev/null +++ b/backends/tfhe-hpu-backend/src/interface/variable.rs @@ -0,0 +1,280 @@ +//! +//! Abstraction over Hpu ciphertext data +//! Handle lifetime management, deallocation and state inside HpuDevice. +use super::*; +use crate::asm::iop::VarMode; +use crate::entities::{HpuLweCiphertextOwned, HpuParameters}; +use crate::ffi; +use std::sync::{mpsc, Arc, Mutex}; + +#[derive(Debug)] +enum SyncState { + None, + CpuSync, + HpuSync, + BothSync, +} + +pub(crate) struct HpuVar { + bundle: memory::CiphertextBundle, + state: SyncState, + pending: usize, +} + +impl std::fmt::Debug for HpuVar { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "HpuVar<{{state: {:?}, bundle: {:?}}}", + self.state, self.bundle + ) + } +} + +/// Handle sync between Hpu and Cpu +impl HpuVar { + pub fn try_cpu_sync(&mut self) -> Result<(), HpuInternalError> { + if self.pending > 0 { + Err(HpuInternalError::OperationPending) + } else { + match self.state { + SyncState::CpuSync | SyncState::BothSync => Ok(()), + SyncState::HpuSync => { + for slot in self.bundle.iter_mut() { + slot.mz + .iter_mut() + .for_each(|mz| mz.sync(ffi::SyncMode::Device2Host)); + } + self.state = SyncState::BothSync; + Ok(()) + } + SyncState::None => Err(HpuInternalError::UninitData), + } + } + } + + pub(crate) fn try_hpu_sync(&mut self) -> Result<(), HpuInternalError> { + // Nb: synced on hpu could be achieved even with registered pending IOp + // Indeed, this is used for assign IOp since dst == src. 
+ match self.state { + SyncState::None => { + if self.pending > 0 { + Ok(()) // Use of future result + } else { + Err(HpuInternalError::UninitData) + } + } + SyncState::HpuSync | SyncState::BothSync => Ok(()), + SyncState::CpuSync => { + for slot in self.bundle.iter_mut() { + slot.mz + .iter_mut() + .for_each(|mz| mz.sync(ffi::SyncMode::Host2Device)); + } + self.state = if self.pending > 0 { + SyncState::HpuSync + } else { + SyncState::BothSync + }; + Ok(()) + } + } + } +} + +impl HpuVar { + pub(crate) fn operation_pending(&mut self) { + self.pending += 1; + } + pub(crate) fn operation_done(&mut self) { + if self.pending > 0 { + self.pending -= 1; + self.state = SyncState::HpuSync; + } else { + panic!("`operation_done` called on variable without pending operations"); + } + } +} + +#[derive(Clone)] +pub struct HpuVarWrapped { + pub(crate) inner: Arc>, + pub(crate) id: memory::ciphertext::SlotId, + /// Reference to associated ct pool + pub(crate) pool: memory::CiphertextMemory, + /// Way to push cmd inside the backend without need of locking + pub(crate) cmd_api: mpsc::Sender, + pub(crate) params: Arc, + pub(crate) width: usize, + pub(crate) mode: VarMode, +} + +impl std::fmt::Debug for HpuVarWrapped { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "HpuVarWrapped{{ {:?} }}", self.id) + } +} + +/// Conversion function between inner type and HpuLweCiphertext +impl HpuVarWrapped { + fn new_in( + pool: memory::CiphertextMemory, + cmd_api: mpsc::Sender, + params: Arc, + width: usize, + mode: VarMode, + ) -> Self { + let bundle = pool.get_bundle(width); + + Self { + id: *bundle.id(), + pool, + cmd_api, + params, + width, + mode, + inner: Arc::new(Mutex::new(HpuVar { + bundle, + state: SyncState::None, + pending: 0, + })), + } + } + + pub(crate) fn new_from( + pool: memory::CiphertextMemory, + cmd_api: mpsc::Sender, + params: Arc, + ct: Vec>, + mode: VarMode, + ) -> Self { + let var = Self::new_in(pool, cmd_api, params, ct.len(), mode); + 
+ // Write cpu_ct with correct interleaving in host buffer + // TODO check perf of mmap vs write + // Now value is considered CpuSync (i.e. data valid only on cpu-side) + { + let mut inner = var.inner.lock().unwrap(); + + for (slot, ct) in std::iter::zip(inner.bundle.iter_mut(), ct.into_iter()) { + #[cfg(feature = "io-dump")] + let params = ct.params().clone(); + for (id, cut) in ct.into_container().iter().enumerate() { + slot.mz[id].write(0, cut); + #[cfg(feature = "io-dump")] + io_dump::dump( + &cut.as_slice(), + ¶ms, + io_dump::DumpKind::BlweIn, + io_dump::DumpId::Slot(slot.id, id), + ); + } + } + inner.state = SyncState::CpuSync; + } + var + } + + /// Create a new HpuVarWrapped with same properties + /// Associated data is != only share properties + pub(crate) fn fork(&self, trgt_mode: VarMode) -> Self { + let Self { + pool, + cmd_api, + params, + width, + mode, + .. + } = self.clone(); + + let width = match (&mode, &trgt_mode) { + (_, VarMode::Bool) => 1, + (VarMode::Native, VarMode::Native) => width, + (VarMode::Native, VarMode::Half) => width / 2, + (VarMode::Half, VarMode::Native) => 2 * width, + (VarMode::Half, VarMode::Half) => width, + _ => panic!("Unsupported mode, couldn't use a Boolean to build a bigger variable"), + }; + Self::new_in(pool, cmd_api, params, width, trgt_mode) + } + + pub fn try_into(self) -> Result>, HpuError> { + // Check if value is available + let mut inner = self.inner.lock().unwrap(); + match inner.try_cpu_sync() { + Ok(_) => {} + Err(err) => { + drop(inner); + match err { + HpuInternalError::OperationPending => return Err(HpuError::SyncPending(self)), + HpuInternalError::UninitData => { + panic!("Encounter unrecoverable HpuInternalError: {err:?}") + } + } + } + } + + let mut ct = Vec::new(); + + for slot in inner.bundle.iter() { + // Allocate HpuLwe + // and view inner buffer as cut + let mut hpu_lwe = HpuLweCiphertextOwned::::new(0, (*self.params).clone()); + let mut hw_slice = hpu_lwe.as_mut_view().into_container(); + + // Copy 
from Xrt memory + #[allow(unused_variables)] + std::iter::zip(slot.mz.iter(), hw_slice.iter_mut()) + .enumerate() + .for_each(|(id, (mz, cut))| { + mz.read(0, cut); + #[cfg(feature = "io-dump")] + io_dump::dump( + &cut.as_ref(), + &self.params, + io_dump::DumpKind::BlweOut, + io_dump::DumpId::Slot(slot.id, id), + ); + }); + ct.push(hpu_lwe); + } + + Ok(ct) + } + + /// Retrieved a vector of HpuLweCiphertext from a Hpu variable + /// Blocking call that pool the Hpu Backend until variable is ready + pub fn into_ct(self) -> Vec> { + // TODO Replace pooling with IRQ when supported by the backend + let mut var = self; + loop { + var = match var.try_into() { + Ok(ct) => break ct, + Err(err) => match err { + HpuError::SyncPending(v) => v, + }, + } + } + } + + /// Wait end of pending operation and synced on Cpu side + /// Blocking call that pool the Hpu Backend until variable is ready + pub fn wait(&self) { + loop { + match self.inner.lock().unwrap().try_cpu_sync() { + Ok(_) => break, + Err(err) => match err { + HpuInternalError::OperationPending => {} + HpuInternalError::UninitData => { + panic!("Encounter unrecoverable HpuInternalError: {err:?}") + } + }, + } + } + } + + /// Check if inner value depicts a boolean + pub fn is_boolean(&self) -> bool { + self.mode == VarMode::Bool + } +} diff --git a/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs b/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs new file mode 100644 index 000000000..4902a6450 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/isc_trace/fmt.rs @@ -0,0 +1,165 @@ +use super::packed_struct::{Len, NoMoreBits, PackedStructLsb}; +use crate::asm::dop::DOp; +use bitvec::prelude::*; +use serde::Serialize; +use std::error::Error; +use std::fmt::Display; + +// TODO: We need to have some kind of trace versioning system to be able to +// retroactively support traces coming from different hardware versions + +pub static TRACE_W: usize = 16; + +#[derive(Debug, Serialize, PartialEq, Eq)] +pub enum IscQueryCmd { + NONE, + 
RDUNLOCK, + RETIRE, + REFILL, + ISSUE, +} + +#[derive(Debug)] +struct BadCmd; + +impl Display for BadCmd { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str("No such command") + } +} + +impl Error for BadCmd { + fn description(&self) -> &str { + "No such command" + } +} + +impl Len for IscQueryCmd { + fn len() -> usize { + 3 + } +} + +impl PackedStructLsb for IscQueryCmd +where + O: bitvec::store::BitStore, +{ + fn from_bit_slice_le(slice: &BitSlice) -> Result> { + let bits = slice.get(0..3).ok_or(NoMoreBits)?.load::(); + match bits { + 0 => Ok(IscQueryCmd::NONE), + 1 => Ok(IscQueryCmd::RDUNLOCK), + 2 => Ok(IscQueryCmd::RETIRE), + 3 => Ok(IscQueryCmd::REFILL), + 4 => Ok(IscQueryCmd::ISSUE), + _ => Err(BadCmd.into()), + } + } +} + +#[derive(Debug, Serialize)] +pub struct IscPoolState { + pub(super) pdg: bool, + pub(super) rd_pdg: bool, + pub(super) vld: bool, + pub(super) wr_lock: u32, + pub(super) rd_lock: u32, + //pub(super) issue_lock: u32, + pub(super) sync_id: u32, +} + +impl Len for IscPoolState { + fn len() -> usize { + 21 + } +} + +impl PackedStructLsb for IscPoolState +where + O: bitvec::store::BitStore, +{ + fn from_bit_slice_le(slice: &BitSlice) -> Result> { + Ok(IscPoolState { + pdg: *(slice.get(0).ok_or(NoMoreBits)?), + rd_pdg: *(slice.get(1).ok_or(NoMoreBits)?), + vld: *(slice.get(2).ok_or(NoMoreBits)?), + wr_lock: slice.get(3..10).ok_or(NoMoreBits)?.load::(), + rd_lock: slice.get(10..17).ok_or(NoMoreBits)?.load::(), + //issue_lock: slice.get(17..24).ok_or(NoMoreBits)?.load::(), + sync_id: slice.get(17..21).ok_or(NoMoreBits)?.load::(), + }) + } +} + +#[derive(Debug, Serialize)] +pub struct IscTrace { + pub(super) state: IscPoolState, + pub(super) cmd: IscQueryCmd, + pub(super) insn: Option, + pub(super) insn_asm: Option, + pub(super) timestamp: u32, +} + +impl PackedStructLsb for IscTrace +where + O: bitvec::store::BitStore, +{ + fn from_bit_slice_le(slice: &BitSlice) -> Result> { + let lwe_k_w = 10; + let slice = 
slice.get(lwe_k_w..).ok_or(NoMoreBits)?; + + let state = IscPoolState::from_bit_slice_le(slice)?; + let slice = slice.get(IscPoolState::len()..).ok_or(NoMoreBits)?; + + let cmd = IscQueryCmd::from_bit_slice_le(slice)?; + let slice = slice.get(IscQueryCmd::len()..).ok_or(NoMoreBits)?; + + let insn = match cmd { + IscQueryCmd::REFILL | IscQueryCmd::NONE => None, + _ => { + let insn = u32::from_bit_slice_le(slice)?; + let dop = DOp::from_hex(insn)?; + Some(dop) + } + }; + + let slice = slice.get(u32::len()..).ok_or(NoMoreBits)?; + let timestamp = u32::from_bit_slice_le(slice)?; + let insn_asm = insn.as_ref().map(|dop| format!("{dop}")); + + Ok(IscTrace { + state, + cmd, + insn, + insn_asm, + timestamp, + }) + } +} + +impl Len for IscTrace { + fn len() -> usize { + TRACE_W + } +} + +#[derive(Serialize, Debug)] +pub struct IscTraceStream(pub(super) Vec); + +impl IscTraceStream { + pub fn sort(&mut self) { + self.0.sort_by_key(|k| k.timestamp) + } + + pub fn from_bytes(bytes: &[u8]) -> IscTraceStream { + let view = bytes.view_bits::(); + IscTraceStream( + view.chunks(TRACE_W * 8) + .filter_map(|c| IscTrace::from_bit_slice_le(c).ok()) + .collect(), + ) + } +} + +#[cfg(test)] +mod test; diff --git a/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/data.rs b/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/data.rs new file mode 100644 index 000000000..e0fb30e13 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/data.rs @@ -0,0 +1,205 @@ +pub(super) static V80_TEST_DATA: [u16; 856] = [ + 0x0000, 0x8000, 0x0001, 0x0000, 0x4bd4, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0002, 0x0000, + 0x4bf2, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x4c3c, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x0806, 0x0000, 0x4c5a, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x4ca4, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x4d0c, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x4d74, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 
0x0001, 0x0000, + 0x4ddc, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x4e44, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x0216, 0x0000, 0x4e62, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x4eac, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0a1a, 0x0000, 0x4eca, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x4f14, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x4f7c, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x4fe4, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x504c, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x50b4, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x511c, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x042a, 0x0000, 0x513a, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x5184, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0c2e, 0x0000, 0x51a2, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x51ec, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x5254, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0000, 0x0000, 0x528a, 0x3524, 0x0000, 0x0000, + 0x1400, 0x0000, 0x0001, 0x0000, 0x52d6, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x5320, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x5388, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x53f0, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x5458, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x063e, 0x0000, 0x5476, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x54c0, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0e42, 0x0000, + 0x54de, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x5528, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x5590, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, + 0x55f8, 0x3524, 0x0000, 0x0000, 0x0000, 0x8000, 0x0001, 0x0000, 0x5660, 0x3524, 0x0000, 0x0000, + 0x0000, 0x8000, 0x0001, 0x0000, 0x56b0, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0804, 0x0000, + 0x59a2, 0x3524, 
0x0000, 0x0000, 0x1400, 0x0000, 0x0805, 0x0000, 0x59ee, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x000a, 0x1001, 0x5a08, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0214, 0x0000, + 0x5e4a, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0215, 0x0000, 0x5e96, 0x3524, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x0a18, 0x0000, 0x62f2, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0a19, 0x0000, + 0x633e, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0a1e, 0x1006, 0x6358, 0x3524, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x0428, 0x0000, 0x67ee, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0429, 0x0000, + 0x683a, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0c2c, 0x0000, 0x6efe, 0x3524, 0x0000, 0x0000, + 0x1400, 0x0000, 0x0c2d, 0x0000, 0x6f4a, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x1432, 0x100b, + 0x6f64, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x063c, 0x0000, 0x73c2, 0x3524, 0x0000, 0x0000, + 0x1400, 0x0000, 0x063d, 0x0000, 0x740e, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0e40, 0x0000, + 0x7912, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0e41, 0x0000, 0x795e, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x1e46, 0x1010, 0x7978, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0008, 0x1001, + 0x7b34, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0009, 0x1001, 0x7b80, 0x3524, 0x0000, 0x0000, + 0x1800, 0x0000, 0x040e, 0x0003, 0x7b9f, 0x3524, 0x0000, 0x0000, 0x1800, 0x0000, 0x0412, 0x0001, + 0x7bbb, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x040c, 0x0003, 0x7d23, 0x3524, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x0410, 0x0001, 0x7e37, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0a1c, 0x1006, + 0x9bb8, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x0a1d, 0x1006, 0x9c04, 0x3524, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x1430, 0x100b, 0xbc44, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x1431, 0x100b, + 0xbc90, 0x3524, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1e44, 0x1010, 0xdcb4, 0x3524, 0x0000, 0x0000, + 0x1400, 0x0000, 0x1e45, 0x1010, 0xdd00, 0x3524, 0x0000, 0x0000, 0x1400, 0x0000, 0x040d, 0x0003, + 0xadd7, 0x3540, 0x0000, 0x0000, 0x1800, 0x0000, 0x0e1e, 
0x1003, 0xadf0, 0x3540, 0x0000, 0x0000, + 0x1400, 0x0000, 0x0411, 0x0001, 0xaf1f, 0x3540, 0x0000, 0x0000, 0x1800, 0x0000, 0x1012, 0x1000, + 0xaf3a, 0x3540, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1010, 0x1000, 0xb5fe, 0x3540, 0x0000, 0x0000, + 0x1400, 0x0000, 0x1011, 0x1000, 0xb64a, 0x3540, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0e1c, 0x1003, + 0xcf34, 0x3540, 0x0000, 0x0000, 0x1400, 0x0000, 0x0e1d, 0x1003, 0xcf80, 0x3540, 0x0000, 0x0000, + 0x1800, 0x0000, 0x0e22, 0x0003, 0xcf9f, 0x3540, 0x0000, 0x0000, 0x1800, 0x0000, 0x0e26, 0x0001, + 0xcfbb, 0x3540, 0x0000, 0x0000, 0x1c00, 0x8000, 0x0e20, 0x0003, 0xd123, 0x3540, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x0e24, 0x0001, 0xd21b, 0x3540, 0x0000, 0x0000, 0x1400, 0x0000, 0x0e21, 0x0003, + 0x2f37, 0x355c, 0x0000, 0x0000, 0x1800, 0x0000, 0x1832, 0x1008, 0x2f50, 0x355c, 0x0000, 0x0000, + 0x1400, 0x0000, 0x0e25, 0x0001, 0x3063, 0x355c, 0x0000, 0x0000, 0x1800, 0x0000, 0x1226, 0x1000, + 0x307e, 0x355c, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1224, 0x1000, 0x3742, 0x355c, 0x0000, 0x0000, + 0x1400, 0x0000, 0x1225, 0x1000, 0x378e, 0x355c, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1830, 0x1008, + 0x5094, 0x355c, 0x0000, 0x0000, 0x1400, 0x0000, 0x1831, 0x1008, 0x50e0, 0x355c, 0x0000, 0x0000, + 0x1800, 0x0000, 0x1836, 0x0003, 0x50ff, 0x355c, 0x0000, 0x0000, 0x1800, 0x0000, 0x183a, 0x0001, + 0x511b, 0x355c, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1834, 0x0003, 0x5283, 0x355c, 0x0000, 0x0000, + 0x1c00, 0x8000, 0x1838, 0x0001, 0x537b, 0x355c, 0x0000, 0x0000, 0x1400, 0x0000, 0x1835, 0x0003, + 0x4d0f, 0x3577, 0x0000, 0x0000, 0x1800, 0x0000, 0x2246, 0x100d, 0x4d28, 0x3577, 0x0000, 0x0000, + 0x1400, 0x0000, 0x1839, 0x0001, 0x4e57, 0x3577, 0x0000, 0x0000, 0x1800, 0x0000, 0x143a, 0x1000, + 0x4e72, 0x3577, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1438, 0x1000, 0x5536, 0x3577, 0x0000, 0x0000, + 0x1400, 0x0000, 0x1439, 0x1000, 0x5582, 0x3577, 0x0000, 0x0000, 0x1c00, 0x8000, 0x2244, 0x100d, + 0x6e6c, 0x3577, 0x0000, 0x0000, 0x1400, 0x0000, 0x2245, 0x100d, 0x6eb8, 0x3577, 0x0000, 0x0000, 
+ 0x1800, 0x0000, 0x224a, 0x0001, 0x6ed7, 0x3577, 0x0000, 0x0000, 0x1c00, 0x8000, 0x2248, 0x0001, + 0x705b, 0x3577, 0x0000, 0x0000, 0x1400, 0x0000, 0x2249, 0x0001, 0x8b67, 0x3592, 0x0000, 0x0000, + 0x1800, 0x0000, 0x164a, 0x1000, 0x8b82, 0x3592, 0x0000, 0x0000, 0x1c00, 0x8000, 0x1648, 0x1000, + 0x9492, 0x3592, 0x0000, 0x0000, 0x1400, 0x0000, 0x1649, 0x1000, 0x94de, 0x3592, 0x0000, 0x0000, + 0x1800, 0x0000, 0xfffe, 0x0003, 0x94f9, 0x3592, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, +]; + +pub(super) static V80_TEST_DATA2: [u8; 2048] = [ + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x01, 0x50, 0x00, 0x00, 0xa5, 0x3a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x50, 0x01, 0x00, 0xe5, 0x3a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x07, 0x48, 0x00, 0x00, 0x25, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x4a, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x09, 0x48, 0x01, 0x00, 0x65, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x40, 0x00, 0x00, 0xa5, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x40, 0x01, 0x00, 0xe5, 0x3b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x13, 0x38, 0x00, 0x00, 0x25, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 
0x00, 0x00, 0x4a, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x15, 0x38, 0x01, 0x00, 0x65, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x30, 0x00, 0x00, 0xa5, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x1b, 0x30, 0x01, 0x00, 0xe5, 0x3c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x1f, 0x28, 0x00, 0x00, 0x25, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x4a, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x28, 0x01, 0x00, 0x65, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x25, 0x20, 0x00, 0x00, 0xa5, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x27, 0x20, 0x01, 0x00, 0xe5, 0x3d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x3e, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x00, 0x50, 0x00, 0x00, 0x3f, 0x3e, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x00, 0x50, 0x00, 0x00, 0x65, 0x3e, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3e, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3e, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x3f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x4a, 0x3f, 0xb0, 0x41, 
0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x3f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x3f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x4a, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x2b, 0x18, 0x00, 0x00, 0xa5, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xca, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x02, 0x50, 0x01, 0x00, 0xe5, 0x40, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x02, 0x50, 0x01, 0x00, 0x0b, 0x41, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x41, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x41, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x41, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x41, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x42, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x42, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x42, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x42, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x2d, 0x18, 0x01, 0x00, 0x0b, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x06, 0x48, 0x00, 0x00, 0x65, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 
0x00, 0x80, 0x06, 0x48, 0x00, 0x00, 0x8b, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x43, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x44, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x44, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x44, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x44, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x45, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x45, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x45, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x05, 0x80, 0x00, 0x08, 0xca, 0x45, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x45, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x46, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x46, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x46, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x46, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x31, 0x10, 0x00, 0x00, 0x0b, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x30, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x70, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x08, 0x48, 0x01, 0x00, 0x8b, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x08, 0x48, 0x01, 0x00, 
0xb1, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xd6, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x0b, 0x03, 0x02, 0x08, 0xf0, 0x47, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x16, 0x48, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x56, 0x48, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x96, 0x48, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xd6, 0x48, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x16, 0x49, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x56, 0x49, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x96, 0x49, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x33, 0x10, 0x01, 0x00, 0xb1, 0x49, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xd6, 0x49, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x0c, 0x40, 0x00, 0x00, 0x0b, 0x4a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x0c, 0x40, 0x00, 0x00, 0x31, 0x4a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x56, 0x4a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x96, 0x4a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0xd6, 0x4a, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x16, 0x4b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x56, 0x4b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x50, 0x00, 0x00, 0x97, 0x4b, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x02, 0x50, 0x01, 0x00, 0xd7, 0x4b, 0xb0, 0x41, 0x01, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x06, 0x48, 0x00, 0x00, 0x17, 0x4c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x08, 0x48, 0x01, 0x00, 0x57, 0x4c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x0c, 0x40, 0x00, 0x00, 0x97, 0x4c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x37, 0x08, 0x00, 0x00, 0xb1, 0x4c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x0e, 0x40, 0x01, 0x00, 0xe5, 0x4c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x0e, 0x40, 0x01, 0x00, 0x0b, 0x4d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x0e, 0x40, 0x01, 0x00, 0x31, 0x4d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x11, 0x86, 0x03, 0x08, 0x4a, 0x4d, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x39, 0x08, 0x01, 0x00, 0x05, 0x4f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x12, 0x38, 0x00, 0x00, 0x53, 0x4f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x12, 0x38, 0x00, 0x00, 0x79, 0x4f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x12, 0x38, 0x00, 0x00, 0x9f, 0x4f, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x47, 0x53, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x14, 0x38, 0x01, 0x00, 0xaf, 0x53, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x14, 0x38, 0x01, 0x00, 0xd5, 0x53, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x14, 0x38, 0x01, 0x00, 0xfb, 0x53, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x17, 0x09, 0x05, 0x08, 0x14, 0x54, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x01, 0x00, 0xcf, 0x55, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x18, 0x30, 0x00, 0x00, 0x1d, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x18, 0x30, 0x00, 0x00, 0x43, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 
0x18, 0x30, 0x00, 0x00, 0x69, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x04, 0x80, 0x00, 0x08, 0x82, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x04, 0x80, 0x00, 0x08, 0xa8, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x04, 0x80, 0x00, 0x08, 0xce, 0x56, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x43, 0x58, 0x00, 0x00, 0x89, 0x58, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x1a, 0x30, 0x01, 0x00, 0xd7, 0x58, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x1a, 0x30, 0x01, 0x00, 0xfd, 0x58, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xc0, 0x1a, 0x30, 0x01, 0x00, 0x23, 0x59, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x1d, 0x8c, 0x06, 0x08, 0x3c, 0x59, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x18, 0x00, 0x00, 0x00, 0x45, 0x58, 0x01, 0x00, 0x97, 0x5c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x1c, 0x00, 0x00, 0x40, 0x1e, 0x28, 0x00, 0x00, 0xe5, 0x5c, 0xb0, 0x41, 0x01, 0x00, 0x00, + 0x00, 0x14, 0x00, 0x00, 0x80, 0x1e, 0x28, 0x00, 0x00, 0x0b, 0x5d, 0xb0, 0x41, 0x01, 0x00, 0x00, +]; diff --git a/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/mod.rs b/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/mod.rs new file mode 100644 index 000000000..8b284cd50 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/isc_trace/fmt/test/mod.rs @@ -0,0 +1,56 @@ +use super::*; +mod data; + +#[test] +fn isc_trace_simple() { + let bytes: Vec = vec![ + 0x180000000, + 0xf1b6e5ec, + 0x2000000200001800, + 0xf1b6e608, + 0x180000000, + 0xf1b6e670, + 0x180000000, + 0xf1b6e710, + 0x180000000, + 0xf1b6e760, + 0x2000000080001c00, + 0xf1b7074c, + 0x2000000100001400, + 0xf1b70798, + 0x2000000200001800, + 0xf1b707b4, + 0x2000000080001c00, + 0xf1b728f0, + 0x2000000100001400, + 0xf1b7293c, + 0x2000000200001800, + 0xf1b72958, + 0x2000000080001c00, + 0xf1b74a94, + 0x2000000100001400, + 0xf1b74ae0, + 0x3fffe00001800, + 0xf1b74afd, + ]; + + let byte_view 
= bytemuck::try_cast_slice::<_, u8>(bytes.as_slice()).unwrap(); + let stream = IscTraceStream::from_bytes(byte_view); + println!("stream: {:?}", stream); +} + +#[test] +fn isc_trace_v80() { + let bytes = &data::V80_TEST_DATA; + let byte_view = bytemuck::try_cast_slice::<_, u8>(bytes.as_slice()).unwrap(); + let stream = IscTraceStream::from_bytes(byte_view); + println!("stream: {:?}", stream); +} + +#[test] +fn isc_trace_v80_2() { + let bytes = &data::V80_TEST_DATA2; + let byte_view = bytemuck::try_cast_slice::<_, u8>(bytes.as_slice()).unwrap(); + let stream = IscTraceStream::from_bytes(byte_view); + println!("stream: {:?}", stream); +} diff --git a/backends/tfhe-hpu-backend/src/isc_trace/mod.rs b/backends/tfhe-hpu-backend/src/isc_trace/mod.rs new file mode 100644 index 000000000..fff52556a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/isc_trace/mod.rs @@ -0,0 +1,63 @@ +pub mod fmt; +pub mod packed_struct; + +use crate::ffi; +pub use crate::isc_trace::fmt::{IscQueryCmd, IscTraceStream, TRACE_W}; + +pub struct TraceDump { + trace: Vec, +} + +use tracing::trace; + +impl TraceDump { + pub fn new_from( + hpu_hw: &mut ffi::HpuHw, + regmap: &hw_regmap::FlatRegmap, + depth: usize, + ) -> TraceDump { + let size_b = ((depth * 1024 * 1024) / TRACE_W) * TRACE_W; + + let mut trace: Vec = vec![0; size_b]; + + let offset_reg: Vec = ["trc_pc0_lsb", "trc_pc0_msb"] + .into_iter() + .map(|name| { + let reg = regmap + .register() + .get(&format!("hbm_axi4_addr_1in3::{}", name)) + .expect("Unknown register, check regmap definition"); + hpu_hw.read_reg(*reg.offset() as u64) as usize + }) + .collect(); + let offset = offset_reg[0] + (offset_reg[1] << 32); + + trace!( + target = "TraceDump", + "Reading @0x{:x} size_b: {}", + offset, + size_b + ); + + let cut_props = ffi::MemZoneProperties { + mem_kind: ffi::MemKind::Ddr { offset }, + size_b, + }; + let mut mz = hpu_hw.alloc(cut_props); + mz.sync(ffi::SyncMode::Device2Host); + mz.read(0, trace.as_mut_slice()); + TraceDump { trace } + } 
+} + +impl From for IscTraceStream { + fn from(value: TraceDump) -> Self { + IscTraceStream( + IscTraceStream::from_bytes(value.trace.as_slice()) + .0 + .into_iter() + .filter(|i| i.cmd != IscQueryCmd::NONE) + .collect(), + ) + } +} diff --git a/backends/tfhe-hpu-backend/src/isc_trace/packed_struct.rs b/backends/tfhe-hpu-backend/src/isc_trace/packed_struct.rs new file mode 100644 index 000000000..99584c405 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/isc_trace/packed_struct.rs @@ -0,0 +1,149 @@ +use bitvec::prelude::*; +use std::error::Error; +use std::fmt; +use std::fmt::{Debug, Display, Formatter}; + +/// Macro used to define packed struct +macro_rules! integer_packed_struct { + ($x: ident) => { + impl PackedStructLsb for $x + where + O: bitvec::store::BitStore, + { + fn from_bit_slice_le(slice: &BitSlice) -> Result> { + if slice.len() != 0 { + Ok(slice[0..($x::BITS as usize).min(slice.len())].load::<$x>()) + } else { + Err(NoMoreBits)? + } + } + fn to_bit_slice_le(&self, dst: &mut BitSlice) -> Result<(), Box> { + if dst.len() == 0 { + Err(NoMoreBits)? 
+ } else { + let size = dst.len().min($x::len()); + dst[0..size].clone_from_bitslice(&self.try_view_bits::()?[0..size]); + Ok(()) + } + } + } + + impl Len for $x { + fn len() -> usize { + $x::BITS as usize + } + } + }; +} + +#[derive(Debug)] +pub struct NoMoreBits; + +impl Error for NoMoreBits { + fn source(&self) -> Option<&(dyn Error + 'static)> { + None + } + + fn description(&self) -> &str { + "No more bits to unpack" + } +} + +impl Display for NoMoreBits { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{}", self.to_string()) + } +} + +pub trait Len { + fn len() -> usize; +} + +pub trait PackedStructLsb +where + O: bitvec::store::BitStore, + Self: Sized + Len, +{ + fn from_bit_slice_le(slice: &BitSlice) -> Result>; + fn to_bit_slice_le(&self, _dst: &mut BitSlice) -> Result<(), Box> { + Ok(()) + } +} + +impl Len for bool { + fn len() -> usize { + 1 + } +} + +impl PackedStructLsb for bool +where + O: bitvec::store::BitStore, +{ + fn from_bit_slice_le(slice: &BitSlice) -> Result> { + if slice.len() != 0 { + Ok(slice[0]) + } else { + Err(NoMoreBits)? + } + } + fn to_bit_slice_le(&self, dst: &mut BitSlice) -> Result<(), Box> { + if dst.len() > bool::len() { + Err(NoMoreBits)? 
+ } else { + dst.set(0, *self); + Ok(()) + } + } +} + +integer_packed_struct!(u8); +integer_packed_struct!(u16); +integer_packed_struct!(u32); +integer_packed_struct!(u64); + +#[cfg(test)] +mod packed_struct_tests { + use super::*; + + #[test] + fn simple() { + let mut bytes: [u8; 3] = [0x00, 0x00, 0x00]; + let out_view = bytes.view_bits_mut::(); + + let byte0: u8 = 0xFF; + let byte1: u8 = 0x00; + let byte2: u8 = 0xF0; + + byte0.to_bit_slice_le(&mut out_view[0..7]).unwrap(); + byte1.to_bit_slice_le(&mut out_view[7..15]).unwrap(); + byte2.to_bit_slice_le(&mut out_view[15..23]).unwrap(); + + print!("Struct partially deserialized 0x{:?}\n", bytes); + + let bytes: [u8; 3] = [0xBA, 0xBE, 0x12]; + let mut view = bytes.view_bits::(); + + for _ in bytes { + print!("next u8: {:X}\n", u8::from_bit_slice_le(&view).unwrap()); + view = &view[u8::len()..]; + } + } + + #[test] + fn bitvec() { + use bitvec::prelude::*; + + let raw = [ + 0x8_Fu8, + // 7 0 + 0x0_1u8, + // 15 8 + 0b1111_0010u8, + // ^ sign bit + // 23 16 + ]; + let asd = &raw.view_bits::()[4..20]; + assert_eq!(asd.load_le::(), 0x2018u16,); + } +} diff --git a/backends/tfhe-hpu-backend/src/lib.rs b/backends/tfhe-hpu-backend/src/lib.rs new file mode 100644 index 000000000..986763b66 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/lib.rs @@ -0,0 +1,18 @@ +#[cfg(all(feature = "hw-v80", feature = "hw-xrt"))] +compile_error! {"hw-v80 and hw-xrt features are used to select the targeted fpga family. Only one fpga family can be used at a time thus these features are mutually exclusive. Only enable one of them at a time. 
"} + +mod entities; + +#[cfg(not(feature = "utils"))] +mod ffi; +#[cfg(feature = "utils")] +pub mod ffi; +#[cfg(feature = "utils")] +pub mod isc_trace; + +pub mod interface; + +pub mod asm; +pub mod fw; + +pub mod prelude; diff --git a/backends/tfhe-hpu-backend/src/prelude.rs b/backends/tfhe-hpu-backend/src/prelude.rs new file mode 100644 index 000000000..261ba9181 --- /dev/null +++ b/backends/tfhe-hpu-backend/src/prelude.rs @@ -0,0 +1,27 @@ +/// Load entities and type related stuff +pub use super::entities::*; + +/// Export Hw_hpu for asm definition +pub use super::asm as hpu_asm; + +/// Export hw_regmap. +/// Prevent version mismatch between user code and backend +pub use hw_regmap as hpu_regmap; + +/// Load Hw-interface stuff +/// Warn: Enabling this feature required xrt for build and run +pub use super::interface::{ + page_align, BoardConfig, FFIMode, HpuCmd, HpuConfig, HpuDevice, HpuError, HpuImm, + HpuVarWrapped, ShellString, ACKQ_EMPTY, +}; + +#[cfg(feature = "io-dump")] +/// Expose io_dump init function +pub use super::interface::io_dump::set_hpu_io_dump; + +#[cfg(not(any(feature = "hw-xrt", feature = "hw-v80")))] +/// Expose simulation interface +pub use super::ffi::{ + sim::ipc::{IpcSim, MemoryAck, MemoryReq, MemorySim, RegisterAck, RegisterReq, RegisterSim}, + MemKind, SyncMode, +}; diff --git a/backends/tfhe-hpu-backend/src/utils/dop_fmt.rs b/backends/tfhe-hpu-backend/src/utils/dop_fmt.rs new file mode 100644 index 000000000..9eb4174fb --- /dev/null +++ b/backends/tfhe-hpu-backend/src/utils/dop_fmt.rs @@ -0,0 +1,68 @@ +//! +//! Application used to handle Asm/Hex translation +//! 
It could be used to convert a single Op or a list of them + +use std::str::FromStr; +use tfhe_hpu_backend::asm::dop::ToHex; +use tfhe_hpu_backend::asm::{self}; + +/// Define CLI arguments +use clap::Parser; +#[derive(clap::Parser, Debug, Clone)] +#[clap(long_about = "DOp format management")] +pub struct Args { + // Input/Output configuration -------------------------------------------- + /// Convert from the given file. If file not available cast String in AsmOp + #[clap(short, long, value_parser)] + from: String, + + /// Output file + #[clap(short, long, value_parser)] + to: String, +} + +fn main() -> Result<(), anyhow::Error> { + let args = Args::parse(); + println!("User Options: {args:?}"); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(true) + // Display source code line numbers + .with_line_number(true) + .without_time() + // Build & register the subscriber + .init(); + + // Create output path and ensure that folder exists ---------------------------- + let out_p = std::path::Path::new(&args.to); + std::fs::create_dir_all(out_p.parent().unwrap())?; + + // Infer input mode ------------------------------------------------------------ + let op_file = std::path::Path::new(&args.from); + if op_file.exists() { + // read op from file + match ( + asm::Program::::read_asm(&args.from), + asm::Program::::read_hex(&args.from), + ) { + (Ok(p), ..) 
=> p.write_hex(&args.to)?, + (Err(_), Ok(p)) => p.write_asm(&args.to)?, + (Err(dop_asm), Err(dop_hex)) => { + eprintln!("Failed to parse {}:", args.from); + eprintln!("\t DOp Asm parser => {dop_asm}"); + eprintln!("\t DOp Hex parser => {dop_hex}"); + panic!("Error: Impossible to decode instruction, check file encoding"); + } + } + } else { + let dop = asm::dop::DOp::from_str(&args.from)?; + let hex = dop.to_hex(); + println!("dop: {} -> 0x{:x}", dop, hex); + } + Ok(()) +} diff --git a/backends/tfhe-hpu-backend/src/utils/fw.rs b/backends/tfhe-hpu-backend/src/utils/fw.rs new file mode 100644 index 000000000..7eaaab69c --- /dev/null +++ b/backends/tfhe-hpu-backend/src/utils/fw.rs @@ -0,0 +1,201 @@ +//! +//! Application used to handle Iop/Dop translation +//! It could be used to convert a single IOp or a list of them + +use std::path::Path; +use tfhe_hpu_backend::asm; +use tfhe_hpu_backend::fw::isc_sim::PeConfigStore; +use tfhe_hpu_backend::fw::rtl::config::{FlushBehaviour, OpCfg, RtlCfg}; +use tfhe_hpu_backend::fw::{self, Fw, FwParameters}; + +/// Define CLI arguments +use clap::Parser; +use tfhe_hpu_backend::prelude::{HpuConfig, HpuParameters, ShellString}; +#[derive(clap::Parser, Debug, Clone)] +#[clap(long_about = "Translate IOp or Stream of IOps in DOps stream")] +pub struct Args { + // Configuration ----------------------------------------------------- + /// Toml top-level configuration file + /// Enable to retrieved runtime configuration register + #[clap( + long, + value_parser, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + /// Hpu rtl parameters + /// Enable to retrieved the associated tfhe-rs parameters and other Rtl parameters + #[clap( + long, + value_parser, + default_value = "${HPU_MOCKUP_DIR}/params/gaussian_64b_fast.toml" + )] + pub params: ShellString, + + /// Supported nu + /// Number of linear operation supported + #[clap(long, value_parser, default_value_t = 5)] + nu: usize, + + 
/// Fw kind + #[clap(long, value_parser, default_value = "Ilp")] + fw_kind: fw::FwName, + + /// Number of Heap slots + #[clap(long, value_parser, default_value_t = 512)] + heap: usize, + + /// Kogge configuration file + #[clap( + long, + value_parser, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/kogge_cfg.toml" + )] + kogge_cfg: ShellString, + + /// Use ipip configuration + #[clap(long, value_parser, default_value_t = false)] + use_ipip: bool, + + /// Use ipip configuration + #[clap(long, value_parser, default_value_t = false)] + use_bpip_opportunism: bool, + + /// Try to fill the batch fifo + #[clap(long, value_parser, default_value_t = true)] + fill_batch_fifo: bool, + + /// Use the minimum batch size for a PE + #[clap(long, value_parser, default_value_t = false)] + min_batch_size: bool, + + /// Use the minimum batch size for a PE + #[clap(long, value_parser, default_value_t = false)] + use_tiers: bool, + + /// Flush PBS batches to force a specific scheduling + #[clap(long, value_parser, default_value_t = true)] + flush: bool, + + /// Flush PBS batches behaviour + /// Available options are + /// Patient, + /// NoPBS, + /// Opportunist, + /// Timeout(usize), + #[clap(long, value_parser, default_value = "Patient")] + flush_behaviour: FlushBehaviour, + + /// Integer bit width + #[clap(long, value_parser, default_value_t = 8)] + integer_w: usize, + + // Override params -------------------------------------------------- + // Quick way to override parameters through ClI instead of editing the + // configuration file + // Used to override some parameters at runtime + /// Override Number of Register + #[clap(long, value_parser)] + register: Option, + + /// Override HPU lookahead buffer depth + /// Number of instruction that are considered in advance + #[clap(long, value_parser)] + isc_depth: Option, + + // Input/Output configuration -------------------------------------------- + /// Expand the given IOpcode + /// NB: couldn't use `convert_file` and 
`expand` at the same time + #[clap(short, long, value_parser)] + expand: Vec, + + /// Output folder + #[clap(long, value_parser, default_value = "output")] + out_folder: String, +} + +fn main() -> Result<(), anyhow::Error> { + let args = Args::parse(); + println!("User Options: {args:?}"); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time() + // Build & register the subscriber + .init(); + + // Create output folder/file --------------------------------------------------- + let dirpath = Path::new(&args.out_folder); + std::fs::create_dir_all(dirpath).unwrap(); + + // Load config/parameters from configuration file ------------------------------------ + let config = HpuConfig::from_toml(args.config.expand().as_str()); + let params = { + let mut rtl_params = HpuParameters::from_toml(&args.params.expand()); + + // Override some parameters if required + if let Some(register) = args.register.as_ref() { + rtl_params.regf_params.reg_nb = *register; + } + if let Some(isc_depth) = args.isc_depth.as_ref() { + rtl_params.isc_params.depth = *isc_depth; + } + rtl_params + }; + let pe_cfg = PeConfigStore::from((¶ms, &config)); + let fw_params = FwParameters { + register: params.regf_params.reg_nb, + isc_depth: params.isc_params.depth, + heap_size: args.heap, + min_iop_size: params.isc_params.min_iop_size, + min_pbs_batch_w: config.firmware.min_batch_size, + total_pbs_nb: params.ntt_params.total_pbs_nb, + pbs_batch_w: params.ntt_params.batch_pbs_nb, + msg_w: params.pbs_params.message_width, + carry_w: params.pbs_params.carry_width, + nu: args.nu, + integer_w: args.integer_w, + use_ipip: args.use_ipip, + kogge_cfg: args.kogge_cfg.expand(), + op_cfg: RtlCfg::from(OpCfg { + 
fill_batch_fifo: args.fill_batch_fifo, + min_batch_size: args.min_batch_size, + use_tiers: args.use_tiers, + flush: args.flush, + flush_behaviour: args.flush_behaviour, + }), + cur_op_cfg: OpCfg::default(), + pe_cfg, + op_name: Default::default(), + }; + println!("Fw parameters after override with CLI: {fw_params:?}"); + + let expand_list = if args.expand.is_empty() { + asm::iop::IOP_LIST.to_vec() + } else { + args.expand.clone() + }; + + for iop in expand_list.iter() { + let base_file = format!("{}_{}b.dop", iop.to_string().trim(), args.integer_w); + + let asm_p = dirpath.join(Path::new(&format!("{base_file}.asm"))); + let hex_p = dirpath.join(Path::new(&format!("{base_file}.hex"))); + + // Instantiate Fw and start translation ---------------------------------------- + let fw = fw::AvlblFw::new(&args.fw_kind); + let prog = fw.expand(&fw_params, iop); + prog.write_asm(asm_p.as_os_str().to_str().unwrap())?; + prog.write_hex(hex_p.as_os_str().to_str().unwrap())?; + } + + Ok(()) +} diff --git a/backends/tfhe-hpu-backend/src/utils/hputil.rs b/backends/tfhe-hpu-backend/src/utils/hputil.rs new file mode 100644 index 000000000..9369c613a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/utils/hputil.rs @@ -0,0 +1,290 @@ +//! +//! Utility application used to probe Hpu status +//! 
Enable manual step by step debug +use tfhe_hpu_backend::ffi; +use tfhe_hpu_backend::interface::rtl; +use tfhe_hpu_backend::interface::rtl::FromRtl; +use tfhe_hpu_backend::prelude::*; + +use tfhe_hpu_backend::isc_trace::{IscTraceStream, TraceDump}; + +use clap::{Parser, Subcommand, ValueEnum}; +use clap_num::maybe_hex; + +use std::fs::File; + +use tracing_subscriber::fmt::MakeWriter; + +#[derive(Clone, Debug, Subcommand)] +pub enum Command { + #[clap(about = "Read register")] + Read { + /// Register name + #[arg(short, long)] + name: String, + #[arg(short, long, default_value_t = 1)] + range: usize, + }, + + #[clap(about = "Write register")] + Write { + /// Register name + #[arg(short, long)] + name: String, + #[arg(short, long, value_parser=maybe_hex::)] + value: u32, + }, + + #[clap(about = "Dump given register section")] + Dump { + /// Section name + #[arg(index = 1)] + name: Vec
, + }, + + #[clap(about = "Reset given register section")] + Reset { + /// Section name + #[arg(index = 1)] + name: Vec
, + }, + #[clap(about = "Flush ackq")] + Flush, + + #[clap(about = "Memory Zone read (Hbm)")] + MzRead { + /// Hbm pc + #[arg(long, value_parser=maybe_hex::)] + pc: usize, + /// Hbm size + #[arg(short, long, value_parser=maybe_hex::)] + size: usize, + }, + + #[clap(about = "Memory Zone write (Hbm)")] + MzWrite { + /// Hbm pc + #[arg(long, value_parser=maybe_hex::)] + pc: usize, + /// Hbm size + #[arg(short, long, value_parser=maybe_hex::)] + size: usize, + // Pattern to write in Mz + #[arg(short, long, value_parser=maybe_hex::)] + pattern: u8, + }, + + #[clap(about = "Trace Dump")] + TraceDump { + #[arg(short, long, default_value_t = String::from("trace.json"))] + file: String, + }, +} + +#[derive(Clone, Debug, ValueEnum)] +pub enum Section { + PePbs, + PeMem, + PeAlu, + Isc, + Arch, +} + +#[derive(Clone, Debug, Parser)] +struct CliArgs { + // Fpga configuration ----------------------------------------------------- + #[arg(short, long, default_value_t = 0)] + fpga_id: u32, + #[arg( + short, + long, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + #[command(subcommand)] + cmd: Command, +} + +fn main() { + let args = CliArgs::parse(); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. 
`RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time() + // Build & register the subscriber + .init(); + + // Load fpga configuration from file + let config = HpuConfig::from_toml(&args.config.expand()); + + // Instantiate bare-minimum abstraction around XRT ----------------------- + let mut hpu_hw = ffi::HpuHw::new_hpu_hw( + &config.fpga.ffi, + std::time::Duration::from_micros(config.fpga.polling_us), + ); + let regmap = { + let regmap_expanded = config + .fpga + .regmap + .iter() + .map(|f| f.expand()) + .collect::>(); + let regmap_str = regmap_expanded + .iter() + .map(|f| f.as_str()) + .collect::>(); + hw_regmap::FlatRegmap::from_file(®map_str) + }; + + // Init the memory backend + let params = HpuParameters::from_rtl(&mut hpu_hw, ®map); + hpu_hw.init_mem(&config, ¶ms); + + // Handle user command -------------------------------------------------- + match args.cmd { + Command::Read { name, range } => { + let reg_start = regmap + .register() + .get(&name) + .expect("Unknown register, check regmap definition"); + let addr_start = *reg_start.offset() as u64; + + println!("Start read register {name} @{addr_start:0>8x}"); + for idx in 0..range { + let addr = addr_start + (idx * std::mem::size_of::()) as u64; + let val = hpu_hw.read_reg(addr); + println!(" @{addr:0>8x} -> {val:0>8x}"); + } + } + Command::Write { name, value } => { + let reg = regmap + .register() + .get(&name) + .expect("Unknown register, check regmap definition"); + let addr = *reg.offset() as u64; + + println!("Write {value:0>8x} in register {name} @{addr:0>8x}"); + hpu_hw.write_reg(addr, value); + } + Command::Dump { name } => { + for sec in name { + match sec { + Section::PePbs => println!( + "PePbs registers {:?}", + rtl::runtime::InfoPePbs::from_rtl(&mut hpu_hw, ®map) + ), + 
Section::PeMem => println!( + "PeMem registers {:?}", + rtl::runtime::InfoPeMem::from_rtl(&mut hpu_hw, ®map) + ), + Section::PeAlu => println!( + "PeAlu registers {:?}", + rtl::runtime::InfoPeAlu::from_rtl(&mut hpu_hw, ®map) + ), + Section::Isc => println!( + "Isc registers {:?}", + rtl::runtime::InfoIsc::from_rtl(&mut hpu_hw, ®map) + ), + Section::Arch => println!( + "Arch registers {:?}", + HpuParameters::from_rtl(&mut hpu_hw, ®map) + ), + } + } + } + Command::Reset { name } => { + for sec in name { + match sec { + Section::PePbs => { + println!(" Reset PePbs registers"); + let mut sec = rtl::runtime::InfoPePbs::from_rtl(&mut hpu_hw, ®map); + sec.reset(&mut hpu_hw, ®map); + } + Section::PeMem => { + println!(" Reset PeMem registers"); + let mut sec = rtl::runtime::InfoPeMem::from_rtl(&mut hpu_hw, ®map); + sec.reset(&mut hpu_hw, ®map); + } + Section::PeAlu => { + println!(" Reset PeAlu registers"); + let mut sec = rtl::runtime::InfoPeAlu::from_rtl(&mut hpu_hw, ®map); + sec.reset(&mut hpu_hw, ®map); + } + Section::Isc => { + println!(" Reset Isc registers"); + let mut sec = rtl::runtime::InfoIsc::from_rtl(&mut hpu_hw, ®map); + sec.reset(&mut hpu_hw, ®map); + } + Section::Arch => { + println!(" Arch registers couldn't be reset"); + } + } + } + } + Command::Flush => loop { + #[cfg(feature = "hw-v80")] + { + // TODO add ack flush to prevent error with previous stall execution + } + #[cfg(not(feature = "hw-v80"))] + { + let ackq_addr = (*regmap + .register() + .get("WorkAck::ackq") + .expect("Unknown register, check regmap definition") + .offset()) as u64; + let ack_code = hpu_hw.read_reg(ackq_addr); + println!("Flush ackq -> {ack_code:0>8x}"); + if ack_code == ACKQ_EMPTY { + break; + } + } + }, + Command::MzRead { pc, size } => { + let mut bfr = vec![0xff_u8; size]; + + let cut_props = ffi::MemZoneProperties { + mem_kind: ffi::MemKind::Hbm { pc }, + size_b: size, + }; + let mut mz = hpu_hw.alloc(cut_props); + mz.sync(ffi::SyncMode::Device2Host); + mz.read(0, 
bfr.as_mut_slice()); + if let Ok(bfr_u64) = bytemuck::try_cast_slice::<_, u64>(bfr.as_slice()) { + println!("MemZone content [u64]: {bfr_u64:x?}"); + } else if let Ok(bfr_u32) = bytemuck::try_cast_slice::<_, u32>(bfr.as_slice()) { + println!("MemZone content [u32]: {bfr_u32:x?}"); + } else if let Ok(bfr_u16) = bytemuck::try_cast_slice::<_, u16>(bfr.as_slice()) { + println!("MemZone content [u16]: {bfr_u16:x?}"); + } else { + println!("MemZone content [u8]: {bfr:x?}"); + } + } + Command::MzWrite { pc, size, pattern } => { + let bfr = vec![pattern; size]; + let cut_props = ffi::MemZoneProperties { + mem_kind: ffi::MemKind::Hbm { pc }, + size_b: size, + }; + let mut mz = hpu_hw.alloc(cut_props); + mz.write(0, bfr.as_slice()); + mz.sync(ffi::SyncMode::Host2Device); + } + Command::TraceDump { file: filename } => { + let trace = TraceDump::new_from(&mut hpu_hw, ®map, config.board.trace_depth); + let parsed = IscTraceStream::from(trace); + + let file = File::create(filename).expect("Failed to create or open trace dump file"); + serde_json::to_writer_pretty(file.make_writer(), &parsed) + .expect("Could not write trace dump"); + } + } +} diff --git a/backends/tfhe-hpu-backend/src/utils/iop_fmt.rs b/backends/tfhe-hpu-backend/src/utils/iop_fmt.rs new file mode 100644 index 000000000..547bcea6a --- /dev/null +++ b/backends/tfhe-hpu-backend/src/utils/iop_fmt.rs @@ -0,0 +1,67 @@ +//! +//! Application used to handle Asm/Hex translation +//! It could be used to convert a single Op or a list of them + +use std::str::FromStr; + +use tfhe_hpu_backend::asm; + +/// Define CLI arguments +use clap::Parser; +#[derive(clap::Parser, Debug, Clone)] +#[clap(long_about = "IOp format management")] +pub struct Args { + // Input/Output configuration -------------------------------------------- + /// Convert from the given file. 
If file not available cast String in AsmOp + #[clap(short, long, value_parser)] + from: String, + + /// Output file + #[clap(short, long, value_parser)] + to: String, +} + +fn main() -> Result<(), anyhow::Error> { + let args = Args::parse(); + println!("User Options: {args:?}"); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(true) + // Display source code line numbers + .with_line_number(true) + .without_time() + // Build & register the subscriber + .init(); + + // Create output path and ensure that folder exists ---------------------------- + let out_p = std::path::Path::new(&args.to); + std::fs::create_dir_all(out_p.parent().unwrap())?; + + // Infer input mode ------------------------------------------------------------ + let op_file = std::path::Path::new(&args.from); + if op_file.exists() { + // read op from file + match ( + asm::Program::::read_asm(&args.from), + asm::Program::::read_hex(&args.from), + ) { + (Ok(p), ..) 
=> p.write_hex(&args.to)?, + (Err(_), Ok(p)) => p.write_asm(&args.to)?, + (Err(iop_asm), Err(iop_hex)) => { + eprintln!("Failed to parse {}:", args.from); + eprintln!("\t IOp Asm parser => {iop_asm}"); + eprintln!("\t IOp Hex parser => {iop_hex}"); + panic!("Error: Impossible to decode instruction, check file encoding"); + } + } + } else { + let iop = asm::iop::IOp::from_str(&args.from)?; + println!("iop: {} -> 0x{:0>8x?}", iop, iop.to_words()); + } + Ok(()) +} diff --git a/mockups/tfhe-hpu-mockup/Cargo.toml b/mockups/tfhe-hpu-mockup/Cargo.toml new file mode 100644 index 000000000..db4511a43 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "tfhe-hpu-mockup" +version = "0.1.0" +edition = "2021" +authors = ["Zama Hardware team"] +license = "BSD-3-Clause-Clear" +description = "Simulation model of HPU hardware." +readme = "README.md" + +[features] +default = [] +isc-order-check = [] + +[dependencies] +tfhe = { path = "../../tfhe", features = ["hpu", "hpu-debug"] } + +ipc-channel = "0.18.3" + +strum = { version = "0.26.2", features = ["derive"] } +strum_macros = "0.26.2" +bytemuck = "1.16.0" + +clap = { version = "4.4.4", features = ["derive"] } +clap-num = "*" +anyhow = "1.0.82" +tracing = "0.1.40" +tracing-subscriber = { version = "0.3.18", features = ["env-filter", "json"] } +serde_json = "1.0" +rand = "0.8.5" +serde = { version = "1", features = ["derive"] } +bitflags = "2.6.0" + +[[bin]] +name = "hpu_mockup" +path = "src/mockup.rs" +required-features = [] diff --git a/mockups/tfhe-hpu-mockup/Justfile b/mockups/tfhe-hpu-mockup/Justfile new file mode 100644 index 000000000..e069cbe1c --- /dev/null +++ b/mockups/tfhe-hpu-mockup/Justfile @@ -0,0 +1,50 @@ +set fallback := true + +_targets: + @just --list --unsorted --list-heading $'Available targets:\n' --list-prefix " " + +# Default values --------------------------------------------------------------- +DEFAULT_RUST_LOG := "info" +DEFAULT_MOCKUP_PARAMS := 
"gaussian_64b_fast.toml" +DEFAULT_TEST_ITER := "5" +CARGO_HOME := "$(pwd)/../.." +BUILD_PROFILE := "release" + +### Build target --------------------------------------------------------------- +# NB: Build mockup and user software all at once to reduce build time +build_all: + #!/usr/bin/env bash + set -euxo pipefail + cd {{CARGO_HOME}} + cargo build --profile {{BUILD_PROFILE}} --features="hpu" \ + --bin hpu_mockup \ + --example hpu_bench\ + --example hpu_hlapi + +### Mockup related target ------------------------------------------------------ +mockup MOCKUP_PARAMS=DEFAULT_MOCKUP_PARAMS RUST_LOG=DEFAULT_RUST_LOG: build_all + #!/usr/bin/env bash + set -euxo pipefail + RUST_LOG={{RUST_LOG}} {{CARGO_HOME}}/target/{{BUILD_PROFILE}}/hpu_mockup \ + --params $HPU_MOCKUP_DIR/params/{{MOCKUP_PARAMS}} \ + --report-out mockup_rpt --report-trace + +# UserApp related target ------------------------------------------------------- +# Run Hpu benchmark +hpu_bench ARGS="" RUST_LOG=DEFAULT_RUST_LOG: build_all + #!/usr/bin/env bash + set -euxo pipefail + RUST_LOG={{RUST_LOG}} {{CARGO_HOME}}/target/{{BUILD_PROFILE}}/examples/hpu_bench {{ARGS}} + +# Run Hpu HlApi showcase +hpu_hlapi ARGS="" RUST_LOG=DEFAULT_RUST_LOG: build_all + #!/usr/bin/env bash + set -euxo pipefail + RUST_LOG={{RUST_LOG}} {{CARGO_HOME}}/target/{{BUILD_PROFILE}}/examples/hpu_hlapi {{ARGS}} + +# Run Hpu testsuite +hpu_test ARGS="u8" TEST_ITER=DEFAULT_TEST_ITER RUST_LOG=DEFAULT_RUST_LOG: build_all + #!/usr/bin/env bash + set -euxo pipefail + cd {{CARGO_HOME}} + RUST_LOG={{RUST_LOG}} HPU_TEST_ITER={{TEST_ITER}} cargo test --profile {{BUILD_PROFILE}} --test hpu -- {{ARGS}} diff --git a/mockups/tfhe-hpu-mockup/LICENSE b/mockups/tfhe-hpu-mockup/LICENSE new file mode 100644 index 000000000..48312e88a --- /dev/null +++ b/mockups/tfhe-hpu-mockup/LICENSE @@ -0,0 +1,28 @@ +BSD 3-Clause Clear License + +Copyright © 2025 ZAMA. +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this +list of conditions and the following disclaimer in the documentation and/or other +materials provided with the distribution. + +3. Neither the name of ZAMA nor the names of its contributors may be used to endorse +or promote products derived from this software without specific prior written permission. + +NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. +THIS SOFTWARE IS PROVIDED BY THE ZAMA AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +ZAMA OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, +OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/mockups/tfhe-hpu-mockup/Readme.md b/mockups/tfhe-hpu-mockup/Readme.md new file mode 100644 index 000000000..71ae857a9 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/Readme.md @@ -0,0 +1,170 @@ +# TFHE-hpu-mockup + +## Brief +Simulation _drop-in-replacement_ implementation of HPU Hardware. +This mockup implementation could be paired seamlessly with `tfhe-hpu-backend` compiled without any hardware support (i.e. `hpu-v80` or `hpu-xrt`). 
+Indeed, without hardware support, `tfhe-hpu-backend` calls to low-level FFI are replaced by IPC calls and could be intercepted by this mockup implementation. + +Objectives of this mockup are as follows: +* Transparent integration with User application: +> User must have nothing to change in their application code. +> Generated trace must match with the one obtained on the real hardware (except timestamp) + +* Stimulus generation +> Obtained results must be bit-accurate in order to generate golden stimuli for RTL simulation +> RTL parameters must be fully configurable at runtime to easily generate stimuli for any configuration + +* Firmware development +> Generate accurate performance estimation and tracing capabilities to help the development/optimization of HPU firmware + +### Mockup structure +Without hardware support `tfhe-hpu-backend` falls back to a simulation FFI interface (i.e. `ffi-sim`). This interface binds to an IPC channel and forwards the FFI call over IPC with a simple Cmd/Payload message and Request/Ack protocol. The Mockup binds to those IPC channels and answers requests like the real hardware. + +On its side, the mockup answers to backend IPC requests and simulates the hardware behavior. +The internal structure of the mockup is organized around modules to emulate the hardware behavior. It contains the following modules: +* `memory`: Emulates memory such as Ddr and Hbm (only from a behavioral point of view). It enables allocating/releasing chunks of memory. Those chunks can be read/written through the IPC with the same Sync mechanisms as the real hardware. +* `regmap`: Emulates the RTL register map. It converts concrete TFHE/RTL parameters into register values. +* `ucore`: Emulates the ucore behavior. It is in charge of reading the DOp stream from the HBM and patching the template operations in the same manner as it is done in the real hardware, to get the same patched DOp ucode. 
+ +> NB: Modeling of the `instruction_scheduler` is required for efficient firmware generation and thus directly done inside `tfhe-hpu-backend` crate in `isc_sim` modules. The mockup reuses this implementation directly. + +The Mockup is a standalone binary that must be run before the User application code. +The use of two binaries enables to: +* Expose a wide range of mockup configuration without impacting the User application +* Have distinct streams of log: One for the mockup and one for the User application. Thus the trace log of the User application is unchanged compared to the real Hardware. + +Below is an overview of the internal structure of the Mockup. +![HPU Mockup](./figures/tfhe-hpu-mockup.excalidraw.png) + +After the Mockup starts, it registers an IPC configuration channel in a file that could be read by the `ffi-sim` to establish a connection. +Once done, the following steps occur: + +> NB: Used filename is set in the TOML configuration file in FFI section. Environment variables could be used in the filename. +> ```toml +> [fpga.ffi.Sim] +> ipc_name="/tmp/${USER}/hpu_mockup_ipc" +> ``` + +1. Use configuration channel to exchange a set of IPC endpoints: one for registers access and one for memory management. Those channels implement a simple Cmd/Payload message and Request/Ack protocol. +2. `tfhe-hpu-backend` reads registers through the associated IPC channel and retrieves the associated mockup parameters. +3. `tfhe-hpu-backend` allocates required on-board memory. It then uploads the firmware translation table (used to expand IOp in a stream of DOps), and the set of TFHE server keys. Then, it uploads the input ciphertext. +4. Once all input data is synced on the mockup, `tfhe-hpu-backend` triggers IOp execution by pushing the operation in the `WorkQ`. + 4.1 HPU behavioral model retrieves the associated DOps stream from the HBM memory. For this purpose it uses the `ucore` module. 
This module reads the memory and patches the obtained stream to have concrete DOp to execute (The firmware translation table has some templated DOp that must be translated into concrete one before execution) + 4.2 DOp stream is then injected in the `instruction scheduler` to obtain the real execution order and the performance estimation. + 4.3 HPU behavioral model retrieves key material from the memory (i.e. in HPU format) and converts it back to CPU format. + 4.4 HPU behavioral model executes the DOp with the help of `tfhe-rs` operation. + 4.5 When needed, execution model reads the ciphertext from the `regfile`. + > NB: Ciphertexts are stored in the `regfile` in HPU format and translated back to CPU format before execution. +5. When IOp execution is finished the Mockup notifies the `tfhe-hpu-backend` through the `AckQ`. +6. `tfhe-hpu-backend` retrieves the results from the HBM with the help of IPC channels. + + +### Mockup CLI and configuration +The mockup application is configured by two files: +1. Configuration (i.e. `--config` CLI knob) +It's the same as the one used by the `tfhe-hpu-backend`. It's used by the mockup application to retrieve the `ffi-sim` configuration, the register map, as well as the expected memory layout of the on-board memory. + +2. Parameters (i.e. `--params` CLI knob) +This file is used to retrieve the inner RTL parameters such as: +* TFHE parameter set +* NTT internal structure +* Instruction scheduler properties +* PC configuration for each HBM connection + +Other optional configuration knobs are available: +* `--freq-hz`, `--register`, `--isc-depth`: These knobs are used to override some parameters on the fly. They are useful for quick exploration. +* `--dump-out`, `--dump-reg`: Used for RTL stimuli generation and debug +* `--report-out`, `--report-trace`: Used for detailed analysis of the performances report +* `--nops`: Disable tfhe-rs computation. Obviously leads to incorrect behavior but accurate performance estimation. 
+* `--log-out`: Write trace message in the given file instead of stdio. + +On top of that `tfhe-hpu-mockup` could generate a detailed set of trace points at runtime to help during the debug/exploration phase (e.g. When writing new Hpu firmware). +Those trace points rely on `tokio-tracing` and could be activated on a path::verbosity based through the `RUST_LOG` environment variable. +For example the following value will enable the info trace for all the design and the debug one for the ucore submodule: +`RUST_LOG=info,tfhe_hpu_mockup::modules::ucore=debug`. + +> NB: With the mockup estimated IOp performances must be read from the mockup log, not from the user application report. +> Indeed, the user application reports the execution time of the mockup binary not the expected performance on real Hpu hardware. + + +## Example +The following section explains how to run the user application examples on the mockup backend. +> NB: The use of the mockup instead of the real hardware is transparent for the user application. +> Only changes in the configuration file are required, and no hardware support should be activated during compilation (i.e. features like `hpu-v80` or `hpu-xrt`). + + +### HPU configuration selection +For convenience a simulation configuration is available in `backends/tfhe-hpu-backend/config_store/sim`. +Select the desired configuration with help of `setup_hpu.sh`: + +```bash +source setup_hpu.sh --config sim +``` + +> NB: For convenience, a Justfile is provided with different targets to build and start User/Mockup applications. +> Open two terminals and, for example +> Start `just mockup` in the first one and start `just hpu_bench` in the second one. +> For list of available target use `just` + +### Start mockup application +Two parameter kinds are provided for convenience: +* `mockups/tfhe-hpu-mockup/params/tfhers_*_fast.ron`: + > Use a fake parameters set with small lwe_n. Simulation is fast. 
Useful for debug and test + +* `mockups/tfhe-hpu-mockup/params/tfhers_*.ron`: +> Use real Hardware parameter set. Simulation is slow, but it enables to generate bit-accurate results and accurate performances estimation. Useful for RTL stimulus generation and FW exploration. + +```bash +# Example of CLI for building/running mockup application +# For convenience, `just mockup` could be also used +cargo build --release --bin hpu_mockup +./target/release/hpu_mockup \ + --params mockups/tfhe-hpu-mockup/params/gaussian_64b_fast.toml \ + [--freq-hz --register --isc-depth] + [--dump-out mockup_out/ --dump-reg]\ + [--report-out mockup_rpt/ --report-trace] +``` + +### Start user application +In the snippets below, `hpu_bench` is selected but any application using HPU hardware could be used. + +```bash +# Example of CLI for building/running hpu_bench application +# For convenience, `just hpu_bench` could be also used +cargo build --release --features="hpu" --example hpu_bench +# Start MUL and ADD IOp on 64b integer +./target/release/examples/hpu_bench --integer-w 64 --iop MUL --iop ADD +``` + + +## Test +Hpu test framework could also be started with the mockup. It relies on the same configuration mechanism (c.f. [setup_hpu.sh](#HPU-configuration-selection)). + +> In the same fashion as for the examples, a dedicated Justfile entry is available for the test +> Open two terminals and run the following +> Start `just mockup` in the first one and start `just hpu_test` in the second one. + + +### Start mockup application +Mockup starting process is the same as for examples (c.f. [Start Mockup](#Start-mockup-application)). + +### Start test framework +In the snippets below, hpu test framework is started on 8b integer. 
+ +```bash +# Example of CLI for running hpu test framework +# For convenience, `just hpu_test` could be also used +# Test is defined for 8/16/32/64 and 128 bits integer, without specification all integer width are started +cargo run --release --features="hpu" --test hpu -- u8 + +# Filtering could also be used to sub-categories +# Available sub-categories are: +# * alus: for ct x scalar arithmetic IOps +# * alu: for ct x ct arithmetic IOps +# * bitwise: for ct x ct bitwise IOps +# * cmp: for comparison IOps +# * ternary: for if-then-else and like IOps +# * algo: for IOps dedicated to offload sub-algorithm like ERC_20 +# Command below only run comparison IOps, for convenience, `just hpu_test "cmp"` could be also used +cargo run --release --features="hpu" --test hpu -- cmp +``` diff --git a/mockups/tfhe-hpu-mockup/figures/tfhe-hpu-mockup.excalidraw.png b/mockups/tfhe-hpu-mockup/figures/tfhe-hpu-mockup.excalidraw.png new file mode 100644 index 000000000..5dcbfa7e1 Binary files /dev/null and b/mockups/tfhe-hpu-mockup/figures/tfhe-hpu-mockup.excalidraw.png differ diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_44b.toml b/mockups/tfhe-hpu-mockup/params/gaussian_44b.toml new file mode 100644 index 000000000..be1bb2f57 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_44b.toml @@ -0,0 +1,44 @@ +[pbs_params] + lwe_dimension=724 + glwe_dimension=2 + polynomial_size=1024 + lwe_noise_distribution={GaussianStdDev=1.2597809688976277e-05} + glwe_noise_distribution={GaussianStdDev=2.2737367544323206e-13} + pbs_base_log= 20 + pbs_level= 1 + ks_base_log= 2 + ks_level= 7 + message_width= 2 + carry_width= 2 + ciphertext_width= 44 + +[ntt_params] + core_arch="WmmUnfoldPcg" + min_pbs_nb= 10 + batch_pbs_nb= 16 + total_pbs_nb= 32 + ct_width= 44 + radix= 2 + stg_nb= 10 + prime_modulus="Solinas2_44_14" + psi= 32 + delta= 5 +[ks_params] + width= 21 + lbx= 2 + lby= 32 + lbz= 3 +[pc_params] + ksk_pc= 4 + ksk_bytes_w= 64 + bsk_pc= 4 + bsk_bytes_w= 64 + pem_pc= 2 + pem_bytes_w= 64 
+ glwe_bytes_w= 64 +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_44b_fast.toml b/mockups/tfhe-hpu-mockup/params/gaussian_44b_fast.toml new file mode 100644 index 000000000..49777aca0 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_44b_fast.toml @@ -0,0 +1,44 @@ +[pbs_params] + lwe_dimension=20 + glwe_dimension=2 + polynomial_size=1024 + lwe_noise_distribution={GaussianStdDev=0.0} + glwe_noise_distribution={GaussianStdDev=0.0} + pbs_base_log=20 + pbs_level=1 + ks_base_log=2 + ks_level=7 + message_width=2 + carry_width=2 + ciphertext_width=44 + +[ntt_params] + core_arch="WmmCompactPcg" + min_pbs_nb= 10 + batch_pbs_nb= 16 + total_pbs_nb= 32 + ct_width= 44 + radix= 2 + stg_nb= 10 + prime_modulus="Solinas2_44_14" + psi= 32 + delta= 5 +[ks_params] + width= 21 + lbx= 2 + lby= 32 + lbz= 3 +[pc_params] + ksk_pc= 4 + ksk_bytes_w= 64 + bsk_pc= 4 + bsk_bytes_w= 64 + pem_pc= 2 + pem_bytes_w= 64 + glwe_bytes_w= 64 +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_64b.toml b/mockups/tfhe-hpu-mockup/params/gaussian_64b.toml new file mode 100644 index 000000000..02f44714d --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_64b.toml @@ -0,0 +1,47 @@ +[pbs_params] + lwe_dimension=724 + glwe_dimension=2 + polynomial_size=1024 + lwe_noise_distribution={GaussianStdDev= 1.2597809688976277e-05} + glwe_noise_distribution={GaussianStdDev= 2.2737367544323206e-13} + pbs_base_log= 20 + pbs_level= 1 + ks_base_log= 2 + ks_level= 7 + message_width= 2 + carry_width= 2 + ciphertext_width= 64 + +[ntt_params] + core_arch= {GF64=[5,5]} + min_pbs_nb= 10 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 10 + prime_modulus= "GF64" + psi= 32 + delta= 5 + +[ks_params] + width= 21 + lbx= 2 + lby= 32 + lbz= 3 + +[pc_params] + ksk_pc= 4 + ksk_bytes_w= 32 + bsk_pc= 4 + bsk_bytes_w= 32 + 
pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 + +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_64b_fast.toml b/mockups/tfhe-hpu-mockup/params/gaussian_64b_fast.toml new file mode 100644 index 000000000..da30451cd --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_64b_fast.toml @@ -0,0 +1,44 @@ +[pbs_params] + lwe_dimension=20 + glwe_dimension=2 + polynomial_size=1024 + lwe_noise_distribution={GaussianStdDev=0.0} + glwe_noise_distribution={GaussianStdDev=0.0} + pbs_base_log=20 + pbs_level=1 + ks_base_log=2 + ks_level=7 + message_width=2 + carry_width=2 + ciphertext_width=64 + +[ntt_params] + core_arch= {GF64=[5,5]} + min_pbs_nb= 10 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 10 + prime_modulus="GF64" + psi= 32 + delta= 5 +[ks_params] + width= 21 + lbx= 2 + lby= 32 + lbz= 3 +[pc_params] + ksk_pc= 4 + ksk_bytes_w= 32 + bsk_pc= 4 + bsk_bytes_w= 32 + pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64.toml b/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64.toml new file mode 100644 index 000000000..d0e54851d --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64.toml @@ -0,0 +1,47 @@ +[pbs_params] + lwe_dimension=804 + glwe_dimension=1 + polynomial_size=2048 + lwe_noise_distribution={GaussianStdDev= 5.963599673924788e-6} + glwe_noise_distribution={GaussianStdDev= 2.8452674713391114e-15} + pbs_base_log= 23 + pbs_level= 1 + ks_base_log= 2 + ks_level= 8 + message_width= 2 + carry_width= 2 + ciphertext_width= 64 + +[ntt_params] + core_arch= {GF64=[5,6]} + min_pbs_nb= 6 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 11 + prime_modulus= "GF64" + psi= 32 + delta= 5 + +[ks_params] + width= 21 + lbx= 3 + lby= 64 + lbz= 3 + +[pc_params] + ksk_pc= 16 + 
ksk_bytes_w= 32 + bsk_pc= 8 + bsk_bytes_w= 32 + pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 + +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64_psi64.toml b/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64_psi64.toml new file mode 100644 index 000000000..83722fcd5 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/gaussian_64b_pfail64_psi64.toml @@ -0,0 +1,48 @@ +[pbs_params] + lwe_dimension=804 + glwe_dimension=1 + polynomial_size=2048 + lwe_noise_distribution={GaussianStdDev= 5.963599673924788e-6} + glwe_noise_distribution={GaussianStdDev= 2.8452674713391114e-15} + pbs_base_log= 23 + pbs_level= 1 + ks_base_log= 2 + ks_level= 8 + message_width= 2 + carry_width= 2 + ciphertext_width= 64 + opportunistic=true + +[ntt_params] + core_arch= {GF64=[5,6]} + min_pbs_nb= 11 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 11 + prime_modulus= "GF64" + psi= 64 + delta= 5 + +[ks_params] + width= 21 + lbx= 3 + lby= 64 + lbz= 3 + +[pc_params] + ksk_pc= 16 + ksk_bytes_w= 32 + bsk_pc= 8 + bsk_bytes_w= 32 + pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 + +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml b/mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml new file mode 100644 index 000000000..98e5690c1 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/tuniform_64b_fast.toml @@ -0,0 +1,44 @@ +[pbs_params] + lwe_dimension=20 + glwe_dimension=1 + polynomial_size=2048 + lwe_noise_distribution={TUniformBound= 0} + glwe_noise_distribution={TUniformBound= 0} + pbs_base_log=23 + pbs_level=1 + ks_base_log=2 + ks_level=7 + message_width=2 + carry_width=2 + ciphertext_width=64 + +[ntt_params] + core_arch= {GF64=[5,6]} + min_pbs_nb= 10 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 11 + prime_modulus="GF64" + psi= 32 + delta= 5 
+[ks_params] + width= 21 + lbx= 2 + lby= 32 + lbz= 3 +[pc_params] + ksk_pc= 4 + ksk_bytes_w= 32 + bsk_pc= 4 + bsk_bytes_w= 32 + pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml b/mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml new file mode 100644 index 000000000..c40e6dc2e --- /dev/null +++ b/mockups/tfhe-hpu-mockup/params/tuniform_64b_pfail64_psi64.toml @@ -0,0 +1,48 @@ +[pbs_params] + lwe_dimension=839 + glwe_dimension=1 + polynomial_size=2048 + lwe_noise_distribution={TUniformBound= 4} + glwe_noise_distribution={TUniformBound= 17} + pbs_base_log= 23 + pbs_level= 1 + ks_base_log= 2 + ks_level= 7 + message_width= 2 + carry_width= 2 + ciphertext_width= 64 + opportunistic=true + +[ntt_params] + core_arch= {GF64=[5,6]} + min_pbs_nb= 11 + batch_pbs_nb= 12 + total_pbs_nb= 32 + ct_width= 64 + radix= 2 + stg_nb= 11 + prime_modulus= "GF64" + psi= 64 + delta= 5 + +[ks_params] + width= 21 + lbx= 3 + lby= 64 + lbz= 3 + +[pc_params] + ksk_pc= 16 + ksk_bytes_w= 32 + bsk_pc= 8 + bsk_bytes_w= 32 + pem_pc= 2 + pem_bytes_w= 32 + glwe_bytes_w= 32 + +[regf_params] + reg_nb= 64 + coef_nb= 32 +[isc_params] + min_iop_size= 4 + depth= 64 diff --git a/mockups/tfhe-hpu-mockup/src/ipc.rs b/mockups/tfhe-hpu-mockup/src/ipc.rs new file mode 100644 index 000000000..a082641d1 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/ipc.rs @@ -0,0 +1,102 @@ +//! Manage binding with tfhe-rs application +//! +//! Create a file with binding information and automatically refresh it +//! 
when connection close + +use ipc_channel::ipc; +use tfhe::tfhe_hpu_backend::prelude::*; + +pub struct Ipc { + ipc_name: String, + ipc: IpcSim, +} + +impl Ipc { + pub fn new(ipc_name: &str) -> Self { + Self { + ipc_name: ipc_name.to_string(), + ipc: IpcSim::new_bind_on(ipc_name), + } + } + + fn ipc_reset(&mut self) { + let new_ipc = IpcSim::new_bind_on(&self.ipc_name); + self.ipc = new_ipc; + } + + /// Recv next register request if any + pub fn register_req(&mut self) -> Option { + let req = { + let IpcSim { register, .. } = &self.ipc; + let RegisterSim { req, .. } = register; + req + }; + + match req.try_recv() { + Ok(cmd) => { + tracing::trace!("RegisterReq Recv {cmd:x?}"); + Some(cmd) + } + Err(err) => match &err { + ipc::TryRecvError::IpcError(kind) => match kind { + ipc::IpcError::Disconnected => { + self.ipc_reset(); + None + } + _ => panic!("Encounter Ipc error {err:?}"), + }, + ipc::TryRecvError::Empty => None, + }, + } + } + + /// Send register ack + pub fn register_ack(&mut self, ack: RegisterAck) { + let ack_tx = { + let IpcSim { register, .. } = &self.ipc; + let RegisterSim { ack, .. } = register; + ack + }; + + // Silently drop error + let _ = ack_tx.send(ack); + } + + /// Recv next memory request if any + pub fn memory_req(&mut self) -> Option { + let req = { + let IpcSim { memory, .. } = &self.ipc; + let MemorySim { req, .. } = memory; + req + }; + + match req.try_recv() { + Ok(cmd) => { + tracing::trace!("MemoryReq recv {cmd:x?}"); + Some(cmd) + } + Err(err) => match &err { + ipc::TryRecvError::IpcError(kind) => match kind { + ipc::IpcError::Disconnected => { + self.ipc_reset(); + None + } + _ => panic!("Encounter Ipc error {err:?}"), + }, + ipc::TryRecvError::Empty => None, + }, + } + } + + /// Send memory ack + pub fn memory_ack(&mut self, ack: MemoryAck) { + let ack_tx = { + let IpcSim { memory, .. } = &self.ipc; + let MemorySim { ack, .. 
} = memory; + ack + }; + + // Silently drop error + let _ = ack_tx.send(ack); + } +} diff --git a/mockups/tfhe-hpu-mockup/src/lib.rs b/mockups/tfhe-hpu-mockup/src/lib.rs new file mode 100644 index 000000000..297dfcd4e --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/lib.rs @@ -0,0 +1,906 @@ +#[cfg(feature = "isc-order-check")] +use hpu_asm::dop::ToAsm; +use hpu_asm::PbsLut; +use std::array::from_fn; +use std::collections::VecDeque; +use std::io::Write; +use tfhe::core_crypto::algorithms::{ + lwe_ciphertext_add_assign, lwe_ciphertext_cleartext_mul_assign, lwe_ciphertext_opposite_assign, + lwe_ciphertext_plaintext_add_assign, lwe_ciphertext_plaintext_sub_assign, + lwe_ciphertext_sub_assign, +}; +use tfhe::core_crypto::entities::{ + Cleartext, LweCiphertextOwned, LweCiphertextView, LweKeyswitchKey, NttLweBootstrapKey, + Plaintext, +}; +use tfhe::core_crypto::hpu::glwe_lookuptable::create_hpu_lookuptable; +use tfhe::core_crypto::prelude::*; +use tfhe::shortint::parameters::KeySwitch32PBSParameters; +use tfhe::tfhe_hpu_backend::fw::isc_sim::PeConfigStore; + +mod ipc; +use ipc::Ipc; + +mod modules; +pub use modules::isc; +pub use modules::params::{MockupOptions, MockupParameters}; +use modules::{DdrMem, HbmBank, RegisterEvent, RegisterMap, UCore, HBM_BANK_NB}; + +use tfhe::tfhe_hpu_backend::interface::io_dump::HexMem; +use tfhe::tfhe_hpu_backend::prelude::*; + +use serde_json; + +pub struct HpuSim { + config: HpuConfig, + params: MockupParameters, + options: MockupOptions, + + ipc: Ipc, + regmap: RegisterMap, + + /// On-board memory + hbm_bank: [HbmBank; HBM_BANK_NB], + ddr: DdrMem, + /// On-chip regfile + regfile: Vec>, + /// Program counter + pc: usize, + + /// UCore model + ucore: UCore, + + /// Instruction scheduler + isc: isc::Scheduler, + + // WorkAckq interface ----------------------------------------------------- + workq_stream: VecDeque, + /// Pending Iop + iop_req: VecDeque, + iop_nb: usize, + iop_pdg: VecDeque, + + /// Tfhe server keys + /// Read from memory 
after bsk_avail/ksk_avail register are set + /// Conversion from Hpu->Cpu is costly. Thuse store it in the object to prevent extra + /// computation + /// Also store buffer for ks-pbs computation + sks: Option<( + LweKeyswitchKeyOwned, + LweCiphertextOwned, + NttLweBootstrapKeyOwned, + )>, + + // Execute history -------------------------------------------------------- + #[cfg(feature = "isc-order-check")] + dops_exec_order: Vec, + #[cfg(feature = "isc-order-check")] + dops_check_order: Vec, +} + +impl HpuSim { + pub fn new(config: HpuConfig, params: MockupParameters, options: MockupOptions) -> Self { + // Allocate communication channels + let ipc = { + let name = match config.fpga.ffi { + FFIMode::Sim { ref ipc_name } => ipc_name.expand(), + _ => panic!("Unsupported config type with ffi::sim"), + }; + Ipc::new(&name) + }; + // Allocate register map emulation + let regmap_expanded = config + .fpga + .regmap + .iter() + .map(|f| f.expand()) + .collect::>(); + let regmap_str = regmap_expanded + .iter() + .map(|f| f.as_str()) + .collect::>(); + let regmap = RegisterMap::new(params.rtl_params.clone(), ®map_str); + + // Allocate on-board memory emulation + let hbm_bank: [HbmBank; HBM_BANK_NB] = from_fn(HbmBank::new); + let ddr = DdrMem::new(); + + // Allocate inner regfile and lock abstraction + let regfile = (0..params.rtl_params.regf_params.reg_nb) + .map(|_| HpuLweCiphertextOwned::new(0, params.rtl_params.clone())) + .collect::>(); + + // Allocate Ucore Fw translation + let ucore = UCore::new(config.board.clone()); + + // Allocate InstructionScheduler + // This module is also in charge of performances estimation + let pe_config = PeConfigStore::from((¶ms.rtl_params, &config)); + let isc = isc::Scheduler::new( + params.freq_mhz, + params.quantum_us, + ¶ms.rtl_params.isc_params, + pe_config, + ); + Self { + config, + params, + options, + ipc, + regmap, + hbm_bank, + ddr, + regfile, + pc: 0, + ucore, + isc, + workq_stream: VecDeque::new(), + iop_req: VecDeque::new(), + 
iop_nb: 0, + iop_pdg: VecDeque::new(), + sks: None, + #[cfg(feature = "isc-order-check")] + dops_exec_order: Vec::new(), + #[cfg(feature = "isc-order-check")] + dops_check_order: Vec::new(), + } + } + + pub fn ipc_poll(&mut self) { + loop { + // Flush register requests + while let Some(req) = self.ipc.register_req() { + match req { + RegisterReq::Read { addr } => { + let val = self.regmap.read_reg(addr); + self.ipc.register_ack(RegisterAck::Read(val)); + } + RegisterReq::Write { addr, value } => { + let evt = self.regmap.write_reg(addr, value); + match evt { + RegisterEvent::None => { /* Nothing to do */ } + RegisterEvent::KeyReset => { + // Reset associated key option + self.sks = None; + } + RegisterEvent::WorkQ(word) => { + // Append to workq_stream and try to extract an iop + self.workq_stream.push_back(word); + match hpu_asm::IOp::from_words(&mut self.workq_stream) { + Ok(iop) => self.iop_req.push_back(iop), + Err(_) => { + // not enough data to match + } + } + } + } + self.ipc.register_ack(RegisterAck::Write); + } + RegisterReq::PbsParams => { + self.ipc.register_ack(RegisterAck::PbsParams( + self.params.rtl_params.pbs_params.clone(), + )); + } + } + } + + // Flush memory requests + while let Some(req) = self.ipc.memory_req() { + match req { + MemoryReq::Allocate { mem_kind, size_b } => { + let addr = match mem_kind { + MemKind::Ddr { offset } => { + self.ddr.alloc_at(offset as u64, size_b); + offset as u64 + } + MemKind::Hbm { pc } => self.hbm_bank[pc].alloc(size_b), + }; + self.ipc.memory_ack(MemoryAck::Allocate { addr }); + } + MemoryReq::Sync { + mem_kind, + addr, + mode, + data, + } => match mode { + SyncMode::Host2Device => { + let sw_data = data.expect("No data received on Host2Device sync"); + match mem_kind { + MemKind::Ddr { .. 
} => { + self.ddr.get_mut_chunk(addr).ipc_update(sw_data) + } + MemKind::Hbm { pc } => { + self.hbm_bank[pc].get_mut_chunk(addr).ipc_update(sw_data) + } + } + // Generate ack + self.ipc.memory_ack(MemoryAck::Sync { data: None }); + } + SyncMode::Device2Host => { + assert!(data.is_none(), "Received data on Device2Host sync"); + + // Read data + let hw_data = match mem_kind { + MemKind::Ddr { .. } => self.ddr.get_mut_chunk(addr).ipc_wrap(), + MemKind::Hbm { pc } => self.hbm_bank[pc].get_chunk(addr).ipc_wrap(), + }; + + // Generate ack + self.ipc.memory_ack(MemoryAck::Sync { + data: Some(hw_data), + }); + } + }, + MemoryReq::Release { mem_kind, addr } => { + match mem_kind { + MemKind::Ddr { .. } => { + let _ = self.ddr.rm_chunk(addr); + } + MemKind::Hbm { pc } => { + let _ = self.hbm_bank[pc].rm_chunk(addr); + } + }; + self.ipc.memory_ack(MemoryAck::Release); + } + } + } + + // Issue IOp requests to isc + while let Some(iop) = self.iop_req.pop_front() { + let (dops, dops_patched) = + self.ucore + .translate(&self.ddr, self.hbm_bank.as_slice(), &iop); + + // Write required input material if needed + if let Some(dump_path) = self.options.dump_out.as_ref() { + let iopcode = iop.opcode().0; + + // Generate IOp file + let asm_p = format!("{dump_path}/iop/iop_{}.asm", self.iop_nb); + let hex_p = format!("{dump_path}/iop/iop_{}.hex", self.iop_nb); + let mut iop_prog = hpu_asm::Program::default(); + iop_prog.push_comment(format!("{}", iop)); + iop_prog.push_stmt(iop.clone()); + iop_prog.write_asm(&asm_p).unwrap(); + iop_prog.write_hex(&hex_p).unwrap(); + self.iop_nb += 1; + + // Generate DOps file + // TODO find a proper way to add the header back + let asm_p = format!("{dump_path}/dop/dop_{iopcode:0>2x}.asm"); + let hex_p = format!("{dump_path}/dop/dop_{iopcode:0>2x}.hex"); + let dop_prog = hpu_asm::Program::new( + dops.iter() + .map(|op| hpu_asm::AsmOp::Stmt(op.clone())) + .collect::>(), + ); + dop_prog.write_asm(&asm_p).unwrap(); + dop_prog.write_hex(&hex_p).unwrap(); + 
// Generate patched DOps file + let asm_patched_p = format!("{dump_path}/dop/dop_patched_{iopcode:0>2x}.asm"); + let hex_patched_p = format!("{dump_path}/dop/dop_patched_{iopcode:0>2x}.hex"); + let dop_patched_prog = hpu_asm::Program::new( + dops_patched + .iter() + .map(|op| hpu_asm::AsmOp::Stmt(op.clone())) + .collect::>(), + ); + dop_patched_prog.write_asm(&asm_patched_p).unwrap(); + dop_patched_prog.write_hex(&hex_patched_p).unwrap(); + } + + // Use to check correct scheduling at runtime + #[cfg(feature = "isc-order-check")] + self.dops_check_order + .extend_from_slice(dops_patched.as_slice()); + + // Push associated dops to scheduler + self.isc.insert_dops(dops_patched); + self.iop_pdg.push_back(iop); + } + + // Advance simulation for quantum_us time + // Quantum is used here to keep the mockup responsive to IPC + if !self.iop_pdg.is_empty() { + let bpip_timeout = if self.regmap.bpip_state().used { + Some(self.regmap.bpip_state().timeout) + } else { + None + }; + let dops_exec = self.isc.schedule(bpip_timeout); + for dop in dops_exec { + self.exec(&dop); + #[cfg(feature = "isc-order-check")] + { + self.check_order(&dop); + self.dops_exec_order.push(dop); + } + } + } + } + } +} + +impl HpuSim { + fn exec(&mut self, dop: &hpu_asm::DOp) { + tracing::debug!("Simulate execution of DOp: {dop:?}[@{}]", self.pc); + + // Read operands + match dop { + // Invariant path, handle independently from `nops` flag + hpu_asm::DOp::SYNC(_) => { + // Push ack in stream + let iop = self + .iop_pdg + .pop_front() + .expect("SYNC received but no pending IOp to acknowledge"); + // Answer with IOp header + let iop_header_u32 = iop.to_words()[0]; + self.regmap.ack_pdg(iop_header_u32); + + // Generate executed DOp order + #[cfg(feature = "isc-order-check")] + if let Some(dump_path) = self.options.dump_out.as_ref() { + let iopcode = iop.opcode().0; + + let asm_p = format!("{dump_path}/dop/dop_executed_{iopcode:0>2x}.asm"); + let hex_p = 
format!("{dump_path}/dop/dop_executed_{iopcode:0>2x}.hex"); + let dop_prog = hpu_asm::Program::new( + self.dops_exec_order + .iter() + .map(|op| hpu_asm::AsmOp::Stmt(op.clone())) + .collect::>(), + ); + dop_prog.write_asm(&asm_p).unwrap(); + dop_prog.write_hex(&hex_p).unwrap(); + } + + // Generate report + let time_rpt = self.isc.time_report(); + let dop_rpt = self.isc.dop_report(); + let pe_rpt = self.isc.pe_report(); + tracing::info!("Report for IOp: {}", iop); + tracing::info!("{time_rpt:?}"); + tracing::info!("{dop_rpt}"); + tracing::info!("{pe_rpt}"); + + if let Some(mut rpt_file) = self.options.report_file((&iop).into()) { + writeln!(rpt_file, "Report for IOp: {}", iop).unwrap(); + writeln!(rpt_file, "{time_rpt:?}").unwrap(); + writeln!(rpt_file, "{dop_rpt}").unwrap(); + writeln!(rpt_file, "{pe_rpt}").unwrap(); + } + + let trace = self.isc.reset_trace(); + trace.iter().for_each(|pt| tracing::trace!("{pt}")); + if let Some(mut trace_file) = self.options.report_trace((&iop).into()) { + let json_string = + serde_json::to_string(&trace).expect("Could not serialize trace"); + writeln!(trace_file, "{}", json_string).unwrap(); + } + } + // Skipped with `nops` flag + _ => { + if !self.options.nops { + match dop { + hpu_asm::DOp::LD(op_impl) => { + let dst = &mut self.regfile[op_impl.0.rid.0 as usize]; + let cid_ofst = match op_impl.0.slot { + hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize, + _ => panic!("Template must have been resolved before execution"), + }; + + // Ct_ofst is equal over PC + let ct_ofst = cid_ofst + * page_align( + hpu_big_lwe_ciphertext_size(&self.params.rtl_params) + .div_ceil(self.params.rtl_params.pc_params.pem_pc) + * std::mem::size_of::(), + ); + let ct_chunk = self + .config + .board + .ct_pc + .iter() + .enumerate() + .map(|(id, mem_kind)| { + let ldst_ofst = { + let (msb, lsb) = self.regmap.addr_offset().ldst[id]; + ((msb as u64) << 32) + lsb as u64 + }; + match mem_kind { + MemKind::Ddr { .. 
} => { + self.ddr.get_chunk(ldst_ofst + ct_ofst as u64).data() + } + MemKind::Hbm { pc } => self.hbm_bank[*pc] + .get_chunk(ldst_ofst + ct_ofst as u64) + .data(), + } + // self.hbm_bank[*pc].get_chunk(ldst_ofst + ct_ofst as u64) + }) + .collect::>(); + + let hw_slice = dst.as_mut_view().into_container(); + std::iter::zip(hw_slice, ct_chunk).for_each(|(hpu, mem)| { + // NB: Chunk are extended to enforce page align buffer + // -> To prevent error during copy, with shrink the mem buffer to + // the real size before-hand + let size_b = std::mem::size_of_val(hpu); + let hbm_u64 = bytemuck::cast_slice::(&mem[0..size_b]); + hpu.clone_from_slice(hbm_u64); + }); + } + + hpu_asm::DOp::ST(op_impl) => { + let src = &self.regfile[op_impl.0.rid.0 as usize]; + let cid_ofst = match op_impl.0.slot { + hpu_asm::MemId::Addr(ct_id) => ct_id.0 as usize, + _ => panic!("Template must have been resolved before execution"), + }; + + // Ct_ofst is equal over PC + let ct_ofst = cid_ofst + * page_align( + hpu_big_lwe_ciphertext_size(&self.params.rtl_params) + .div_ceil(self.params.rtl_params.pc_params.pem_pc) + * std::mem::size_of::(), + ); + src.as_view() + .into_container() + .into_iter() + .enumerate() + .for_each(|(id, hpu)| { + let ldst_ofst = { + let (msb, lsb) = self.regmap.addr_offset().ldst[id]; + ((msb as u64) << 32) + lsb as u64 + }; + let ct_chunk_mut_view = match self.config.board.ct_pc[id] { + MemKind::Ddr { .. 
} => self + .ddr + .get_mut_chunk(ldst_ofst + ct_ofst as u64) + .data_mut(), + MemKind::Hbm { pc } => self.hbm_bank[pc] + .get_mut_chunk(ldst_ofst + ct_ofst as u64) + .data_mut(), + }; + // NB: hbm chunk are extended to enforce page align buffer + // -> Shrunk it to slice size to prevent error during copy + let size_b = std::mem::size_of_val(hpu); + + let ct_chunk_u64 = bytemuck::cast_slice_mut::( + &mut ct_chunk_mut_view[0..size_b], + ); + ct_chunk_u64.copy_from_slice(hpu); + }); + } + + hpu_asm::DOp::ADD(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid); + let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid); + lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::SUB(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid); + let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid); + lwe_ciphertext_sub_assign(&mut cpu_s0, &cpu_s1); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::MAC(op_impl) => { + // NB: Srcs are used as destination to prevent useless allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src0_rid); + let cpu_s1 = self.reg2cpu(op_impl.0.src1_rid); + + lwe_ciphertext_cleartext_mul_assign( + &mut cpu_s0, + Cleartext(op_impl.0.mul_factor.0 as u64), + ); + lwe_ciphertext_add_assign(&mut cpu_s0, &cpu_s1); + + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::ADDS(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid); + let msg_cst = match op_impl.0.msg_cst { + hpu_asm::ImmId::Cst(cst) => cst as u64, + _ => panic!("Template must have been resolved before execution"), + }; + let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta(); + lwe_ciphertext_plaintext_add_assign( + 
&mut cpu_s0, + Plaintext(msg_encoded), + ); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::SUBS(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid); + let msg_cst = match op_impl.0.msg_cst { + hpu_asm::ImmId::Cst(cst) => cst as u64, + _ => panic!("Template must have been resolved before execution"), + }; + let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta(); + lwe_ciphertext_plaintext_sub_assign( + &mut cpu_s0, + Plaintext(msg_encoded), + ); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::SSUB(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid); + lwe_ciphertext_opposite_assign(&mut cpu_s0); + let msg_cst = match op_impl.0.msg_cst { + hpu_asm::ImmId::Cst(cst) => cst as u64, + _ => panic!("Template must have been resolved before execution"), + }; + let msg_encoded = msg_cst * self.params.rtl_params.pbs_params.delta(); + lwe_ciphertext_plaintext_add_assign( + &mut cpu_s0, + Plaintext(msg_encoded), + ); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::MULS(op_impl) => { + // NB: The first src is used as destination to prevent useless + // allocation + let mut cpu_s0 = self.reg2cpu(op_impl.0.src_rid); + let msg_cst = match op_impl.0.msg_cst { + hpu_asm::ImmId::Cst(cst) => cst as u64, + _ => panic!("Template must have been resolved before execution"), + }; + lwe_ciphertext_cleartext_mul_assign(&mut cpu_s0, Cleartext(msg_cst)); + self.cpu2reg(op_impl.0.dst_rid, cpu_s0.as_view()); + } + hpu_asm::DOp::PBS(op_impl) => self.apply_pbs2reg( + 1, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML2(op_impl) => self.apply_pbs2reg( + 2, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML4(op_impl) => self.apply_pbs2reg( + 4, + 
op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML8(op_impl) => self.apply_pbs2reg( + 8, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_F(op_impl) => self.apply_pbs2reg( + 1, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML2_F(op_impl) => self.apply_pbs2reg( + 2, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML4_F(op_impl) => self.apply_pbs2reg( + 4, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + hpu_asm::DOp::PBS_ML8_F(op_impl) => self.apply_pbs2reg( + 8, + op_impl.0.dst_rid, + op_impl.0.src_rid, + op_impl.0.gid, + ), + _ => panic!("Error: {dop:?} must have been handled by invariant path"), + } + } + } + } + + // Dump operation src/dst in file if required + if !self.options.nops { + self.dump_op_reg(dop); + } + + // Increment program counter + self.pc += 1; + } + + /// Compute dst_rid <- Pbs(src_rid, lut) + /// Use a function to prevent code duplication in PBS/PBS_F implementation + /// NB: Current Pbs lookup function arn't reverted from Hbm memory + /// TODO: Read PbsLut from Hbm instead of online generation based on Pbs Id + fn apply_pbs2reg( + &mut self, + opcode_lut_nb: u8, + dst_rid: hpu_asm::RegId, + src_rid: hpu_asm::RegId, + gid: hpu_asm::PbsGid, + ) { + let mut cpu_reg = self.reg2cpu(src_rid); + let lut = hpu_asm::Pbs::from_hex(gid).expect("Invalid PBS Gid"); + // TODO use an assert or a simple warning + // In practice, hardware apply the LUT but extract only opcode_lut_nb Ct + assert_eq!( + lut.lut_nb(), + opcode_lut_nb, + "ERROR: Mismatch between PBS ML configuration and selected Lut." 
+ ); + + // Generate Lut + let hpu_lut = create_hpu_lookuptable(self.params.rtl_params.clone(), &lut); + let mut tfhe_lut = hpu_lut.as_view().into(); + + // Get keys and computation buffer + let (ksk, ref mut bfr_after_ks, bsk) = self.get_server_key(); + + // TODO add a check on trivialness for fast simulation ? + keyswitch_lwe_ciphertext_with_scalar_change(ksk, &cpu_reg, bfr_after_ks); + blind_rotate_ntt64_bnf_assign(bfr_after_ks, &mut tfhe_lut, &bsk); + + assert_eq!( + dst_rid.0, + (dst_rid.0 >> lut.lut_lg()) << lut.lut_lg(), + "Pbs destination register must be aligned with lut size" + ); + + // Compute ManyLut function stride + let fn_stride = { + let pbs_p = &self.params.rtl_params.pbs_params; + let modulus_sup = 1_usize << pbs_p.message_width + pbs_p.carry_width; + let box_size = pbs_p.polynomial_size / modulus_sup; + // Max valid degree for a ciphertext when using the LUT we generate + // If MaxDegree == 1, we can have two input values 0 and 1, so we need MaxDegree + 1 + // boxes + let max_degree = modulus_sup / lut.lut_nb() as usize; + max_degree * box_size + }; + + for fn_idx in 0..lut.lut_nb() as usize { + let monomial_degree = MonomialDegree(fn_idx * fn_stride); + extract_lwe_sample_from_glwe_ciphertext(&tfhe_lut, &mut cpu_reg, monomial_degree); + self.cpu2reg(hpu_asm::RegId(dst_rid.0 + fn_idx as u8), cpu_reg.as_view()); + } + } + + // NB: to prevent issues with borrow checker we have to clone the value from + // the regfile. 
A clone is also required for conversion + // Thus, directly cast value in Cpu version to prevent extra clone + /// Extract a cpu value from register file + fn reg2cpu(&self, reg_id: hpu_asm::RegId) -> LweCiphertextOwned { + let reg = self.regfile[reg_id.0 as usize].as_view(); + LweCiphertextOwned::from(reg) + } + + /// Insert a cpu value into the register file + fn cpu2reg(&mut self, reg_id: hpu_asm::RegId, cpu: LweCiphertextView) { + let hpu = HpuLweCiphertextOwned::::create_from(cpu, self.params.rtl_params.clone()); + std::iter::zip( + self.regfile[reg_id.0 as usize] + .as_mut_view() + .into_container(), + hpu.into_container(), + ) + .for_each(|(reg, hpu)| { + reg.copy_from_slice(hpu.as_slice()); + }); + } + + /// Get the inner server key used for computation + /// Check the register state and extract sks from memory if needed + fn get_server_key( + &mut self, + ) -> ( + &LweKeyswitchKeyOwned, + &mut LweCiphertextOwned, + &NttLweBootstrapKeyOwned, + ) { + if self.sks.is_none() { + tracing::debug!("Reload Bsk/Ksk from memory"); + assert!( + self.regmap.bsk_state().is_avail(), + "Bsk avail bit was not set. Hw will hang on Pbs computation, Mockup panic instead" + ); + assert!( + self.regmap.ksk_state().is_avail(), + "Ksk avail bit was not set. Hw will hang on Pbs computation, Mockup panic instead" + ); + + // Extract HpuBsk /HpuKsk from hbm + let hpu_bsk = { + // Create Hpu Bsk container + let mut bsk = HpuLweBootstrapKeyOwned::new(0, self.params.rtl_params.clone()); + + // Copy content from Hbm + let hw_slice = bsk.as_mut_view().into_container(); + std::iter::zip(hw_slice, self.config.board.bsk_pc.iter()) + .enumerate() + .for_each(|(id, (hpu, mem_kind))| { + let bank = match mem_kind { + MemKind::Ddr { .. 
} => panic!( + "Error: Key could not be allocated in Dddr for performance reasons" + ), + MemKind::Hbm { pc } => &self.hbm_bank[*pc], + }; + let ofst = { + let (msb, lsb) = self.regmap.addr_offset().bsk[id]; + ((msb as usize) << 32) + lsb as usize + }; + bank.read_across_chunk(ofst, hpu); + }); + bsk + }; + let hpu_ksk = { + // Create Hpu ksk container + let mut ksk = HpuLweKeyswitchKeyOwned::new(0, self.params.rtl_params.clone()); + + // Copy content from Hbm + let hw_slice = ksk.as_mut_view().into_container(); + std::iter::zip(hw_slice, self.config.board.ksk_pc.iter()) + .enumerate() + .for_each(|(id, (hpu, mem_kind))| { + let bank = match mem_kind { + MemKind::Ddr { .. } => panic!( + "Error: Key could not be allocated in Dddr for performance reasons" + ), + MemKind::Hbm { pc } => &self.hbm_bank[*pc], + }; + let ofst = { + let (msb, lsb) = self.regmap.addr_offset().ksk[id]; + ((msb as usize) << 32) + lsb as usize + }; + bank.read_across_chunk(ofst, hpu); + }); + ksk + }; + // Allocate Pbs intermediate buffer + let pbs_p = KeySwitch32PBSParameters::from(self.params.rtl_params.clone()); + let bfr_after_ks = LweCiphertext::new( + 0, + pbs_p.lwe_dimension.to_lwe_size(), + pbs_p.post_keyswitch_ciphertext_modulus(), + ); + + // Construct Cpu server_key version + let cpu_bsk = NttLweBootstrapKey::from(hpu_bsk.as_view()); + let cpu_ksk = LweKeyswitchKey::from(hpu_ksk.as_view()); + self.sks = Some((cpu_ksk, bfr_after_ks, cpu_bsk)); + } + let (ksk, bfr, bsk) = self.sks.as_mut().unwrap(); + (ksk, bfr, bsk) + } +} + +impl HpuSim { + fn dump_op_reg(&self, op: &hpu_asm::DOp) { + if self.options.dump_out.is_some() && self.options.dump_reg { + let dump_out = self.options.dump_out.as_ref().unwrap(); + + // Dump register value + let regid = match op { + hpu_asm::DOp::LD(op_impl) => op_impl.0.rid.0 as usize, + hpu_asm::DOp::ST(op_impl) => op_impl.0.rid.0 as usize, + hpu_asm::DOp::ADDS(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::SUBS(op_impl) => op_impl.0.dst_rid.0 as 
usize, + hpu_asm::DOp::SSUB(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::MULS(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::ADD(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::SUB(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::MAC(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML2(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML4(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML8(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_F(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML2_F(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML4_F(op_impl) => op_impl.0.dst_rid.0 as usize, + hpu_asm::DOp::PBS_ML8_F(op_impl) => op_impl.0.dst_rid.0 as usize, + _ => return, + }; + let regf = self.regfile[regid].as_view(); + + // Create base-path + let base_path = format!("{}/blwe/run/blwe_isc{}_reg", dump_out, self.pc,); + self.dump_regf(regf, &base_path); + } + } + + /// Dump associated regf value in a file + fn dump_regf(&self, regf: HpuLweCiphertextView, base_path: &str) { + // Iterate over slice + regf.into_container() + .iter() + .enumerate() + .for_each(|(i, slice)| { + // Create file-path + let file_path = format!("{base_path}_{:0>1x}.hex", i); + let mut wr_f = MockupOptions::open_wr_file(&file_path); + + writeln!(&mut wr_f, "# LweCiphertext slice #{}", i).unwrap(); + // Compact Blwe on 32b if possible + if self.params.rtl_params.ntt_params.ct_width <= u32::BITS { + let slice_32b = slice.iter().map(|x| *x as u32).collect::>(); + slice_32b.as_slice().write_hex( + &mut wr_f, + self.params.rtl_params.pc_params.pem_bytes_w, + Some("XX"), + ); + } else { + slice.write_hex( + &mut wr_f, + self.params.rtl_params.pc_params.pem_bytes_w, + Some("XX"), + ); + } + }); + } +} + +#[cfg(feature = "isc-order-check")] +impl HpuSim { + /// Check for RAW/WAR violation at runtime + fn check_order(&mut 
self, exec_dop: &hpu_asm::DOp) { + let exec_pos = self + .dops_check_order + .iter() + .enumerate() + .filter(|(_i, d)| exec_dop == *d) + .map(|(i, _d)| i) + .collect::>()[0]; + + // Check collision with all DOp before + for dop in self.dops_check_order[0..exec_pos].iter() { + // Read after Write check + let raw_err = exec_dop + .dst() + .into_iter() + .flat_map(|dst| dop.src().into_iter().map(move |src| dst == src)) + .fold(false, |acc, cur| acc || cur); + + // Write after read check + // Mainly associated register is read before the expected write + let war_err = dop + .dst() + .into_iter() + .flat_map(|dst| exec_dop.src().into_iter().map(move |src| dst == src)) + .fold(false, |acc, cur| acc || cur); + + // Write after write check + let waw_err = dop + .dst() + .into_iter() + .flat_map(|dst| exec_dop.dst().into_iter().map(move |edst| dst == edst)) + .fold(false, |acc, cur| acc || cur); + + if raw_err { + tracing::warn!("RAW_ERR {} -> {}", exec_dop, dop); + } + if war_err { + tracing::warn!("WAR_ERR {} -> {}", exec_dop, dop); + } + if waw_err { + tracing::warn!("WAW_ERR {} -> {}", exec_dop, dop); + } + } + + // Remove exec_dop from the list + self.dops_check_order.remove(exec_pos); + } +} diff --git a/mockups/tfhe-hpu-mockup/src/mockup.rs b/mockups/tfhe-hpu-mockup/src/mockup.rs new file mode 100644 index 000000000..e1ee2b7f1 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/mockup.rs @@ -0,0 +1,174 @@ +//! Hpu Simulation mockup +//! Emulate Hpu behavior for simulation +//! Enable to test tfhe-rs application that required tfhe-hpu-backend without the real hardware. +//! It rely on the `ffi-sim` interface of `tfhe-hpu-backend` and on ipc-channel for communication +//! +//! 
WARN: User must start the HpuSim mockup before tfhe-rs application + +use std::fs::OpenOptions; +use std::path::Path; + +use tfhe::tfhe_hpu_backend::prelude::*; +use tfhe_hpu_mockup::{HpuSim, MockupOptions, MockupParameters}; + +/// Define CLI arguments +use clap::Parser; +#[derive(clap::Parser, Debug, Clone)] +#[clap(long_about = "Hpu Simulation mockup.")] +pub struct Args { + // Configuration ---------------------------------------------------- + /// Fpga fake configuration + /// Toml file similar to the one used with the real hpu-backend + /// Enable to retrieved ipc_name, register_file and board definition + #[clap( + long, + value_parser, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + /// Hpu rtl parameters + /// Enable to retrieved the associated tfhe-rs parameters and other Rtl parameters + #[clap( + long, + value_parser, + default_value = "${HPU_MOCKUP_DIR}/params/gaussian_64b_fast.toml" + )] + pub params: ShellString, + + // Override params -------------------------------------------------- + // Quick way to override parameters through ClI instead of editing the + // configuration file + // Used to override some parameters at runtime + /// Override Number of Register + #[clap(long, value_parser)] + register: Option, + + /// Override HPU lookahead buffer depth + /// Number of instruction that are considered in advance + #[clap(long, value_parser)] + isc_depth: Option, + + // Simulation configuration ----------------------------------------- + /// Frequency in MHz + /// Only use for report display + #[clap(long, value_parser, default_value_t = 300)] + freq_mhz: usize, + + /// Simulation quantum in micro_seconds. + /// Maximum simulation time drift between mockup and backend + #[clap(long, value_parser, default_value_t = 1_000_000)] + quantum_us: usize, + + /// Simulation bypass. + /// Disable execution, obviously led to incorrect behavior but accurate + /// performance estimation. 
+ /// For correct behavior (but false perf estimation) use a "fast" parameter set + #[clap(long, value_parser)] + nops: bool, + + // Dump configuration ---------------------------------------------------- + // Use to activate some dump features for the generation of simulation stimulus + /// Specify simulus dump folder. + /// NB: The Rtl stimulus (i.e. Input/Output, Keys, Lut) should be generated by the client. + /// Only used to dump IOp/DOp and the inner register values (c.f. dump-reg) + #[clap(long, value_parser)] + dump_out: Option, + + /// Activate the dump of intermediate register value. Only work if dump-out is also specified + #[clap(long, value_parser)] + dump_reg: bool, + + // Reports configuration ------------------------------------------------- + // Use to activate some performances reports + /// Specify reports dump folder. When not specified, no reports were generated + #[clap(long, value_parser)] + report_out: Option, + + /// Activate the execution trace export for later analysis + #[clap(long, value_parser)] + report_trace: bool, + + // Log configuration ------------------------------------------------- + /// Write trace message in the file (instead of on stdio) + /// NB: Use RUST_LOG env variable to set the verbosity + #[clap(long, value_parser)] + log_out: Option, +} + +impl From<&Args> for MockupOptions { + fn from(args: &Args) -> Self { + Self { + dump_out: args.dump_out.clone(), + dump_reg: args.dump_reg, + report_out: args.report_out.clone(), + report_trace: args.report_trace, + nops: args.nops, + } + } +} + +fn main() { + let args = Args::parse(); + println!("User Options: {args:?}"); + + let options = MockupOptions::from(&args); + + // Configure traging ------------------------------------------------- + // Tracing subscriber rely on env-filter + // Select verbosity with env_var: e.g. 
`RUST_LOG=Alu=trace` + let dflt_tracer = tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time(); + + if let Some(file) = args.log_out { + // Open file + // Create path + let path = Path::new(&file); + if let Some(dir_p) = path.parent() { + std::fs::create_dir_all(dir_p).unwrap(); + } + + // Open file + let wr_f = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path) + .unwrap(); + + // Build & register the subscriber to print in a file + dflt_tracer.json().with_writer(wr_f).init(); + } else { + // Build & register the subscriber to display on stdio + dflt_tracer.compact().init(); + } + + // Load parameters from configuration file ------------------------------------ + let config = HpuConfig::from_toml(&args.config.expand()); + let params = { + let mut rtl_params = HpuParameters::from_toml(&args.params.expand()); + + // Override some parameters if required + if let Some(register) = args.register.as_ref() { + rtl_params.regf_params.reg_nb = *register; + } + if let Some(isc_depth) = args.isc_depth.as_ref() { + rtl_params.isc_params.depth = *isc_depth; + } + MockupParameters { + freq_mhz: args.freq_mhz, + quantum_us: args.quantum_us, + rtl_params, + } + }; + println!("Mockup parameters after override with CLI: {params:?}"); + + // Start mockup --------------------------------------------------------------- + let mut hpu_sim = HpuSim::new(config, params, options); + hpu_sim.ipc_poll(); +} diff --git a/mockups/tfhe-hpu-mockup/src/modules/memory/ddr.rs b/mockups/tfhe-hpu-mockup/src/modules/memory/ddr.rs new file mode 100644 index 000000000..69174afaf --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/modules/memory/ddr.rs @@ -0,0 +1,65 @@ +use std::collections::HashMap; + +use super::MemChunk; + +#[allow(unused)] +const DDR_SIZE_B: usize = 4 * 1024 * 1024 * 1024; + 
+pub(crate) struct DdrMem { + chunk: HashMap, +} + +impl DdrMem { + pub fn new() -> Self { + Self { + chunk: HashMap::new(), + } + } + pub(crate) fn alloc_at(&mut self, paddr: u64, size_b: usize) { + // Check that required chunk is in the Ddr range + assert!( + ((paddr as usize + size_b) < DDR_SIZE_B), + "Error: Required chunk @0x{paddr:x}[0x{size_b}] is out of Ddr range [0x0, 0x{DDR_SIZE_B}]" + ); + + // Check collision with other chunk + // It's not an hard error on real hardware, but handle it like this in the simulation + // In any case multiple view of the same memory was not a good idea and could led to + // hard to debug issues + let clash = self + .chunk + .iter() + .filter(|(_addr, chunk)| paddr < (chunk.paddr + chunk.size_b as u64)) + .filter(|(_addr, chunk)| (paddr + size_b as u64) > chunk.paddr) + .map(|(_addr, chunk)| chunk) + .collect::>(); + clash.iter().for_each(|chunk| { + tracing::debug!( + "Required Ddr allocation collide with chunk @0x{:x}[0x{:x}]", + chunk.paddr, + chunk.size_b + ) + }); + assert!( + clash.is_empty(), + "Error: Ddr allocation @0x{paddr:x}[0x{size_b:x}] has {} collision. 
This is likely linked to the absence of a proper HpuDevice release in previous execution.", + clash.len() + ); + + // allocate chunk and register it in hashmap + let chunk = MemChunk::new(paddr, size_b); + self.chunk.insert(paddr, chunk); + } + + pub(crate) fn get_chunk(&self, addr: u64) -> &MemChunk { + self.chunk.get(&addr).unwrap() + } + + pub(crate) fn get_mut_chunk(&mut self, addr: u64) -> &mut MemChunk { + self.chunk.get_mut(&addr).unwrap() + } + + pub(crate) fn rm_chunk(&mut self, addr: u64) -> Option { + self.chunk.remove(&addr) + } +} diff --git a/mockups/tfhe-hpu-mockup/src/modules/memory/hbm.rs b/mockups/tfhe-hpu-mockup/src/modules/memory/hbm.rs new file mode 100644 index 000000000..13ba2d966 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/modules/memory/hbm.rs @@ -0,0 +1,109 @@ +use std::collections::HashMap; + +use super::MemChunk; + +#[allow(unused)] +pub const HBM_BANK_NB: usize = 64; +#[allow(unused)] +const HBM_BANK_SIZE_B: usize = 512 * 1024 * 1024; +#[allow(unused)] +const MEM_PAGE_SIZE_B: usize = 4096; + +pub const HBM_BSK_PC_MAX: usize = 16; +pub const HBM_KSK_PC_MAX: usize = 16; + +// WARN: XRT currently not support allocation greater than 16MiB +const HBM_CHUNK_SIZE_B: usize = 16 * 1024 * 1024; + +pub(crate) struct HbmBank { + pc_id: usize, + chunk: HashMap, +} + +impl HbmBank { + pub fn new(pc_id: usize) -> Self { + Self { + pc_id, + chunk: HashMap::new(), + } + } + #[allow(unused)] + pub fn get_pc(&mut self) -> usize { + self.pc_id + } + + pub(crate) fn alloc(&mut self, size_b: usize) -> u64 { + assert!( + size_b <= HBM_CHUNK_SIZE_B, + "XRT don't support allocation greater than {HBM_CHUNK_SIZE_B} Bytes." 
+ ); + + // Compute next paddr + let paddr = if let Some(key) = self.chunk.keys().max() { + let chunk = &self.chunk[key]; + if (chunk.size_b % MEM_PAGE_SIZE_B) != 0 { + chunk.paddr + (((chunk.size_b / MEM_PAGE_SIZE_B) + 1) * MEM_PAGE_SIZE_B) as u64 + } else { + chunk.paddr + ((chunk.size_b / MEM_PAGE_SIZE_B) * MEM_PAGE_SIZE_B) as u64 + } + } else { + 0 + }; + + // allocate chunk and register it in hashmap + let chunk = MemChunk::new(paddr, size_b); + self.chunk.insert(paddr, chunk); + + paddr + } + + pub(crate) fn get_chunk(&self, addr: u64) -> &MemChunk { + self.chunk.get(&addr).unwrap() + } + + pub(crate) fn get_mut_chunk(&mut self, addr: u64) -> &mut MemChunk { + self.chunk.get_mut(&addr).unwrap() + } + + pub(crate) fn rm_chunk(&mut self, addr: u64) -> Option { + self.chunk.remove(&addr) + } + + /// Read data slice from multiple chunk + /// WARN: To circumvent an XRT limitation with huge buffer, Key's memory are allocated with + /// multiple slot of MEM_CHUNK_SIZE_B (i.e. Currently 16MiB) This is abstracted by the + /// HugeMemory in tfhe-hpu-backend Mimics the logic here to correctly read Huge object from + /// Hbm model NB: User specify offset in unit of data. 
+ pub(crate) fn read_across_chunk(&self, ofst: usize, data: &mut [T]) + where + T: bytemuck::Pod, + { + // Underlying memory is view as bytes memory + // Extract byte ofst and byte length + // NB: Don't use generic write method to prevent misunderstanding of ofst meaning + // Indeed, we must used a bytes offset to compute the sub-bfr id and thus keep a + // byte approach everywhere to prevent mismatch + let ofst_b = ofst * std::mem::size_of::(); + let len_b = std::mem::size_of_val(data); + + let bid_start = ofst_b / HBM_CHUNK_SIZE_B; + let bid_stop = (ofst_b + len_b) / HBM_CHUNK_SIZE_B; + let mut bid_ofst = ofst_b % HBM_CHUNK_SIZE_B; + + let mut bid_addr = self.chunk.keys().collect::>(); + bid_addr.sort(); + + let mut rmn_data = len_b; + let mut data_ofst = 0; + + let data_bytes = bytemuck::cast_slice_mut::(data); + for addr in bid_addr[bid_start..=bid_stop].iter() { + let size_b = std::cmp::min(rmn_data, HBM_CHUNK_SIZE_B - bid_ofst); + let chunk = self.chunk.get(addr).unwrap(); + data_bytes[data_ofst..data_ofst + size_b].copy_from_slice(&chunk.data[0..size_b]); + data_ofst += size_b; + rmn_data -= size_b; + bid_ofst = 0; + } + } +} diff --git a/mockups/tfhe-hpu-mockup/src/modules/memory/mod.rs b/mockups/tfhe-hpu-mockup/src/modules/memory/mod.rs new file mode 100644 index 000000000..80bf0be0b --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/modules/memory/mod.rs @@ -0,0 +1,56 @@ +//! Hpu memory models +pub(crate) mod hbm; +pub(crate) use hbm::{HbmBank, HBM_BANK_NB}; + +pub(crate) mod ddr; +pub(crate) use ddr::DdrMem; + +#[allow(unused)] +const MEM_PAGE_SIZE_B: usize = 4096; + +// CT could use Ddr or Hbm memory. 
+// Its not the case for Keys, and thus PC_MAX for keys are defined in hbm +pub const MEM_CT_PC_MAX: usize = 2; + +use ipc_channel::ipc; + +/// Chunk of on-board memory +/// Could be synced in both direction through IPC +pub struct MemChunk { + // Properties + pub(crate) paddr: u64, + pub(crate) size_b: usize, + + // Data + pub(crate) data: Vec, +} + +impl MemChunk { + pub fn new(paddr: u64, size_b: usize) -> Self { + Self { + paddr, + size_b, + data: vec![0; size_b], + } + } + + /// Return reference on data + pub fn data(&self) -> &[u8] { + &self.data + } + + /// Return mutable reference on data + pub fn data_mut(&mut self) -> &mut [u8] { + &mut self.data + } + + /// Generate Shm for syncing data through Ipc + pub fn ipc_wrap(&self) -> ipc::IpcSharedMemory { + ipc::IpcSharedMemory::from_bytes(self.data.as_slice()) + } + + /// Update internal data from Ipc shm + pub fn ipc_update(&mut self, ipc_data: ipc::IpcSharedMemory) { + self.data.copy_from_slice(&ipc_data); + } +} diff --git a/mockups/tfhe-hpu-mockup/src/modules/mod.rs b/mockups/tfhe-hpu-mockup/src/modules/mod.rs new file mode 100644 index 000000000..c9c7e49f8 --- /dev/null +++ b/mockups/tfhe-hpu-mockup/src/modules/mod.rs @@ -0,0 +1,15 @@ +//! 
Hpu Simulation model
+
+pub(crate) mod memory;
+pub(crate) use memory::{DdrMem, HbmBank, HBM_BANK_NB};
+
+// mod regfile;
+pub(crate) mod regmap;
+pub(crate) use regmap::{RegisterEvent, RegisterMap};
+
+pub(crate) mod ucore;
+pub(crate) use ucore::UCore;
+
+pub(crate) mod params;
+
+pub use tfhe::tfhe_hpu_backend::fw::isc_sim as isc;
diff --git a/mockups/tfhe-hpu-mockup/src/modules/params.rs b/mockups/tfhe-hpu-mockup/src/modules/params.rs
new file mode 100644
index 000000000..785b22af9
--- /dev/null
+++ b/mockups/tfhe-hpu-mockup/src/modules/params.rs
@@ -0,0 +1,57 @@
+use std::fs::{File, OpenOptions};
+use std::path::Path;
+
+use tfhe::tfhe_hpu_backend::prelude::*;
+
+#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
+pub struct MockupParameters {
+    pub freq_mhz: usize,
+    pub quantum_us: usize,
+    pub rtl_params: HpuParameters,
+}
+
+/// Structure to pass runtime options
+pub struct MockupOptions {
+    // NOTE(review): the element types of both `Option`s were lost in this dump;
+    // `String` is reconstructed from the `format!("{report_out}/...")` usage below.
+    pub dump_out: Option<String>,
+    pub dump_reg: bool,
+    pub report_out: Option<String>,
+    pub report_trace: bool,
+    pub nops: bool,
+}
+
+impl MockupOptions {
+    /// Create every missing parent directory of `file_path`
+    /// Panics when the directories cannot be created
+    fn create_dir(file_path: &str) {
+        if let Some(dir_p) = Path::new(file_path).parent() {
+            std::fs::create_dir_all(dir_p).unwrap();
+        }
+    }
+
+    /// Open `file_path` for writing, creating parent folders and truncating existing content
+    /// Panics when the file cannot be opened
+    pub fn open_wr_file(file_path: &str) -> File {
+        Self::create_dir(file_path);
+        OpenOptions::new()
+            .create(true)
+            .write(true)
+            .truncate(true)
+            .open(file_path)
+            .unwrap()
+    }
+
+    /// Open `<report_out>/<iop>.rpt` when a report folder is configured, None otherwise
+    pub fn report_file(&self, iop: hpu_asm::AsmIOpcode) -> Option<File> {
+        self.report_out
+            .as_ref()
+            .map(|report_out| Self::open_wr_file(&format!("{report_out}/{iop}.rpt")))
+    }
+
+    /// Open `<report_out>/<iop>.json` when a report folder is configured AND tracing is
+    /// enabled, None otherwise
+    pub fn report_trace(&self, iop: hpu_asm::AsmIOpcode) -> Option<File> {
+        // `filter` keeps the folder only when tracing was requested, so nothing is
+        // created on disk otherwise
+        self.report_out
+            .as_ref()
+            .filter(|_| self.report_trace)
+            .map(|report_out| Self::open_wr_file(&format!("{report_out}/{iop}.json")))
+    }
+}
diff --git a/mockups/tfhe-hpu-mockup/src/modules/regmap.rs 
b/mockups/tfhe-hpu-mockup/src/modules/regmap.rs
new file mode 100644
index 000000000..c08bd8487
--- /dev/null
+++ b/mockups/tfhe-hpu-mockup/src/modules/regmap.rs
@@ -0,0 +1,652 @@
+use std::collections::VecDeque;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+use super::*;
+use hpu_regmap::FlatRegmap;
+use tfhe::tfhe_hpu_backend::interface::rtl::params::*;
+use tfhe::tfhe_hpu_backend::prelude::*;
+
+/// Availability flags of one key (bsk or ksk)
+#[derive(Default)]
+pub(crate) struct KeyState {
+    // Set/cleared through the `*_avail::avail` register
+    avail: AtomicBool,
+    // Raised by a write on `*_avail::reset`, cleared by the next read of that register
+    rst_pdg: AtomicBool,
+}
+impl KeyState {
+    /// Current value of the availability flag
+    pub fn is_avail(&self) -> bool {
+        self.avail.load(Ordering::SeqCst)
+    }
+}
+
+/// Mirror of the `bpip::*` configuration registers
+#[derive(Default)]
+pub(crate) struct BpipState {
+    pub(crate) used: bool,
+    pub(crate) use_opportunism: bool,
+    pub(crate) timeout: u32,
+}
+
+/// Mirror of the address-offset registers
+/// Each entry is a (msb, lsb) pair of 32b words
+#[derive(Default)]
+pub(crate) struct AddrOffset {
+    pub(crate) bsk: [(u32, u32); memory::hbm::HBM_BSK_PC_MAX],
+    pub(crate) ksk: [(u32, u32); memory::hbm::HBM_KSK_PC_MAX],
+    pub(crate) lut: (u32, u32),
+    pub(crate) ldst: [(u32, u32); memory::MEM_CT_PC_MAX],
+    pub(crate) trace: (u32, u32),
+}
+
+/// Register-level model: answers register reads from the parameters and keeps
+/// track of the values written by the host
+pub struct RegisterMap {
+    rtl_params: HpuParameters,
+    regmap: FlatRegmap,
+
+    bsk: KeyState,
+    ksk: KeyState,
+    bpip: BpipState,
+    addr_ofst: AddrOffset,
+    // Acks waiting to be popped through the `WorkAck::ackq` register
+    ackq_pdg: VecDeque<u32>,
+}
+
+/// Side effect triggered by a register write
+pub enum RegisterEvent {
+    None,
+    KeyReset,
+    WorkQ(u32),
+}
+
+impl RegisterMap {
+    /// Build the model from the rtl parameters and the regmap description files
+    /// All mirrored states start from their default value
+    pub fn new(rtl_params: HpuParameters, regmap: &[&str]) -> Self {
+        let regmap = FlatRegmap::from_file(regmap);
+        Self {
+            rtl_params,
+            regmap,
+            bsk: Default::default(),
+            ksk: Default::default(),
+            bpip: Default::default(),
+            addr_ofst: Default::default(),
+            ackq_pdg: VecDeque::new(),
+        }
+    }
+
+    pub fn bsk_state(&self) -> &KeyState {
+        &self.bsk
+    }
+    pub fn ksk_state(&self) -> &KeyState {
+        &self.ksk
+    }
+    pub fn bpip_state(&self) -> &BpipState {
+        &self.bpip
+    }
+    pub fn addr_offset(&self) -> &AddrOffset {
+        &self.addr_ofst
+    }
+
+    /// Queue an ack value; it is returned by a later read of `WorkAck::ackq`
+    pub fn ack_pdg(&mut self, ack: u32) {
+        self.ackq_pdg.push_back(ack)
+    }
+}
+
+/// Implement revert register access
+/// -> Emulate Rtl response 
of register read/write
+impl RegisterMap {
+    /// Get register name from addr
+    /// Panics when no register of the regmap sits at `addr`
+    fn get_register_name(&self, addr: u64) -> &str {
+        let register = self
+            .regmap
+            .register()
+            .iter()
+            .find(|(_name, reg)| *reg.offset() == (addr as usize))
+            .expect("Register addr not found in registermap");
+
+        register.0
+    }
+
+    /// Kind of register reverse
+    /// Return register value from parameter value
+    /// Unknown registers are logged and read as 0
+    pub fn read_reg(&mut self, addr: u64) -> u32 {
+        let register_name = self.get_register_name(addr);
+
+        // Add offset configuration registers
+        // They all share one naming scheme and are decoded by AddrOfstWord
+        if let Some(word) = AddrOfstWord::from_register_name(register_name) {
+            return *word.select(&mut self.addr_ofst);
+        }
+
+        match register_name {
+            "info::ntt_structure" => {
+                let ntt_p = &self.rtl_params.ntt_params;
+                (ntt_p.radix + (ntt_p.psi << 8) /*+(ntt_p.div << 16)*/ + (ntt_p.delta << 24)) as u32
+            }
+            "info::ntt_rdx_cut" => {
+                let ntt_p = &self.rtl_params.ntt_params;
+                let cut_w = match &ntt_p.core_arch {
+                    HpuNttCoreArch::GF64(cut_w) => cut_w,
+                    _ => &vec![ntt_p.delta as u8],
+                };
+                // One 4b nibble per cut
+                cut_w
+                    .iter()
+                    .enumerate()
+                    .fold(0, |acc, (id, val)| acc + ((*val as u32) << (id * 4)))
+            }
+            // NOTE(review): both Wmm variants report the same code; confirm this is intended
+            "info::ntt_architecture" => match self.rtl_params.ntt_params.core_arch {
+                HpuNttCoreArch::WmmCompactPcg => NTT_CORE_ARCH_OFS + 4,
+                HpuNttCoreArch::WmmUnfoldPcg => NTT_CORE_ARCH_OFS + 4,
+                HpuNttCoreArch::GF64(_) => NTT_CORE_ARCH_OFS + 5,
+            },
+            "info::ntt_pbs" => {
+                let ntt_p = &self.rtl_params.ntt_params;
+                (ntt_p.batch_pbs_nb + (ntt_p.total_pbs_nb << 8)) as u32
+            }
+            "info::ntt_modulo" => {
+                MOD_NTT_NAME_OFS + (self.rtl_params.ntt_params.prime_modulus.clone() as u8) as u32
+            }
+
+            "info::application" => {
+                if CONCRETE_BOOLEAN == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS
+                } else if MSG2_CARRY2 == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 1
+                } else if MSG2_CARRY2_64B == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 3
+                } else if MSG2_CARRY2_44B == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 4
+                } else if MSG2_CARRY2_64B_FAKE == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 9
+                } else if MSG2_CARRY2_GAUSSIAN == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 10
+                } else if MSG2_CARRY2_TUNIFORM == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 11
+                } else if MSG2_CARRY2_PFAIL64_132B_GAUSSIAN_1F72DBA == self.rtl_params.pbs_params {
+                    APPLICATION_NAME_OFS + 12
+                } else {
+                    // Custom simulation parameters set
+                    // -> Return 1 without NAME_OFS
+                    1
+                }
+            }
+            "info::ks_structure" => {
+                let ks_p = &self.rtl_params.ks_params;
+                (ks_p.lbx + (ks_p.lby << 8) + (ks_p.lbz << 16)) as u32
+            }
+            "info::ks_crypto_param" => {
+                let ks_p = &self.rtl_params.ks_params;
+                let pbs_p = &self.rtl_params.pbs_params;
+                (ks_p.width + (pbs_p.ks_level << 8) + (pbs_p.ks_base_log << 16)) as u32
+            }
+            "info::hbm_axi4_nb" => {
+                let pc_p = &self.rtl_params.pc_params;
+                // TODO: Cut number currently not reverted
+                (pc_p.bsk_pc + (pc_p.ksk_pc << 8) + (pc_p.pem_pc << 16)) as u32
+            }
+            "info::hbm_axi4_dataw_ksk" => {
+                let bytes_w = &self.rtl_params.pc_params.ksk_bytes_w;
+                *bytes_w as u32 * u8::BITS
+            }
+            "info::hbm_axi4_dataw_bsk" => {
+                let bytes_w = &self.rtl_params.pc_params.bsk_bytes_w;
+                *bytes_w as u32 * u8::BITS
+            }
+            "info::hbm_axi4_dataw_pem" => {
+                let bytes_w = &self.rtl_params.pc_params.pem_bytes_w;
+                *bytes_w as u32 * u8::BITS
+            }
+            "info::hbm_axi4_dataw_glwe" => {
+                let bytes_w = &self.rtl_params.pc_params.glwe_bytes_w;
+                *bytes_w as u32 * u8::BITS
+            }
+
+            "info::regf_structure" => {
+                let regf_p = &self.rtl_params.regf_params;
+                (regf_p.reg_nb + (regf_p.coef_nb << 8)) as u32
+            }
+            "info::isc_structure" => {
+                let isc_p = &self.rtl_params.isc_params;
+                (isc_p.depth + (isc_p.min_iop_size << 8)) as u32
+            }
+
+            "bsk_avail::avail" => self.bsk.avail.load(Ordering::SeqCst) as u32,
+            "bsk_avail::reset" => {
+                // Pending reset is reported once (bit 31) then cleared
+                if self.bsk.rst_pdg.load(Ordering::SeqCst) {
+                    self.bsk.rst_pdg.store(false, Ordering::SeqCst);
+                    1 << 31
+                } else {
+                    0
+                }
+            }
+            "ksk_avail::avail" => self.ksk.avail.load(Ordering::SeqCst) as u32,
+            "ksk_avail::reset" => {
+                // Pending reset is reported once (bit 31) then cleared
+                if self.ksk.rst_pdg.load(Ordering::SeqCst) {
+                    self.ksk.rst_pdg.store(false, Ordering::SeqCst);
+                    1 << 31
+                } else {
+                    0
+                }
+            }
+
+            // Bpip configuration registers
+            "bpip::use" => {
+                ((self.bpip.used as u8) + ((self.bpip.use_opportunism as u8) << 1)) as u32
+            }
+            "bpip::timeout" => self.bpip.timeout,
+
+            // Queue interface
+            "WorkAck::workq" => {
+                // TODO implement finite size queue
+                0
+            }
+            "WorkAck::ackq" => {
+                if let Some(ack) = self.ackq_pdg.pop_front() {
+                    ack
+                } else {
+                    ACKQ_EMPTY
+                }
+            }
+
+            _ => {
+                tracing::warn!("Register {register_name} not hooked for reading, return 0");
+                0
+            }
+        }
+    }
+
+    /// Emulate a register write
+    /// Update the mirrored state and return the event the caller has to handle
+    /// Unknown registers are logged and ignored
+    pub fn write_reg(&mut self, addr: u64, value: u32) -> RegisterEvent {
+        let register_name = self.get_register_name(addr);
+
+        // Add offset configuration registers
+        // They all share one naming scheme and are decoded by AddrOfstWord
+        if let Some(word) = AddrOfstWord::from_register_name(register_name) {
+            *word.select(&mut self.addr_ofst) = value;
+            return RegisterEvent::None;
+        }
+
+        match register_name {
+            "bsk_avail::avail" => {
+                self.bsk.avail.store((value & 0x1) == 0x1, Ordering::SeqCst);
+                RegisterEvent::None
+            }
+            "bsk_avail::reset" => {
+                if (value & 0x1) == 0x1 {
+                    self.bsk.rst_pdg.store(true, Ordering::SeqCst);
+                    self.bsk.avail.store(false, Ordering::SeqCst);
+                    RegisterEvent::KeyReset
+                } else {
+                    RegisterEvent::None
+                }
+            }
+            "ksk_avail::avail" => {
+                self.ksk.avail.store((value & 0x1) == 0x1, Ordering::SeqCst);
+                RegisterEvent::None
+            }
+            "ksk_avail::reset" => {
+                if (value & 0x1) == 0x1 {
+                    self.ksk.rst_pdg.store(true, Ordering::SeqCst);
+                    self.ksk.avail.store(false, Ordering::SeqCst);
+                    RegisterEvent::KeyReset
+                } else {
+                    RegisterEvent::None
+                }
+            }
+
+            // Bpip configuration registers
+            "bpip::use" => {
+                self.bpip.used = (value & 0x1) == 0x1;
+                self.bpip.use_opportunism = (value & 0x2) == 0x2;
+                RegisterEvent::None
+            }
+            "bpip::timeout" => {
+                self.bpip.timeout = value;
+                RegisterEvent::None
+            }
+
+            "WorkAck::workq" => RegisterEvent::WorkQ(value),
+            _ => {
+                tracing::warn!("Register {register_name} not hooked for writing");
+                RegisterEvent::None
+            }
+        }
+    }
+}
+
+/// Entry of `AddrOffset` designated by an address-offset register
+#[derive(Clone, Copy)]
+enum AddrOfstTarget {
+    Ldst(usize),
+    Bsk(usize),
+    Ksk(usize),
+    Lut,
+    Trace,
+}
+
+/// One 32b word of `AddrOffset`: an entry plus the msb/lsb half selection
+#[derive(Clone, Copy)]
+struct AddrOfstWord {
+    target: AddrOfstTarget,
+    is_msb: bool,
+}
+
+impl AddrOfstWord {
+    /// Decode names shaped like `<section>::<kind>_pc<N>_<msb|lsb>`
+    /// Only the registers that used to be listed one by one are accepted:
+    ///  * `hbm_axi4_addr_1in3::ct_pc{0..=1}`
+    ///  * `hbm_axi4_addr_3in3::bsk_pc{0..=15}`
+    ///  * `hbm_axi4_addr_1in3::ksk_pc{0..=15}`
+    ///  * `hbm_axi4_addr_1in3::glwe_pc0`
+    ///  * `hbm_axi4_addr_1in3::trc_pc0`
+    /// Any other name returns None
+    fn from_register_name(name: &str) -> Option<Self> {
+        let (section, field) = name.split_once("::")?;
+        let (entry, is_msb) = match field.strip_suffix("_msb") {
+            Some(entry) => (entry, true),
+            None => (field.strip_suffix("_lsb")?, false),
+        };
+        let (kind, pc_txt) = entry.split_once("_pc")?;
+
+        // Accept canonical decimal only ("7", not "07" nor "+7") to keep an exact name match
+        let canonical = !pc_txt.is_empty()
+            && pc_txt.bytes().all(|b| b.is_ascii_digit())
+            && (pc_txt.len() == 1 || !pc_txt.starts_with('0'));
+        if !canonical {
+            return None;
+        }
+        let pc = pc_txt.parse::<usize>().ok()?;
+
+        let target = match (section, kind, pc) {
+            ("hbm_axi4_addr_1in3", "ct", 0..=1) => AddrOfstTarget::Ldst(pc),
+            ("hbm_axi4_addr_3in3", "bsk", 0..=15) => AddrOfstTarget::Bsk(pc),
+            ("hbm_axi4_addr_1in3", "ksk", 0..=15) => AddrOfstTarget::Ksk(pc),
+            ("hbm_axi4_addr_1in3", "glwe", 0) => AddrOfstTarget::Lut,
+            ("hbm_axi4_addr_1in3", "trc", 0) => AddrOfstTarget::Trace,
+            _ => return None,
+        };
+        Some(Self { target, is_msb })
+    }
+
+    /// Borrow the designated word inside `addr_ofst`
+    /// Pair layout is (msb, lsb)
+    fn select(self, addr_ofst: &mut AddrOffset) -> &mut u32 {
+        let pair = match self.target {
+            AddrOfstTarget::Ldst(pc) => &mut addr_ofst.ldst[pc],
+            AddrOfstTarget::Bsk(pc) => &mut addr_ofst.bsk[pc],
+            AddrOfstTarget::Ksk(pc) => &mut addr_ofst.ksk[pc],
+            AddrOfstTarget::Lut => &mut addr_ofst.lut,
+            AddrOfstTarget::Trace => &mut addr_ofst.trace,
+        };
+        if self.is_msb {
+            &mut pair.0
+        } else {
+            &mut pair.1
+        }
+    }
+}
diff --git 
a/mockups/tfhe-hpu-mockup/src/modules/ucore.rs b/mockups/tfhe-hpu-mockup/src/modules/ucore.rs
new file mode 100644
index 000000000..ee55fa8a9
--- /dev/null
+++ b/mockups/tfhe-hpu-mockup/src/modules/ucore.rs
@@ -0,0 +1,137 @@
+use super::{DdrMem, HbmBank};
+use tfhe::tfhe_hpu_backend::prelude::*;
+
+/// Ucore model: fetch the DOp stream of an IOp and patch its templated operands
+pub struct UCore {
+    config: BoardConfig,
+}
+
+impl UCore {
+    pub fn new(config: BoardConfig) -> Self {
+        Self { config }
+    }
+}
+
+impl UCore {
+    /// Top level function
+    /// Read DOp stream from Fw memory and patch Templated LD/ST with concrete one
+    /// Return the pair (raw stream, patched stream)
+    pub fn translate(
+        &self,
+        ddr: &DdrMem,
+        hbm_bank: &[HbmBank],
+        iop: &hpu_asm::IOp,
+    ) -> (Vec<hpu_asm::DOp>, Vec<hpu_asm::DOp>) {
+        let dops = self.load_fw(ddr, hbm_bank, iop);
+        let dops_patched = self.patch_fw(iop, &dops);
+        (dops, dops_patched)
+    }
+
+    /// Read DOp stream from Firmware memory
+    /// Panics on out-of-range table entries and on words that are not valid DOps
+    fn load_fw(
+        &self,
+        ddr: &DdrMem,
+        hbm_bank: &[HbmBank],
+        iop: &hpu_asm::IOp,
+    ) -> Vec<hpu_asm::DOp> {
+        let fw_view = match self.config.fw_pc {
+            MemKind::Ddr { offset } => ddr.get_chunk(offset as u64).data(),
+            MemKind::Hbm { pc } => {
+                // Bypass fw_ofst register value
+                // Expect to have only one memzone in fw bank allocated in 0
+                // NB: Fw memory bank is linked to ucore and there is no associated offset register
+                // -> Stick with Offset 0
+                hbm_bank[pc].get_chunk(0).data()
+            }
+        };
+        let fw_view_u32 = bytemuck::cast_slice::<u8, u32>(fw_view);
+
+        // WARN: fw ofst are in byte addr and we addr the fw array as 32b word
+        let dop_ofst = fw_view_u32[iop.fw_entry()] as usize / std::mem::size_of::<u32>();
+        // Word found at the entry is the stream length, the DOps come right after it
+        let dop_len = fw_view_u32[dop_ofst] as usize;
+        let (start, end) = (dop_ofst + 1, dop_ofst + 1 + dop_len);
+        let dop_stream = &fw_view_u32[start..end];
+
+        // Allocate DOp parser
+        dop_stream
+            .iter()
+            .map(|bin| hpu_asm::DOp::from_hex(*bin).expect("Invalid DOp"))
+            .collect::<Vec<_>>()
+    }
+
+    /// Rtl ucore emulation
+    /// Map a Raw DOp stream to the given IOp operands
+    /// I.e. 
it replaces Templated MemId with concrete one
+    fn patch_fw(&self, iop: &hpu_asm::IOp, dops: &[hpu_asm::DOp]) -> Vec<hpu_asm::DOp> {
+        // LD and ST resolve their templated memory slot with the very same rules
+        let patch_slot = |slot: &mut hpu_asm::MemId| {
+            *slot = match slot {
+                // Heap blocks are counted down from the last ct slot (ct_mem - 1)
+                hpu_asm::MemId::Heap { bid } => {
+                    hpu_asm::MemId::Addr(hpu_asm::CtId((self.config.ct_mem - 1) as u16 - *bid))
+                }
+                // Src/Dst blocks are offsets from the base of the matching IOp operand
+                hpu_asm::MemId::Src { tid, bid } => hpu_asm::MemId::Addr(hpu_asm::CtId(
+                    iop.src()[*tid as usize].base_cid.0 + *bid as u16,
+                )),
+                hpu_asm::MemId::Dst { tid, bid } => hpu_asm::MemId::Addr(hpu_asm::CtId(
+                    iop.dst()[*tid as usize].base_cid.0 + *bid as u16,
+                )),
+                // Already concrete
+                hpu_asm::MemId::Addr(ct_id) => hpu_asm::MemId::Addr(*ct_id),
+            };
+        };
+
+        let mut dops_patch = dops
+            .iter()
+            .map(|dop| {
+                let mut dop_patch = dop.clone();
+                match &mut dop_patch {
+                    hpu_asm::DOp::LD(op_impl) => patch_slot(op_impl.slot_mut()),
+                    hpu_asm::DOp::ST(op_impl) => patch_slot(op_impl.slot_mut()),
+                    hpu_asm::DOp::ADDS(op_impl) => patch_imm(iop, op_impl.msg_mut()),
+                    hpu_asm::DOp::SUBS(op_impl) => patch_imm(iop, op_impl.msg_mut()),
+                    hpu_asm::DOp::SSUB(op_impl) => patch_imm(iop, op_impl.msg_mut()),
+                    hpu_asm::DOp::MULS(op_impl) => patch_imm(iop, op_impl.msg_mut()),
+                    // TODO Patch immediate
+                    // Every other DOp is forwarded untouched
+                    _ => {}
+                }
+                dop_patch
+            })
+            .collect::<Vec<_>>();
+
+        // Ucore is in charge of Sync insertion
+        dops_patch.push(hpu_asm::dop::DOpSync::new(None).into());
+        tracing::trace!("Patch DOp stream => {dops_patch:?}");
+
dops_patch
+    }
+}
+
+/// Utility function to patch immediate argument
+/// A `Var` immediate is replaced by the constant block read from the IOp immediates,
+/// a `Cst` immediate is kept as is
+fn patch_imm(iop: &hpu_asm::IOp, imm: &mut hpu_asm::ImmId) {
+    *imm = match imm {
+        hpu_asm::ImmId::Cst(val) => hpu_asm::ImmId::Cst(*val),
+        hpu_asm::ImmId::Var { tid, bid } => {
+            hpu_asm::ImmId::Cst(iop.imm()[*tid as usize].msg_block(*bid))
+        }
+    }
+}
diff --git a/setup_hpu.sh b/setup_hpu.sh
new file mode 100644
index 000000000..bf15e3032
--- /dev/null
+++ b/setup_hpu.sh
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# Intended to be sourced (`source setup_hpu.sh [options]`): it exports variables
+# for the calling shell and leaves with `return`, never with `exit`.
+
+# Find current script directory. This should be PROJECT_DIR
+CUR_SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)
+HPU_BACKEND_DIR=$CUR_SCRIPT_DIR/backends/tfhe-hpu-backend
+HPU_MOCKUP_DIR=$CUR_SCRIPT_DIR/mockups/tfhe-hpu-mockup
+
+# Default bitstream
+# Available options are:
+# * sim: use with the mockup (i.e simulation)
+# * u55c: use with u55c (latest bitstream with gf64 config)
+# * v80: use with v80 (i.e should specify pcie-dev flag [zamav80: 01, srvzama: 21])
+HPU_CONFIG="sim"
+
+# Default log verbosity
+RUST_LOG="info"
+
+# Setting PCI device variable: depends on the machine
+mapfile -t DEVICE< <(lspci -d 10ee:50b5)
+if [ ${#DEVICE[@]} -gt 1 ]; then
+    echo "[ERROR]: There is more than one device pcie, we only support one hpu for now"
+    return 1
+else
+    V80_PCIE_DEV="${DEVICE[0]%%:*}"
+fi
+
+# Default Qdma init
+V80_QDMA_INIT=false
+
+# Parse user CLI ##############################################################
+# Options are matched by hand on "$1". The former getopt call was dropped: its
+# result was never used and its long-option list was malformed (missing comma).
+while true
+do
+    case "$1" in
+        -h|--help)
+            echo "Available options are:"
+            echo " * --config: target configuration [sim, u55c_gf64, v80]"
+            echo " * --rust-log: Specify rust verbosity [Cf. tracing]"
+            echo " * --pcie-dev: target pcie device [Warn: v80 only]"
+            echo " * --init-qdma: init the qdma driver [Warn: v80 only]"
+            return 0
+            ;;
+        -c|--config)
+            if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
+                HPU_CONFIG="${2}"
+            else
+                echo "Error: --config requires a value"
+                return 1
+            fi
+            shift 2
+            ;;
+        # --rust-log is the documented spelling; --rust_log is kept for compatibility
+        -l|--rust-log|--rust_log)
+            if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
+                RUST_LOG="${2}"
+            else
+                echo "Error: --rust-log requires a value"
+                return 1
+            fi
+            shift 2
+            ;;
+        -p|--pcie-dev)
+            if [ -n "${2}" ] && [[ ! ${2} =~ ^- ]]; then
+                V80_PCIE_DEV="${2}"
+            else
+                echo "Error: --pcie-dev requires a value"
+                return 1
+            fi
+            shift 2
+            ;;
+        -i|--init-qdma)
+            V80_QDMA_INIT=true
+            shift
+            ;;
+        "") # End of input reading
+            break ;;
+        *)
+            echo "Unknown flag: $1"
+            echo " use -h|--help for available options"
+            return 1
+            ;;
+    esac
+done
+
+echo "###############################################################################"
+echo "### Setup Hpu Backend ###"
+echo "###############################################################################"
+echo "# * Config: ${HPU_CONFIG}"
+echo "# * Backend directory: ${HPU_BACKEND_DIR}"
+if [[ "$HPU_CONFIG" == sim* ]]; then
+echo "# * Mockup directory: ${HPU_MOCKUP_DIR}"
+elif [[ "$HPU_CONFIG" == v80* ]]; then
+echo "# * PCIe id: ${V80_PCIE_DEV} [V80 only]"
+echo "# * Init Qdma: ${V80_QDMA_INIT} [V80 only]"
+fi
+echo "# * Rust verbosity: ${RUST_LOG}"
+echo "###############################################################################"
+
+# Common init #################################################################
+# -> Create config symlink and some exports
+export HPU_BACKEND_DIR
+export HPU_CONFIG
+export RUST_LOG
+
+# Sim specific init ###########################################################
+if [[ "$HPU_CONFIG" == sim* ]]; then
+    export HPU_MOCKUP_DIR
+fi
+
+# U55c specific init ###########################################################
+if [[ "$HPU_CONFIG" == u55c* ]]; then
+    # Setup Xrt for low-level xfer with u55c
+    XRT_SETUP=/opt/xilinx/xrt/setup.sh
+    if [[ -f $XRT_SETUP ]]; then
+        source "$XRT_SETUP"
+    fi
+fi
+
+# V80 specific init ###########################################################
+if [[ "$HPU_CONFIG" == v80* ]]; then
+    export V80_PCIE_DEV
+    if [[ "$V80_QDMA_INIT" == true ]]; then
+        while true; do
+            read -r -p "QDMA_PF init requested by user. This required sudo right, Are you sure to process [Y/n]" user_input
+            if [[ "$user_input" == [Yy] ]]; then
+                echo "Continuing... You could be prompt for sudo password"
+                sudo modprobe -r qdma-pf && sudo modprobe qdma-pf
+                sudo bash -c "echo 100 > /sys/bus/pci/devices/0000\:${V80_PCIE_DEV}\:00.1/qdma/qmax"
+                sudo dma-ctl qdma${V80_PCIE_DEV}001 q add idx 1 mode mm dir h2c
+                sudo dma-ctl qdma${V80_PCIE_DEV}001 q add idx 2 mode mm dir c2h
+                sudo dma-ctl qdma${V80_PCIE_DEV}001 q start idx 1 dir h2c
+                sudo dma-ctl qdma${V80_PCIE_DEV}001 q start idx 2 dir c2h
+                break
+            elif [[ "$user_input" == [Nn] ]]; then
+                echo "Skipped QDMA_PF init"
+                break
+            else
+                echo "Invalid input. Please enter 'Y' or 'n'."
+            fi
+        done
+    fi
+fi
diff --git a/tasks/src/check_tfhe_docs_are_tested.rs b/tasks/src/check_tfhe_docs_are_tested.rs
index 740831b27..e5dc56a86 100644
--- a/tasks/src/check_tfhe_docs_are_tested.rs
+++ b/tasks/src/check_tfhe_docs_are_tested.rs
@@ -10,7 +10,7 @@ const DIR_TO_IGNORE: [&str; 3] = [
     "tests/tfhe-backward-compat-data",
 ];
 
-const FILES_TO_IGNORE: [&str; 7] = [
+const FILES_TO_IGNORE: [&str; 8] = [
     // This contains fragments of code that are unrelated to TFHE-rs
     "tfhe/docs/tutorials/sha256_bool.md",
     // TODO: This contains code that could be executed as a trivium docstring
@@ -23,6 +23,7 @@ const FILES_TO_IGNORE: [&str; 7] = [
     "tfhe-ntt/README.md",
     "utils/tfhe-lints/README.md",
     "CONTRIBUTING.md",
+    "backends/tfhe-hpu-backend/Readme.md",
 ];
 
 pub fn check_tfhe_docs_are_tested() -> Result<(), Error> {
diff --git a/tfhe-benchmark/Cargo.toml b/tfhe-benchmark/Cargo.toml
index d1039c82f..2ce514589 100644
--- a/tfhe-benchmark/Cargo.toml
+++ b/tfhe-benchmark/Cargo.toml
@@ -35,6 +35,8 @@ boolean = ["tfhe/boolean"]
 shortint = ["tfhe/shortint"]
 integer = 
["shortint", "tfhe/integer"] gpu = ["tfhe/gpu"] +hpu = ["tfhe/hpu"] +hpu-v80 = ["tfhe/hpu-v80"] internal-keycache = ["tfhe/internal-keycache"] nightly-avx512 = ["tfhe/nightly-avx512"] pbs-stats = ["tfhe/pbs-stats"] diff --git a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs index 52bfbd368..2d6bf17f9 100644 --- a/tfhe-benchmark/benches/core_crypto/pbs_bench.rs +++ b/tfhe-benchmark/benches/core_crypto/pbs_bench.rs @@ -726,7 +726,11 @@ fn mem_optimized_pbs_ntt(c: &mut Criterion) { bsk.ciphertext_modulus(), ); - par_convert_standard_lwe_bootstrap_key_to_ntt64(&bsk, &mut nbsk); + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &bsk, + &mut nbsk, + NttLweBootstrapKeyOption::Normalize, + ); drop(bsk); diff --git a/tfhe-benchmark/benches/high_level_api/bench.rs b/tfhe-benchmark/benches/high_level_api/bench.rs index 9595f6918..95b8a3001 100644 --- a/tfhe-benchmark/benches/high_level_api/bench.rs +++ b/tfhe-benchmark/benches/high_level_api/bench.rs @@ -1,17 +1,17 @@ -use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; use criterion::{black_box, Criterion}; use rand::prelude::*; use std::fmt::Write; use std::ops::*; use tfhe::prelude::*; use tfhe::{ - set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint10, FheUint12, - FheUint128, FheUint14, FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, + ClientKey, CompressedServerKey, FheUint10, FheUint12, FheUint128, FheUint14, FheUint16, + FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, }; fn bench_fhe_type(c: &mut Criterion, client_key: &ClientKey, type_name: &str) where FheType: FheEncrypt, + FheType: FheWait, for<'a> &'a FheType: Add<&'a FheType, Output = FheType> + Sub<&'a FheType, Output = FheType> + Mul<&'a FheType, Output = FheType> @@ -35,54 +35,133 @@ where let mut name = String::with_capacity(255); write!(name, "add({type_name}, {type_name})").unwrap(); - 
bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs + &rhs))); - name.clear(); - - write!(name, "overflowing_add({type_name}, {type_name})").unwrap(); bench_group.bench_function(&name, |b| { - b.iter(|| black_box((&lhs).overflowing_add(&rhs))) + b.iter(|| { + let res = &lhs + &rhs; + res.wait(); + black_box(res) + }) }); name.clear(); - write!(name, "overflowing_sub({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(lhs.overflowing_sub(&rhs)))); - name.clear(); + #[cfg(not(feature = "hpu"))] + { + write!(name, "overflowing_add({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let (res, flag) = lhs.overflowing_add(&rhs); + res.wait(); + black_box((res, flag)) + }) + }); + name.clear(); + } + + #[cfg(not(feature = "hpu"))] + { + write!(name, "overflowing_sub({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let (res, flag) = lhs.overflowing_sub(&rhs); + res.wait(); + black_box((res, flag)) + }) + }); + name.clear(); + } write!(name, "sub({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs - &rhs))); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs - &rhs; + res.wait(); + black_box(res) + }) + }); name.clear(); write!(name, "mul({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs * &rhs))); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs * &rhs; + res.wait(); + black_box(res) + }) + }); name.clear(); write!(name, "bitand({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs & &rhs))); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs & &rhs; + res.wait(); + black_box(res) + }) + }); name.clear(); write!(name, "bitor({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| 
black_box(&lhs | &rhs))); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs | &rhs; + res.wait(); + black_box(res) + }) + }); name.clear(); write!(name, "bitxor({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs ^ &rhs))); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs ^ &rhs; + res.wait(); + black_box(res) + }) + }); name.clear(); - write!(name, "shl({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs << &rhs))); - name.clear(); + #[cfg(not(feature = "hpu"))] + { + write!(name, "shl({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs << &rhs; + res.wait(); + black_box(res) + }) + }); + name.clear(); - write!(name, "shr({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box(&lhs >> &rhs))); - name.clear(); + write!(name, "shr({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = &lhs >> &rhs; + res.wait(); + black_box(res) + }) + }); + name.clear(); - write!(name, "rotl({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_left(&rhs)))); - name.clear(); + write!(name, "rotl({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = (&lhs).rotate_left(&rhs); + res.wait(); + black_box(res) + }) + }); + name.clear(); - write!(name, "rotr({type_name}, {type_name})").unwrap(); - bench_group.bench_function(&name, |b| b.iter(|| black_box((&lhs).rotate_right(&rhs)))); - name.clear(); + write!(name, "rotr({type_name}, {type_name})").unwrap(); + bench_group.bench_function(&name, |b| { + b.iter(|| { + let res = (&lhs).rotate_right(&rhs); + res.wait(); + black_box(res) + }) + }); + name.clear(); + } } macro_rules! 
bench_type { @@ -108,13 +187,39 @@ bench_type!(FheUint64); bench_type!(FheUint128); fn main() { - let config = - ConfigBuilder::with_custom_parameters(BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128) - .build(); - let cks = ClientKey::generate(config); - let compressed_sks = CompressedServerKey::new(&cks); + #[cfg(feature = "hpu")] + let cks = { + // Hpu is enable, start benchmark on Hpu hw accelerator + use tfhe::tfhe_hpu_backend::prelude::*; + use tfhe::{set_server_key, Config}; - set_server_key(compressed_sks.decompress()); + // Use environment variable to construct path to configuration file + let config_path = ShellString::new( + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(), + ); + let hpu_device = HpuDevice::from_config(&config_path.expand()); + + let config = Config::from_hpu_device(&hpu_device); + let cks = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&cks); + + set_server_key((hpu_device, compressed_sks)); + cks + }; + #[cfg(not(feature = "hpu"))] + let cks = { + use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + use tfhe::{set_server_key, ConfigBuilder}; + let config = ConfigBuilder::with_custom_parameters( + BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, + ) + .build(); + let cks = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&cks); + + set_server_key(compressed_sks.decompress()); + cks + }; let mut c = Criterion::default().configure_from_args(); diff --git a/tfhe-benchmark/benches/high_level_api/erc20.rs b/tfhe-benchmark/benches/high_level_api/erc20.rs index b5c93acc8..9485495cf 100644 --- a/tfhe-benchmark/benches/high_level_api/erc20.rs +++ b/tfhe-benchmark/benches/high_level_api/erc20.rs @@ -1,21 +1,22 @@ #[cfg(feature = "gpu")] -use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; -#[cfg(not(feature = "gpu"))] -use 
benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; -#[cfg(feature = "gpu")] use benchmark::utilities::configure_gpu; use benchmark::utilities::{write_to_json, OperatorType}; use criterion::measurement::WallTime; use criterion::{BenchmarkGroup, Criterion, Throughput}; use rand::prelude::*; use rand::thread_rng; +#[cfg(not(feature = "hpu"))] use rayon::prelude::*; -use std::ops::{Add, Mul, Sub}; +#[cfg(not(feature = "hpu"))] +use std::ops::Mul; +use std::ops::{Add, Sub}; +#[cfg(feature = "gpu")] +use tfhe::core_crypto::gpu::get_number_of_gpus; use tfhe::keycache::NamedParam; use tfhe::prelude::*; #[cfg(feature = "gpu")] use tfhe::GpuIndex; -use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBool, FheUint64}; +use tfhe::{set_server_key, ClientKey, CompressedServerKey, FheBool, FheUint64}; /// Transfer as written in the original FHEvm white-paper, /// it uses a comparison to check if the sender has enough, @@ -25,6 +26,28 @@ pub fn transfer_whitepaper( to_amount: &FheType, amount: &FheType, ) -> (FheType, FheType) +where + FheType: Add + for<'a> FheOrd<&'a FheType>, + FheBool: IfThenElse, + for<'a> &'a FheType: Add + Sub, +{ + let has_enough_funds = (from_amount).ge(amount); + + let mut new_to_amount = to_amount + amount; + new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount); + + let mut new_from_amount = from_amount - amount; + new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount); + + (new_from_amount, new_to_amount) +} + +/// Parallel variant of [`transfer_whitepaper`]. 
+pub fn par_transfer_whitepaper( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) where FheType: Add + for<'a> FheOrd<&'a FheType> + Send + Sync, FheBool: IfThenElse, @@ -48,6 +71,7 @@ where /// This one also uses a comparison, but it leverages the 'boolean' multiplication /// instead of cmuxes, so it is faster +#[cfg(not(feature = "hpu"))] fn transfer_no_cmux( from_amount: &FheType, to_amount: &FheType, @@ -71,6 +95,7 @@ where /// This one uses overflowing sub to remove the need for comparison /// it also uses the 'boolean' multiplication +#[cfg(not(feature = "hpu"))] fn transfer_overflow( from_amount: &FheType, to_amount: &FheType, @@ -97,6 +122,7 @@ where /// This ones uses both overflowing_add/sub to check that both /// the sender has enough funds, and the receiver will not overflow its balance +#[cfg(not(feature = "hpu"))] fn transfer_safe( from_amount: &FheType, to_amount: &FheType, @@ -123,7 +149,30 @@ where (new_from_amount, new_to_amount) } -#[cfg(feature = "pbs-stats")] +#[cfg(feature = "hpu")] +/// This one use a dedicated IOp inside Hpu +fn transfer_hpu( + from_amount: &FheType, + to_amount: &FheType, + amount: &FheType, +) -> (FheType, FheType) +where + FheType: FheHpu, +{ + use tfhe::tfhe_hpu_backend::prelude::hpu_asm; + let src = HpuHandle { + native: vec![from_amount, to_amount, amount], + boolean: vec![], + imm: vec![], + }; + let mut res_handle = FheHpu::iop_exec(&hpu_asm::iop::IOP_ERC_20, src); + // Iop erc_20 return new_from, new_to + let new_to = res_handle.native.pop().unwrap(); + let new_from = res_handle.native.pop().unwrap(); + (new_from, new_to) +} + +#[cfg(all(feature = "pbs-stats", not(feature = "hpu")))] mod pbs_stats { use super::*; use std::fs::{File, OpenOptions}; @@ -200,6 +249,7 @@ fn bench_transfer_latency( transfer_func: F, ) where FheType: FheEncrypt, + FheType: FheWait, F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType), { #[cfg(feature = "gpu")] @@ -214,7 
+264,11 @@ fn bench_transfer_latency( let amount = FheType::encrypt(rng.gen::(), client_key); b.iter(|| { - let (_, _) = transfer_func(&from_amount, &to_amount, &amount); + let (new_from, new_to) = transfer_func(&from_amount, &to_amount, &amount); + new_from.wait(); + criterion::black_box(new_from); + new_to.wait(); + criterion::black_box(new_to); }) }); @@ -231,7 +285,7 @@ fn bench_transfer_latency( ); } -#[cfg(not(feature = "gpu"))] +#[cfg(not(any(feature = "gpu", feature = "hpu")))] fn bench_transfer_throughput( group: &mut BenchmarkGroup<'_, WallTime>, client_key: &ClientKey, @@ -283,6 +337,7 @@ fn bench_transfer_throughput( ); } } + #[cfg(feature = "gpu")] fn cuda_bench_transfer_throughput( group: &mut BenchmarkGroup<'_, WallTime>, @@ -370,16 +425,75 @@ fn cuda_bench_transfer_throughput( } } -#[cfg(feature = "pbs-stats")] -use pbs_stats::print_transfer_pbs_counts; -#[cfg(feature = "gpu")] -use tfhe::core_crypto::gpu::get_number_of_gpus; +#[cfg(feature = "hpu")] +fn hpu_bench_transfer_throughput( + group: &mut BenchmarkGroup<'_, WallTime>, + client_key: &ClientKey, + bench_name: &str, + type_name: &str, + fn_name: &str, + transfer_func: F, +) where + FheType: FheEncrypt + Send + Sync, + FheType: FheWait, + F: for<'a> Fn(&'a FheType, &'a FheType, &'a FheType) -> (FheType, FheType) + Sync, +{ + let mut rng = thread_rng(); -#[cfg(not(feature = "gpu"))] + for num_elems in [10, 100] { + group.throughput(Throughput::Elements(num_elems)); + let bench_id = + format!("{bench_name}::throughput::{fn_name}::{type_name}::{num_elems}_elems"); + group.bench_with_input(&bench_id, &num_elems, |b, &num_elems| { + let from_amounts = (0..num_elems) + .map(|_| FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + let to_amounts = (0..num_elems) + .map(|_| FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + let amounts = (0..num_elems) + .map(|_| FheType::encrypt(rng.gen::(), client_key)) + .collect::>(); + + b.iter(|| { + let (last_new_from, last_new_to) = 
std::iter::zip( + from_amounts.iter(), + std::iter::zip(to_amounts.iter(), amounts.iter()), + ) + .map(|(from_amount, (to_amount, amount))| { + transfer_func(from_amount, to_amount, amount) + }) + .last() + .unwrap(); + + // Wait on last result to enforce all computation is over + last_new_from.wait(); + criterion::black_box(last_new_from); + last_new_to.wait(); + criterion::black_box(last_new_to); + }); + }); + + let params = client_key.computation_parameters(); + + write_to_json::( + &bench_id, + params, + params.name(), + "erc20-transfer", + &OperatorType::Atomic, + 64, + vec![], + ); + } +} + +#[cfg(not(any(feature = "gpu", feature = "hpu")))] fn main() { - let params = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + use crate::pbs_stats::print_transfer_pbs_counts; + let params = benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; - let config = ConfigBuilder::with_custom_parameters(params).build(); + let config = tfhe::ConfigBuilder::with_custom_parameters(params).build(); let cks = ClientKey::generate(config); let compressed_sks = CompressedServerKey::new(&cks); @@ -401,7 +515,7 @@ fn main() { &cks, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::); print_transfer_pbs_counts( @@ -422,7 +536,7 @@ fn main() { bench_name, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); bench_transfer_latency( &mut group, @@ -461,7 +575,7 @@ fn main() { bench_name, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); bench_transfer_throughput( &mut group, @@ -496,9 +610,10 @@ fn main() { #[cfg(feature = "gpu")] fn main() { - let params = BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + use crate::pbs_stats::print_transfer_pbs_counts; + let params = 
benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; - let config = ConfigBuilder::with_custom_parameters(params).build(); + let config = tfhe::ConfigBuilder::with_custom_parameters(params).build(); let cks = ClientKey::generate(config); let mut c = Criterion::default().sample_size(10).configure_from_args(); @@ -514,7 +629,7 @@ fn main() { &cks, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::); print_transfer_pbs_counts( @@ -535,7 +650,7 @@ fn main() { bench_name, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); bench_transfer_latency( &mut group, @@ -574,7 +689,7 @@ fn main() { bench_name, "FheUint64", "transfer::whitepaper", - transfer_whitepaper::, + par_transfer_whitepaper::, ); cuda_bench_transfer_throughput( &mut group, @@ -605,3 +720,76 @@ fn main() { c.final_summary(); } +#[cfg(feature = "hpu")] +fn main() { + let cks = { + // Hpu is enable, start benchmark on Hpu hw accelerator + use tfhe::tfhe_hpu_backend::prelude::*; + use tfhe::Config; + + // Use environment variable to construct path to configuration file + let config_path = ShellString::new( + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(), + ); + let hpu_device = HpuDevice::from_config(&config_path.expand()); + + let config = Config::from_hpu_device(&hpu_device); + let cks = ClientKey::generate(config); + let compressed_sks = CompressedServerKey::new(&cks); + + set_server_key((hpu_device, compressed_sks)); + cks + }; + + let mut c = Criterion::default().sample_size(10).configure_from_args(); + + let bench_name = "hlapi::hpu::erc20::transfer"; + + // FheUint64 latency + { + let mut group = c.benchmark_group(bench_name); + bench_transfer_latency( + &mut group, + &cks, + bench_name, + "FheUint64", + "whitepaper", + transfer_whitepaper::, + ); + // Erc20 optimized 
instruction only available on Hpu + bench_transfer_latency( + &mut group, + &cks, + bench_name, + "FheUint64", + "hpu_optim", + transfer_hpu::, + ); + group.finish(); + } + + // FheUint64 Throughput + { + let mut group = c.benchmark_group(bench_name); + hpu_bench_transfer_throughput( + &mut group, + &cks, + bench_name, + "FheUint64", + "whitepaper", + transfer_whitepaper::, + ); + // Erc20 optimized instruction only available on Hpu + hpu_bench_transfer_throughput( + &mut group, + &cks, + bench_name, + "FheUint64", + "hpu_optim", + transfer_hpu::, + ); + group.finish(); + } + + c.final_summary(); +} diff --git a/tfhe-benchmark/benches/integer/bench.rs b/tfhe-benchmark/benches/integer/bench.rs index 0416cd819..cb717eeb0 100644 --- a/tfhe-benchmark/benches/integer/bench.rs +++ b/tfhe-benchmark/benches/integer/bench.rs @@ -2931,6 +2931,323 @@ use cuda::{ unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; +#[cfg(feature = "hpu")] +mod hpu { + use super::*; + use criterion::{black_box, criterion_group}; + use tfhe::integer::hpu::ciphertext::HpuRadixCiphertext; + use tfhe::prelude::CastFrom; + use tfhe::tfhe_hpu_backend::prelude::*; + + /// Base function to bench an hpu operations. 
+ /// Inputs/Output types and length are inferred based on associated iop prototype + fn bench_hpu_iop_clean_inputs( + c: &mut Criterion, + bench_name: &str, + display_name: &str, + iop: &hpu_asm::AsmIOpcode, + ) { + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(15) + .measurement_time(std::time::Duration::from_secs(60)); + let mut rng = rand::thread_rng(); + + for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() { + if bit_size > ScalarType::BITS as usize { + break; + } + let param_name = param.name(); + + let max_value_for_bit_size = ScalarType::MAX >> (ScalarType::BITS as usize - bit_size); + + let bench_id; + + let proto = if let Some(format) = iop.format() { + format.proto.clone() + } else { + panic!("HPU only IOp with defined prototype could be benched"); + }; + + match get_bench_type() { + BenchmarkType::Latency => { + bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits"); + bench_group.bench_function(&bench_id, |b| { + let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let hpu_device_mutex = KEY_CACHE.get_hpu_device(param); + let hpu_device = hpu_device_mutex.lock().unwrap(); + + let gen_inputs = || { + let srcs = proto + .src + .iter() + .map(|mode| { + let (bw, block) = match mode { + hpu_asm::iop::VarMode::Native => (bit_size, num_block), + hpu_asm::iop::VarMode::Half => { + (bit_size / 2, num_block / 2) + } + hpu_asm::iop::VarMode::Bool => (1, 1), + }; + + let clear = rng + .gen_range(0..u128::cast_from(max_value_for_bit_size)) + & if bw < u128::BITS as usize { + (1_u128 << bw) - 1 + } else { + !0_u128 + }; + let fhe = cks.encrypt_radix(clear, block); + HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device) + }) + .collect::>(); + + let imms = (0..proto.imm) + .map(|_| rng.gen_range(0..u128::cast_from(max_value_for_bit_size))) + .collect::>(); + (srcs, imms) + }; + + b.iter_batched( + gen_inputs, + |(srcs, imms)| { + let res = + 
HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs, &imms); + res.into_iter().for_each(|ct| { + ct.wait(); + black_box(ct); + }); + }, + criterion::BatchSize::SmallInput, + ) + }); + } + BenchmarkType::Throughput => { + bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits"); + bench_group + .sample_size(10) + .measurement_time(std::time::Duration::from_secs(30)); + let elements = throughput_num_threads(num_block, 1); + bench_group.throughput(Throughput::Elements(elements)); + bench_group.bench_function(&bench_id, |b| { + let (cks, _sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let hpu_device_mutex = KEY_CACHE.get_hpu_device(param); + let hpu_device = hpu_device_mutex.lock().unwrap(); + + let inputs = (0..elements) + .map(|_| { + let srcs = proto + .src + .iter() + .map(|mode| { + let (bw, block) = match mode { + hpu_asm::iop::VarMode::Native => (bit_size, num_block), + hpu_asm::iop::VarMode::Half => { + (bit_size / 2, num_block / 2) + } + hpu_asm::iop::VarMode::Bool => (1, 1), + }; + + let clear = rng + .gen_range(0..u128::cast_from(max_value_for_bit_size)) + & if bw < u128::BITS as usize { + (1_u128 << bw) - 1 + } else { + !0_u128 + }; + let fhe = cks.encrypt_radix(clear, block); + HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device) + }) + .collect::>(); + + let imms = (0..proto.imm) + .map(|_| { + rng.gen_range(0..u128::cast_from(max_value_for_bit_size)) + }) + .collect::>(); + (srcs, imms) + }) + .collect::>(); + + b.iter(|| { + let last_res = inputs + .iter() + .map(|input| { + HpuRadixCiphertext::exec( + &proto, + iop.opcode(), + &input.0, + &input.1, + ) + }) + .next_back() + .unwrap(); + last_res.into_iter().for_each(|ct| { + ct.wait(); + black_box(ct); + }); + }) + }); + } + } + + write_to_json::( + &bench_id, + param, + param.name(), + display_name, + &OperatorType::Atomic, + bit_size as u32, + vec![param.message_modulus().0.ilog2(); num_block], + ); + } + + bench_group.finish() + } + + macro_rules! 
define_hpu_bench_default_fn ( + (iop_name: $iop:ident, display_name:$name:ident) => { + ::paste::paste!{ + fn [< default_hpu_ $iop:lower >](c: &mut Criterion) { + bench_hpu_iop_clean_inputs( + c, + concat!("integer::hpu::", stringify!($iop)), + stringify!($name), + &hpu_asm::iop::[< IOP_ $iop:upper >], + ) + } + } + } + ); + + macro_rules! define_hpu_bench_default_fn_scalar ( + (iop_name: $iop:ident, display_name:$name:ident) => { + ::paste::paste!{ + fn [< default_hpu_ $iop:lower >](c: &mut Criterion) { + bench_hpu_iop_clean_inputs( + c, + concat!("integer::hpu::scalar::", stringify!($iop)), + stringify!($name), + &hpu_asm::iop::[< IOP_ $iop:upper >], + ) + } + } + } + ); + + // Alu ------------------------------------------------------------------------ + define_hpu_bench_default_fn!( + iop_name: add, + display_name: add + ); + define_hpu_bench_default_fn!( + iop_name: sub, + display_name: sub + ); + define_hpu_bench_default_fn!( + iop_name: mul, + display_name: mul + ); + criterion_group!( + default_hpu_ops, + default_hpu_add, + default_hpu_sub, + default_hpu_mul + ); + + // Alu Scalar ----------------------------------------------------------------- + define_hpu_bench_default_fn_scalar!( + iop_name: adds, + display_name: add + ); + define_hpu_bench_default_fn_scalar!( + iop_name: subs, + display_name: sub + ); + //define_hpu_bench_default_fn!( + // iop_name: ssub, + // display_name: scalar_sub + //); + define_hpu_bench_default_fn_scalar!( + iop_name: muls, + display_name: mul + ); + criterion_group!( + default_hpu_ops_scalar, + default_hpu_adds, + default_hpu_subs, + //default_hpu_ssub, + default_hpu_muls + ); + // Bitwise -------------------------------------------------------------------- + define_hpu_bench_default_fn!( + iop_name: bw_and, + display_name: bitand + ); + define_hpu_bench_default_fn!( + iop_name: bw_or, + display_name: bitor + ); + define_hpu_bench_default_fn!( + iop_name: bw_xor, + display_name: bitxor + ); + criterion_group!( + 
default_hpu_bitwise, + default_hpu_bw_and, + default_hpu_bw_or, + default_hpu_bw_xor, + ); + // Comparison ---------------------------------------------------------------- + define_hpu_bench_default_fn!( + iop_name: cmp_eq, + display_name: equal + ); + define_hpu_bench_default_fn!( + iop_name: cmp_neq, + display_name: not_equal + ); + define_hpu_bench_default_fn!( + iop_name: cmp_gt, + display_name: greater_than + ); + define_hpu_bench_default_fn!( + iop_name: cmp_gte, + display_name: greater_or_equal + ); + define_hpu_bench_default_fn!( + iop_name: cmp_lt, + display_name: lower_than + ); + define_hpu_bench_default_fn!( + iop_name: cmp_lte, + display_name: lower_or_equal + ); + criterion_group!( + default_hpu_cmp, + default_hpu_cmp_eq, + default_hpu_cmp_neq, + default_hpu_cmp_gt, + default_hpu_cmp_gte, + default_hpu_cmp_lt, + default_hpu_cmp_lte, + ); + // Ternary -------------------------------------------------------------------- + define_hpu_bench_default_fn!( + iop_name: if_then_else, + display_name: if_then_else + ); + define_hpu_bench_default_fn!( + iop_name: if_then_zero, + display_name: if_then_zero + ); + criterion_group!( + default_hpu_select, + default_hpu_if_then_else, + default_hpu_if_then_zero, + ); +} + criterion_group!( smart_ops, smart_neg, @@ -3297,6 +3614,23 @@ fn go_through_gpu_bench_groups(val: &str) { }; } +#[cfg(feature = "hpu")] +fn go_through_hpu_bench_groups(val: &str) { + match val.to_lowercase().as_str() { + "default" => { + hpu::default_hpu_ops(); + hpu::default_hpu_ops_scalar(); + hpu::default_hpu_bitwise(); + hpu::default_hpu_cmp(); + hpu::default_hpu_select(); + } + "fast_default" => { + hpu::default_hpu_ops(); + } + _ => panic!("unknown benchmark operations flavor"), + }; +} + fn go_through_cpu_bench_groups(val: &str) { match val.to_lowercase().as_str() { "default" => { @@ -3336,7 +3670,9 @@ fn main() { Ok(val) => { #[cfg(feature = "gpu")] go_through_gpu_bench_groups(&val); - #[cfg(not(feature = "gpu"))] + #[cfg(feature = "hpu")] + 
go_through_hpu_bench_groups(&val); + #[cfg(not(any(feature = "gpu", feature = "hpu")))] go_through_cpu_bench_groups(&val); } Err(_) => { diff --git a/tfhe-benchmark/src/params.rs b/tfhe-benchmark/src/params.rs index 24c5712df..561d8aadf 100644 --- a/tfhe-benchmark/src/params.rs +++ b/tfhe-benchmark/src/params.rs @@ -33,7 +33,8 @@ pub mod shortint_params { use tfhe::core_crypto::prelude::{DynamicDistribution, LweBskGroupingFactor}; use tfhe::keycache::NamedParam; use tfhe::shortint::{ - CarryModulus, ClassicPBSParameters, MessageModulus, MultiBitPBSParameters, PBSParameters, + AtomicPatternParameters, CarryModulus, ClassicPBSParameters, MessageModulus, + MultiBitPBSParameters, }; pub const SHORTINT_BENCH_PARAMS_TUNIFORM: [ClassicPBSParameters; 4] = [ @@ -78,7 +79,7 @@ pub mod shortint_params { .map(|params| { ( params.name(), - >::into(*params) + >::into(*params) .to_owned() .into(), ) @@ -94,7 +95,7 @@ pub mod shortint_params { .map(|(params, name)| { ( name.to_string(), - >::into(*params) + >::into(*params) .to_owned() .into(), ) @@ -111,7 +112,7 @@ pub mod shortint_params { .map(|params| { ( params.name(), - >::into(*params) + >::into(*params) .to_owned() .into(), ) @@ -132,7 +133,7 @@ pub mod shortint_params { .map(|(params, name)| { ( name.to_string(), - >::into(*params) + >::into(*params) .to_owned() .into(), ) @@ -150,7 +151,7 @@ pub mod shortint_params { .map(|params| { ( params.name(), - >::into(*params) + >::into(*params) .to_owned() .into(), params.grouping_factor, @@ -172,7 +173,7 @@ pub mod shortint_params { .map(|(params, name)| { ( name.to_string(), - >::into(*params) + >::into(*params) .to_owned() .into(), params.grouping_factor, @@ -183,7 +184,7 @@ pub mod shortint_params { } } - pub fn raw_benchmark_parameters() -> Vec { + pub fn raw_benchmark_parameters() -> Vec { let is_multi_bit = match env::var("__TFHE_RS_PARAM_TYPE") { Ok(val) => val.to_lowercase() == "multi_bit", Err(_) => false, @@ -351,7 +352,7 @@ pub mod shortint_params { } } - pub fn 
filter_parameters<'a, P: Copy + Into>( + pub fn filter_parameters<'a, P: Copy + Into>( params: &[(&'a P, &'a str)], desired_noise_distribution: DesiredNoiseDistribution, desired_backend: DesiredBackend, @@ -359,7 +360,7 @@ pub mod shortint_params { params .iter() .filter_map(|(p, name)| { - let temp_param: PBSParameters = (**p).into(); + let temp_param: AtomicPatternParameters = (**p).into(); match ( temp_param.lwe_noise_distribution(), @@ -391,13 +392,14 @@ mod integer_params { use crate::utilities::EnvConfig; use itertools::iproduct; use std::vec::IntoIter; - use tfhe::shortint::PBSParameters; + use tfhe::shortint::AtomicPatternParameters; /// An iterator that yields a succession of combinations /// of parameters and a num_block to achieve a certain bit_size ciphertext /// in radix decomposition pub struct ParamsAndNumBlocksIter { - params_and_bit_sizes: itertools::Product, IntoIter>, + params_and_bit_sizes: + itertools::Product, IntoIter>, } impl Default for ParamsAndNumBlocksIter { @@ -405,23 +407,33 @@ mod integer_params { let env_config = EnvConfig::new(); if env_config.is_multi_bit { - #[cfg(feature = "gpu")] - let params = vec![ - BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128 - .into(), - ]; - #[cfg(not(feature = "gpu"))] - let params = vec![ - BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128.into(), - ]; + #[cfg(feature = "hpu")] + panic!("Hpu doesn't implement MultiBit"); - let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes()); - Self { - params_and_bit_sizes, + #[cfg(not(feature = "hpu"))] + { + #[cfg(feature = "gpu")] + let params = vec![ + BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128 + .into(), + ]; + #[cfg(not(feature = "gpu"))] + let params = vec![ + BENCH_PARAM_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128 + .into(), + ]; + + let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes()); + Self { + params_and_bit_sizes, + } } } else { // 
FIXME One set of parameter is tested since we want to benchmark only quickest // operations. + #[cfg(feature = "hpu")] + let params = vec![BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64.into()]; + #[cfg(not(feature = "hpu"))] let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into()]; let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes()); @@ -433,7 +445,7 @@ mod integer_params { } impl Iterator for ParamsAndNumBlocksIter { - type Item = (PBSParameters, usize, usize); + type Item = (AtomicPatternParameters, usize, usize); fn next(&mut self) -> Option { let (param, bit_size) = self.params_and_bit_sizes.next()?; diff --git a/tfhe-benchmark/src/params_aliases.rs b/tfhe-benchmark/src/params_aliases.rs index 825c09aeb..cb016db79 100644 --- a/tfhe-benchmark/src/params_aliases.rs +++ b/tfhe-benchmark/src/params_aliases.rs @@ -1,6 +1,8 @@ #[cfg(any(feature = "shortint", feature = "integer"))] pub mod shortint_params_aliases { use tfhe::shortint::parameters::current_params::*; + #[cfg(feature = "hpu")] + use tfhe::shortint::parameters::KeySwitch32PBSParameters; use tfhe::shortint::parameters::{ ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters, MultiBitPBSParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters, @@ -136,6 +138,15 @@ pub mod shortint_params_aliases { pub const BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128: NoiseSquashingParameters = V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128; + + #[cfg(feature = "hpu")] + // KS PBS Gaussian for Hpu + pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters = + V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64; + #[cfg(feature = "hpu")] + // KS PBS TUniform + pub const BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64: KeySwitch32PBSParameters = + V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64; } #[cfg(any(feature = "shortint", feature = 
"integer"))] diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs index 75fbfda5c..2ed6e78ea 100644 --- a/tfhe-benchmark/src/utilities.rs +++ b/tfhe-benchmark/src/utilities.rs @@ -312,6 +312,7 @@ pub fn write_to_json< const FAST_BENCH_BIT_SIZES: [usize; 1] = [64]; const BENCH_BIT_SIZES: [usize; 8] = [4, 8, 16, 32, 40, 64, 128, 256]; +const HPU_BENCH_BIT_SIZES: [usize; 5] = [8, 16, 32, 64, 128]; const MULTI_BIT_CPU_SIZES: [usize; 6] = [4, 8, 16, 32, 40, 64]; /// User configuration in which benchmarks must be run. @@ -349,6 +350,8 @@ impl EnvConfig { } else { MULTI_BIT_CPU_SIZES.to_vec() } + } else if cfg!(feature = "hpu") { + HPU_BENCH_BIT_SIZES.to_vec() } else { BENCH_BIT_SIZES.to_vec() } @@ -397,7 +400,15 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 { elements.min(1500) // This threshold is useful for operation with both a small number of // block and low PBs count. } - #[cfg(not(feature = "gpu"))] + #[cfg(feature = "hpu")] + { + // NB: unused with HPU + let _ = minimum_loading; + let _ = op_pbs_count; + // Enforce that a minimum of 64 IOp is sent + block_multiplicator.min(64.0) as u64 + } + #[cfg(not(any(feature = "gpu", feature = "hpu")))] { let num_threads = rayon::current_num_threads() as f64; let operation_loading = (num_threads / (op_pbs_count as f64)).max(minimum_loading); diff --git a/tfhe/Cargo.toml b/tfhe/Cargo.toml index 78b9c4843..a7632d54e 100644 --- a/tfhe/Cargo.toml +++ b/tfhe/Cargo.toml @@ -42,6 +42,14 @@ env_logger = "0.11" log = "0.4.19" hex = "0.4.3" # End regex-engine deps +# Used in noise-measurements +csv = "1.3.0" + +# Begin hpu-demo deps +# Enable to have hpu execution trace +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +clap-num = {version = "1.1.1"} +# End hpu-demo deps [build-dependencies] cbindgen = { version = "0.28", optional = true } @@ -83,6 +91,8 @@ serde-wasm-bindgen = { version = "0.6.0", optional = true } getrandom = { version = "0.2.8", optional 
= true } bytemuck = { workspace = true } +tfhe-hpu-backend = { version = "0.1", path = "../backends/tfhe-hpu-backend", optional = true } + [features] boolean = [] shortint = ["dep:sha3"] @@ -96,6 +106,12 @@ gpu-experimental-multi-arch = [ ] gpu-profile = ["gpu", "tfhe-cuda-backend/profile"] zk-pok = ["dep:tfhe-zk-pok"] +# Start Fpga Hpu features +hpu = ["dep:tfhe-hpu-backend", "shortint", "integer"] +hpu-xrt = ["hpu", "tfhe-hpu-backend/hw-xrt"] +hpu-v80 = ["hpu", "tfhe-hpu-backend/hw-v80"] +hpu-debug = ["hpu", "tfhe-hpu-backend/io-dump"] +# End Fpga Hpu features # Adds more FheUint/FheInt types to the HL extended-types = [] @@ -149,6 +165,7 @@ features = [ "zk-pok", "software-prng", "strings", + "hpu", ] rustdoc-args = ["--html-in-header", "katex-header.html"] @@ -200,6 +217,23 @@ required-features = ["integer", "pbs-stats"] name = "dist_tuniform" required-features = ["integer", "internal-keycache"] +# Start of Hpu related section +[[example]] +name = "hpu_hlapi" +path = "examples/hpu/hlapi.rs" +required-features = ["hpu"] + +[[example]] +name = "hpu_matmul" +path = "examples/hpu/matmul.rs" +required-features = ["hpu"] + +[[example]] +name = "hpu_bench" +path = "examples/hpu/bench.rs" +required-features = ["hpu"] +# End of Hpu related section + [lib] crate-type = ["lib", "staticlib", "cdylib"] diff --git a/tfhe/docs/README.md b/tfhe/docs/README.md index c23cf7f50..5541ab23d 100644 --- a/tfhe/docs/README.md +++ b/tfhe/docs/README.md @@ -28,7 +28,7 @@ Learn the basics of TFHE-rs, set it up, and make it run with ease. Start building with TFHE-rs by exploring its core features, discovering essential guides, and learning more with user-friendly tutorials. -
FHE ComputationsRun FHE computation on encrypted data.build1.png
ConfigurationAdvanced configuration for better performance.build2.png
IntegrationUse TFHE-rs in different contexts or platforms..build3.png
+
FHE ComputationsRun FHE computation on encrypted data.build1.png
ConfigurationAdvanced configuration for better performance.build2.png
IntegrationUse TFHE-rs in different contexts or platforms.build3.png
## Explore more diff --git a/tfhe/docs/SUMMARY.md b/tfhe/docs/SUMMARY.md index a7ed73fe6..62bc95593 100644 --- a/tfhe/docs/SUMMARY.md +++ b/tfhe/docs/SUMMARY.md @@ -14,6 +14,8 @@ * [GPU Benchmarks](getting_started/benchmarks/gpu/README.md) * [Integer](getting_started/benchmarks/gpu/gpu_integer_operations.md) * [Programmable bootstrapping](getting_started/benchmarks/gpu/gpu_programmable_bootstrapping.md) + * [HPU Benchmarks](getting_started/benchmarks/hpu/README.md) + * [Integer](getting_started/benchmarks/hpu/hpu_integer_operations.md) * [Zero-knowledge proof benchmarks](getting_started/benchmarks/zk_proof_benchmarks.md) * [Security and cryptography](getting_started/security_and_cryptography.md) @@ -64,6 +66,8 @@ * [Compressing ciphertexts](configuration/gpu_acceleration/compressing_ciphertexts.md) * [Array types](configuration/gpu_acceleration/array_type.md) * [Multi-GPU support](configuration/gpu_acceleration/multi_gpu.md) +* [HPU acceleration](configuration/hpu_acceleration/run_on_hpu.md) + * [Benchmark](configuration/hpu_acceleration/benchmark.md) * [Parallelized PBS](configuration/parallelized_pbs.md) ## Integration diff --git a/tfhe/docs/_static/cpu_gpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg b/tfhe/docs/_static/cpu_gpu_hpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg similarity index 100% rename from tfhe/docs/_static/cpu_gpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg rename to tfhe/docs/_static/cpu_gpu_hpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg diff --git a/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_ciphertext.svg b/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_ciphertext.svg new file mode 100644 index 000000000..d7109e8c6 --- /dev/null +++ b/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_ciphertext.svg @@ -0,0 +1,67 @@ + + + + Operation \ Size + FheUint + 8 + FheUint + 16 + FheUint + 32 + FheUint + 64 + FheUint + 128 + + + Add / Sub (+,-) + 6.56 ms + 
7.85 ms + 7.93 ms + 13.9 ms + 22.5 ms + Mul (x) + 10.7 ms + 19.4 ms + 51.5 ms + 149 ms + 585 ms + Equal / Not Equal (eq, ne) + 6.74 ms + 8.14 ms + 9.18 ms + 12.5 ms + 18.3 ms + Comparisons (ge, gt, le, lt) + 6.84 ms + 7.83 ms + 10.3 ms + 11.9 ms + 18.2 ms + Bitwise operations (&, |, ^) + 4.22 ms + 3.92 ms + 5.57 ms + 6.26 ms + 11.0 ms + Select + 3.37 ms + 6.11 ms + 6.09 ms + 10.2 ms + 14.6 ms + + + + + + + + + + + + + + + diff --git a/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_plaintext.svg b/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_plaintext.svg new file mode 100644 index 000000000..78da0475f --- /dev/null +++ b/tfhe/docs/_static/hpu_integer_benchmark_hpux1_tuniform_2m64_plaintext.svg @@ -0,0 +1,39 @@ + + + + Operation \ Size + FheUint + 8 + FheUint + 16 + FheUint + 32 + FheUint + 64 + FheUint + 128 + + + Add / Sub (+,-) + 7.04 ms + 9.02 ms + 7.84 ms + 14.6 ms + 23.2 ms + Mul (x) + 9.85 ms + 21.3 ms + 51.3 ms + 150 ms + 585 ms + + + + + + + + + + + diff --git a/tfhe/docs/configuration/hpu_acceleration/benchmark.md b/tfhe/docs/configuration/hpu_acceleration/benchmark.md new file mode 100644 index 000000000..6ebe8f3e8 --- /dev/null +++ b/tfhe/docs/configuration/hpu_acceleration/benchmark.md @@ -0,0 +1,3 @@ +# Benchmarks + +Please refer to the [HPU benchmarks](../../getting_started/benchmarks/hpu/README.md) for detailed performance benchmark results. diff --git a/tfhe/docs/configuration/hpu_acceleration/run_on_hpu.md b/tfhe/docs/configuration/hpu_acceleration/run_on_hpu.md new file mode 100644 index 000000000..40bd1e702 --- /dev/null +++ b/tfhe/docs/configuration/hpu_acceleration/run_on_hpu.md @@ -0,0 +1,150 @@ +# HPU acceleration + +This guide explains how to update your existing program to leverage HPU acceleration, or to start a new program using HPU. + +**TFHE-rs** now supports a HPU backend based on FPGA implementation, enabling integer arithmetic operations on encrypted data. 
+ +## Prerequisites + +* An [AMD/Xilinx V80 board](https://www.amd.com/en/products/accelerators/alveo/v80.html) installed on a server running Linux with kernel 5.15.0-\* +* An HPU bitstream that you can find (or build) in [HPU fpga repository](https://github.com/zama-ai/hpu_fpga) and load in V80 flash and FPGA using its [README](https://github.com/zama-ai/hpu_fpga/README.md) +* AMI linux device driver version from this [fork](https://github.com/zama-ai/AVED) +* QDMA linux device driver version from this [fork](https://github.com/zama-ai/dma_ip_drivers) +* Rust version - check this [page](../rust_configuration.md) + +## Importing to your project + +To use the **TFHE-rs** HPU backend in your project, add the following dependency in your `Cargo.toml`. + +```toml +tfhe = { version = "~1.2.0", features = ["integer", "hpu-v80"] } +``` + +{% hint style="success" %} +For optimal performance when using **TFHE-rs**, run your code in release mode with the `--release` flag. +{% endhint %} + +### Supported platforms + +**TFHE-rs** HPU backend is supported on Linux (x86). + +| OS | x86 | aarch64 | +| ------- | ----------- | ------------- | +| Linux | Supported | Unsupported | +| macOS | Unsupported | Unsupported | +| Windows | Unsupported | Unsupported | + +## A first example + +### Configuring and creating keys. + +Compared to the [CPU example](../../getting_started/quick_start.md), HPU set up differs in the key creation and device registration, as detailed [here](run\_on\_hpu.md#setting-the-hpu). + +Here is a full example (combining the client and server parts): + +```rust +use tfhe::{Config, set_server_key, FheUint8, ClientKey, CompressedServerKey}; +use tfhe::prelude::*; +use tfhe_hpu_backend::prelude::*; + +fn main() { + + // Instantiate HpuDevice -------------------------------------------------- + // HPU configuration knobs are retrieved from a TOML configuration file.
Prebuilt configurations can be found in `backends/tfhe-hpu-backend/config_store` + // For ease of use a setup_hpu.sh script is available in repository root folder and it handles the required environment variables setup and driver initialisation + // More details are available in `backends/tfhe-hpu-backend/Readme.md` + let hpu_device = HpuDevice::from_config(ShellString::new("${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string())); + + // Generate keys ---------------------------------------------------------- + let config = Config::from_hpu_device(&hpu_device); + + let client_key = ClientKey::generate(config); + let compressed_server_key = CompressedServerKey::new(&client_key); + + // Register HpuDevice and key as thread-local engine + set_server_key((hpu_device, compressed_server_key)); + + let clear_a = 27u8; + let clear_b = 128u8; + + let a = FheUint8::encrypt(clear_a, &client_key); + let b = FheUint8::encrypt(clear_b, &client_key); + + // Server-side computation + let result = a + b; + + // Client-side + let decrypted_result: u8 = result.decrypt(&client_key); + + let clear_result = clear_a + clear_b; + + assert_eq!(decrypted_result, clear_result); +} +``` + +### Setting the hpu + +An HPU device is built for a given parameter set. At this point, because HPU is still a prototype, the software provided retrieves this parameter set from an instantiated HpuDevice. Once retrieved by reading some HPU registers, this parameter set is used by the example applications to generate both client and compressed server keys. +The server key must then be decompressed by the server to be converted into the right format and uploaded to the device. +Once decompressed, the operations between CPU and HPU are identical.
+ +### Encryption + +On the client-side, the method to encrypt the data is exactly the same as the CPU one, as shown in the following example: + +```Rust + let clear_a = 27u8; + let clear_b = 128u8; + + let a = FheUint8::encrypt(clear_a, &client_key); + let b = FheUint8::encrypt(clear_b, &client_key); +``` + +### Computation + +The server first needs to set up its keys with `set_server_key((hpu_device, compressed_server_key))`. + +Then, homomorphic computations are performed using the same approach as the [CPU operations](../../fhe-computation/operations/README.md). + +``` rust + // Server-side + let result = a + b; + + //Client-side + let decrypted_result: u8 = result.decrypt(&client_key); + + let clear_result = clear_a + clear_b; + + assert_eq!(decrypted_result, clear_result); +``` + +### Decryption + +Finally, the client decrypts the result using: + +```Rust + let decrypted_result: u8 = result.decrypt(&client_key); +``` + +## List of available operations + +The HPU backend includes the following operations for unsigned encrypted integers: + +| name | symbol | `Enc`/`Enc` | `Enc`/ `Int` | +| --------------------- | -------------- | -------------------------- | -------------------------- | +| Add | `+` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Sub | `-` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Mul | `*` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| BitAnd | `&` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| BitOr | `\|` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| BitXor | `^` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Greater than | `gt` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Greater or equal than | `ge` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Lower than | `lt` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Lower or equal than | `le` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Equal | `eq` | :heavy\_check\_mark: | :heavy\_check\_mark: | +| Ternary operator | `select` | 
:heavy\_check\_mark: | :heavy\_check\_mark: | + +{% hint style="info" %} +All operations follow the same syntax as the one described [here](../../fhe-computation/operations/README.md). +{% endhint %} diff --git a/tfhe/docs/getting_started/benchmarks/README.md b/tfhe/docs/getting_started/benchmarks/README.md index 684e3d7da..508f1e641 100644 --- a/tfhe/docs/getting_started/benchmarks/README.md +++ b/tfhe/docs/getting_started/benchmarks/README.md @@ -1,6 +1,6 @@ # Benchmarks -This document summarizes the timings of some homomorphic operations over 64-bit encrypted integers, depending on the hardware. More details are given for [the CPU](cpu/README.md), [the GPU](gpu/README.md), or [zeros-knowledge proofs](zk_proof_benchmarks.md). +This document summarizes the timings of some homomorphic operations over 64-bit encrypted integers, depending on the hardware. More details are given for [the CPU](cpu/README.md), [the GPU](gpu/README.md), [the HPU](hpu/README.md) or [zero-knowledge proofs](zk_proof_benchmarks.md). The cryptographic parameters used for benchmarking follow a tweaked uniform (TUniform) noise distribution instead of a Gaussian. The main advantage of this distribution is to be bounded, whereas the usual Gaussian one is not. In some practical cases, this can simplify the use of homomorphic computation. See the [noise section](../security_and_cryptography.md#noise) of the Security and cryptography documentation page for more information on the noise distributions. @@ -16,4 +16,4 @@ make print_doc_bench_parameters All CPU benchmarks in the Table below were launched on an `AWS hpc7a.96xlarge` instance equipped with a 96-core `AMD EPYC 9R14 CPU @ 2.60GHz` and 740GB of RAM, while all GPU benchmarks were launched on 1xH100 GPU, and rely on the multithreaded PBS algorithm. 
{% endhint %} -![](../../_static/cpu_gpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg) +![](../../_static/cpu_gpu_hpu_integer_benchmark_fheuint64_tuniform_2m64_ciphertext.svg) diff --git a/tfhe/docs/getting_started/benchmarks/cpu/cpu_integer_operations.md b/tfhe/docs/getting_started/benchmarks/cpu/cpu_integer_operations.md index baad1d500..911fedae5 100644 --- a/tfhe/docs/getting_started/benchmarks/cpu/cpu_integer_operations.md +++ b/tfhe/docs/getting_started/benchmarks/cpu/cpu_integer_operations.md @@ -32,7 +32,7 @@ The next table shows the operation timings on CPU when the left input is encrypt All timings are based on parallelized Radix-based integer operations where each block is encrypted using the default parameters `PARAM_MESSAGE_2_CARRY_2_KS_PBS`. To ensure predictable timings, we perform operations in the `default` mode, which ensures that the input and output encoding are similar (i.e., the carries are always emptied). -You can minimize operational costs by selecting from 'unchecked', 'checked', or 'smart' modes from [the fine-grained APIs](../../../references/fine-grained-apis/quick_start.md), each balancing performance and correctness differently. For more details about parameters, see [here](../../../references/fine-grained-apis/shortint/parameters.md). You can find the benchmark results on GPU for all these operations [here](../../../configuration/gpu_acceleration/benchmark.md). +You can minimize operational costs by selecting from 'unchecked', 'checked', or 'smart' modes from [the fine-grained APIs](../../../references/fine-grained-apis/quick_start.md), each balancing performance and correctness differently. For more details about parameters, see [here](../../../references/fine-grained-apis/shortint/parameters.md). You can find the benchmark results for all these operations on GPU [here](../../../configuration/gpu_acceleration/benchmark.md) and on HPU [here](../../../configuration/hpu_acceleration/benchmark.md). 
## Reproducing TFHE-rs benchmarks diff --git a/tfhe/docs/getting_started/benchmarks/hpu/README.md b/tfhe/docs/getting_started/benchmarks/hpu/README.md new file mode 100644 index 000000000..6310053c9 --- /dev/null +++ b/tfhe/docs/getting_started/benchmarks/hpu/README.md @@ -0,0 +1,11 @@ +# Benchmarks over HPU + +This document details the HPU performance benchmarks of homomorphic operations using **TFHE-rs**. + +By their nature, homomorphic operations run slower than their cleartext equivalents. + +{% hint style="info" %} +All HPU benchmarks were launched on AMD Alveo v80 FPGAs. +{% endhint %} + +* [Integer operations](hpu_integer_operations.md) diff --git a/tfhe/docs/getting_started/benchmarks/hpu/hpu_integer_operations.md b/tfhe/docs/getting_started/benchmarks/hpu/hpu_integer_operations.md new file mode 100644 index 000000000..b57bd0f66 --- /dev/null +++ b/tfhe/docs/getting_started/benchmarks/hpu/hpu_integer_operations.md @@ -0,0 +1,30 @@ +# Integer Operations over HPU + +This document details the HPU performance benchmarks of homomorphic operations on integers using **TFHE-rs**. + +{% hint style="info" %} +All HPU benchmarks were launched on AMD Alveo v80 FPGAs. +{% endhint %} + +The cryptographic parameters `HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64` were used. + +## 1xHPU +Below are the results for the execution on a single Alveo v80 board. +The following table shows the performance when the inputs of the benchmarked operation are encrypted: + +![](../../../_static/hpu_integer_benchmark_hpux1_tuniform_2m64_ciphertext.svg) + +The following table shows the performance when the left input of the benchmarked operation is encrypted and the other is a clear scalar of the same size: + +![](../../../_static/hpu_integer_benchmark_hpux1_tuniform_2m64_plaintext.svg) + +## Reproducing TFHE-rs benchmarks + +**TFHE-rs** benchmarks can be easily reproduced from the [source](https://github.com/zama-ai/tfhe-rs). 
+ +The following example shows how to reproduce **TFHE-rs** benchmarks: + +```shell +#Integer benchmarks: +make bench_integer_hpu +``` diff --git a/tfhe/examples/dark_market/main.rs b/tfhe/examples/dark_market/main.rs index 3f424dd1f..ef26e4e22 100644 --- a/tfhe/examples/dark_market/main.rs +++ b/tfhe/examples/dark_market/main.rs @@ -1,6 +1,6 @@ use std::time::Instant; use tfhe::integer::ciphertext::RadixCiphertext; -use tfhe::integer::keycache::IntegerKeyCache; +use tfhe::integer::keycache::KEY_CACHE; use tfhe::integer::{IntegerKeyKind, ServerKey}; use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS; @@ -73,7 +73,7 @@ fn test_volume_match_fhe( println!("Generating keys..."); let time = Instant::now(); let (client_key, server_key) = - IntegerKeyCache.get_from_params(PARAM_MESSAGE_2_CARRY_2_KS_PBS, IntegerKeyKind::Radix); + KEY_CACHE.get_from_params(PARAM_MESSAGE_2_CARRY_2_KS_PBS, IntegerKeyKind::Radix); println!("Keys generated in {:?}", time.elapsed()); println!("Running test cases for the FHE implementation"); diff --git a/tfhe/examples/hpu/bench.rs b/tfhe/examples/hpu/bench.rs new file mode 100644 index 000000000..2df59f9fa --- /dev/null +++ b/tfhe/examples/hpu/bench.rs @@ -0,0 +1,315 @@ +//! Application dedicated to quick HW test/benchmark and RTL stimulus generation +//! This could be used in tandem with `mockups/tfhe-hpu-mockup/src/mockup.rs or +//! with the real hardware directly. +//! +//! With the `dump-out` option it enable to generate bit-accurate stimulus +//! 
for RTL simulation + +use std::collections::{HashMap, HashSet}; +pub use std::time::{Duration, Instant}; + +use integer::hpu::ciphertext::HpuRadixCiphertext; +use tfhe::integer::{ClientKey, CompressedServerKey, ServerKey}; + +use itertools::Itertools; +use tfhe::core_crypto::commons::generators::DeterministicSeeder; +use tfhe::core_crypto::prelude::DefaultRandomGenerator; +use tfhe::shortint::parameters::KeySwitch32PBSParameters; +use tfhe::*; +use tfhe_hpu_backend::prelude::*; + +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +/// Define CLI arguments +pub use clap::Parser; +pub use clap_num::maybe_hex; +#[derive(clap::Parser, Debug, Clone, serde::Serialize)] +#[clap( + long_about = "HPU stimulus generation application: Start operation on HPU for RTL test purpose." +)] +pub struct Args { + // Fpga configuration ------------------------------------------------------ + /// Toml top-level configuration file + #[clap( + long, + value_parser, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + // Exec configuration ---------------------------------------------------- + /// Select integer width to bench + /// If None default to All available one (c.f. 
Firmware configuration) + #[clap(long, value_parser)] + pub integer_w: Vec, + + /// Iop to expand and simulate + /// If None default to All IOp + #[clap(long, value_parser)] + pub iop: Vec, + + /// Number of iteration for each IOp + #[clap(long, value_parser, default_value_t = 1)] + pub iter: usize, + + /// Force ct input values + #[clap(long, value_parser=maybe_hex::)] + pub src: Vec, + + /// Force immediat input values + #[clap(long, value_parser=maybe_hex::)] + pub imm: Vec, + + /// Fallback prototype + /// Only apply to IOp with unspecified prototype + /// Used for custom IOp testing when prototype isn't known + /// Syntax example: " <- <0>" + /// Each entry options are (case incensitive): + /// * N, Nat, Native -> Full size integer; + /// * H, Half -> Half size integer; + /// * B, Bool -> boolean value; + #[clap(long, value_parser)] + pub user_proto: Option, + + /// Seed used for some rngs + #[clap(long, value_parser)] + pub seed: Option, + + // Debug option ---------------------------------------------------------- + #[cfg(feature = "hpu-debug")] + /// Hpu io dump path + #[clap(long, value_parser)] + pub io_dump: Option, + + /// Use trivial encrypt ciphertext + #[clap(long, value_parser)] + pub trivial: bool, + + /// Override the firmware implementation used + #[clap(long, value_parser)] + pub fw_impl: Option, +} + +#[derive(Debug)] +pub struct BenchReport(HashMap); + +impl Default for BenchReport { + fn default() -> Self { + Self::new() + } +} + +impl BenchReport { + pub fn new() -> Self { + Self(HashMap::new()) + } +} + +impl std::ops::Deref for BenchReport { + type Target = HashMap; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} +impl std::ops::DerefMut for BenchReport { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl std::fmt::Display for BenchReport { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for op in self.keys().sorted() { + writeln!(f, " {op} -> {:?}", self[op])?; + } + Ok(()) + } 
+} + +pub fn main() { + let args = Args::parse(); + println!("User Options: {args:?}"); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time() + // Build & register the subscriber + .init(); + + // Seeder for args randomization ------------------------------------------ + let mut rng: StdRng = if let Some(seed) = args.seed { + SeedableRng::seed_from_u64((seed & u64::MAX as u128) as u64) + } else { + SeedableRng::from_entropy() + }; + + // Hpu io dump for debug ------------------------------------------------- + #[cfg(feature = "hpu-debug")] + if let Some(dump_path) = args.io_dump.as_ref() { + set_hpu_io_dump(dump_path); + } + + // Override some configuration settings + let mut hpu_config = HpuConfig::from_toml(args.config.expand().as_str()); + if let Some(name) = args.fw_impl { + hpu_config.firmware.implementation = name; + } + + // Instantiate HpuDevice -------------------------------------------------- + let hpu_device = HpuDevice::new(hpu_config); + + // Force key seeder if seed specified by user + if let Some(seed) = args.seed { + let mut seeder = DeterministicSeeder::::new(Seed(seed)); + let shortint_engine = crate::shortint::engine::ShortintEngine::new_from_seeder(&mut seeder); + crate::shortint::engine::ShortintEngine::with_thread_local_mut(|engine| { + std::mem::replace(engine, shortint_engine) + }); + } + + // Extract pbs_configuration from Hpu and create Client/Server Key + let cks = ClientKey::new(KeySwitch32PBSParameters::from(hpu_device.params())); + let sks = ServerKey::new_radix_server_key(&cks); + let sks_compressed = CompressedServerKey::new_radix_compressed_server_key(&cks); + + // Init Hpu device with server key and firmware + 
tfhe::integer::hpu::init_device(&hpu_device, sks_compressed).expect("Invalid key"); + + // Create IOps/Width list ------------------------------------------------ + let bench_iop = if !args.iop.is_empty() { + args.iop.clone() + } else { + hpu_asm::iop::IOP_LIST.to_vec() + }; + + let bench_w = if !args.integer_w.is_empty() { + HashSet::from_iter(args.integer_w.iter().cloned()) + } else { + hpu_device.config().firmware.integer_w.clone() + }; + + assert!( + bench_w.is_subset(&hpu_device.config().firmware.integer_w), + "Requested integer width {:?} isn't enabled [Hpu: {:?}] and could lead to Undefined Behavior.", + bench_w, + hpu_device.config().firmware.integer_w + ); + + // Execute based on required integer_w ------------------------------------ + let mut report = Vec::with_capacity(bench_w.len()); + for width in bench_w.iter() { + let num_block = width / hpu_device.params().pbs_params.message_width; + + let mut width_report = BenchReport::new(); + for iop in bench_iop.iter() { + let proto = if let Some(format) = iop.format() { + format.proto.clone() + } else { + args.user_proto.clone().expect( + "Use of user defined IOp required a explicit prototype -> C.f. 
--user-proto", + ) + }; + + let (srcs_clear, srcs_enc): (Vec<_>, Vec<_>) = proto + .src + .iter() + .enumerate() + .map(|(pos, mode)| { + let (bw, block) = match mode { + hpu_asm::iop::VarMode::Native => (*width, num_block), + hpu_asm::iop::VarMode::Half => (width / 2, num_block / 2), + hpu_asm::iop::VarMode::Bool => (1, 1), + }; + + let clear = *args + .src + .get(pos) + .unwrap_or(&rng.gen_range(0..=u128::MAX >> (u128::BITS - (bw as u32)))); + let fhe = if args.trivial { + sks.create_trivial_radix(clear, block) + } else { + cks.encrypt_radix(clear, block) + }; + let hpu_fhe = HpuRadixCiphertext::from_radix_ciphertext(&fhe, &hpu_device); + (clear, hpu_fhe) + }) + .unzip(); + + let imms = (0..proto.imm) + .map(|pos| { + *args + .imm + .get(pos) + .unwrap_or(&rng.gen_range(0..u128::MAX >> (u128::BITS - (*width as u32)))) + }) + .collect::>(); + + println!( + "{}:: Start test loop for IOp {iop} ...", + stringify!($fhe_type) + ); + let roi_start = Instant::now(); + + let res_hpu = (0..args.iter) + .map(|_i| { + let res = HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs_enc, &imms); + std::hint::black_box(&res); + res + }) + .next_back() + .expect("Iteration must be greater than 0"); + + // let res_fhe = $fhe_type::from(res_hpu); + let res_fhe = res_hpu + .iter() + .map(|x| x.to_radix_ciphertext()) + .collect::>(); + let roi_duration = roi_start.elapsed(); + let op_duration = roi_duration / (args.iter as u32); + let res = res_fhe + .iter() + .map(|x| cks.decrypt_radix(x)) + .collect::>(); + println!("Integer_{width}b:: Execution report: {iop}"); + println!( + "Behavior : {res:?} <- {iop} <{:?}> <{:?}> {{{}}}", + srcs_clear, imms, args.iter + ); + println!( + "Behavior (in hex): {res:x?} <- {iop} <{:x?}> <{:x?}> {{{}}}", + srcs_clear, imms, args.iter + ); + println!("Performance: {iop} -> {op_duration:?} [{roi_duration:?}]"); + width_report.insert(iop.to_string(), op_duration); + } + report.push((format!("Integer_{width}"), width_report)); + + // Prevent potential 
performance dropdown due to memory fragrmentation + hpu_device.mem_sanitizer(); + } + + // Display summary report ---------------------------------------------------------- + println!("--------------------------------------------------------------------------------"); + for (name, perf) in report { + println!("________________________________________"); + println!("Benchmark report for {name}:"); + println!("{perf}"); + println!("________________________________________"); + } + println!("--------------------------------------------------------------------------------"); + #[cfg(feature = "hpu-debug")] + if let Some(io_dump) = args.io_dump { + println!("Stimulus generated in {io_dump}"); + } else { + println!("No stimulus generated. C.f. `--iop-dump` for more information"); + } +} diff --git a/tfhe/examples/hpu/hlapi.rs b/tfhe/examples/hpu/hlapi.rs new file mode 100644 index 000000000..b78b8d19d --- /dev/null +++ b/tfhe/examples/hpu/hlapi.rs @@ -0,0 +1,197 @@ +macro_rules! impl_hlapi_showcase { + ($fhe_type: ty, $user_type: ty) => { + ::paste::paste! 
{ + fn hlapi_showcase( + cks: &ClientKey, + rng: &mut StdRng, + ) { + println!("Start showcase on {} ----------------------------------------", stringify!($fhe_type)); + // Sum ------------------------------------------------------------- + // Generate random inputs value and compute expected result + let in_a = rng.gen_range(0..$user_type::MAX); + let in_b = rng.gen_range(0..$user_type::MAX); + let clear_sum_ab = in_a.wrapping_add(in_b); + + // Encrypt input value + let fhe_a = $fhe_type::encrypt(in_a, cks); + let fhe_b = $fhe_type::encrypt(in_b, cks); + + // Triggered operation on HPU through hl_api + let fhe_sum_ab = fhe_a+fhe_b; + + // Decrypt values + let dec_sum_ab: $user_type = fhe_sum_ab.decrypt(cks); + + // Display result and check + println!(" {} + {} = fhe({}), clear({})", in_a, in_b, dec_sum_ab, clear_sum_ab); + assert_eq!(dec_sum_ab, clear_sum_ab, + "Error with + operation get {}, expect {}",dec_sum_ab, clear_sum_ab); + + // Product --------------------------------------------------------- + // Generate random inputs value and compute expected result + let in_a = rng.gen_range(0..$user_type::MAX); + let in_b = rng.gen_range(0..$user_type::MAX); + + let clear_mul_ab = in_a.wrapping_mul(in_b); + + // Encrypt input value + let fhe_a = $fhe_type::encrypt(in_a, cks); + let fhe_b = $fhe_type::encrypt(in_b, cks); + + // Triggered operation on HPU through hl_api + let fhe_mul_ab = fhe_a * fhe_b; + + // Decrypt values + let dec_mul_ab: $user_type = fhe_mul_ab.decrypt(cks); + + // Display result and check + println!(" {} * {} = fhe({}), clear({})", in_a, in_b, dec_mul_ab, clear_mul_ab); + assert_eq!(dec_mul_ab, clear_mul_ab, + "Error with * operation get {}, expect {}",dec_mul_ab, clear_mul_ab); + + // BW_XOR ---------------------------------------------------------- + // Generate random inputs value and compute expected result + let in_a = rng.gen_range(0..$user_type::MAX); + let in_b = rng.gen_range(0..$user_type::MAX); + + let clear_bw_xor_ab = in_a ^ in_b; + 
+ // Encrypt input value + let fhe_a = $fhe_type::encrypt(in_a, cks); + let fhe_b = $fhe_type::encrypt(in_b, cks); + + // Triggered operation on HPU through hl_api + let fhe_bw_xor_ab = fhe_a ^ fhe_b; + + // Decrypt values + let dec_bw_xor_ab: $user_type = fhe_bw_xor_ab.decrypt(cks); + + // Display result and check + println!(" {} ^ {} = fhe({}), clear({})", in_a, in_b, dec_bw_xor_ab, clear_bw_xor_ab); + + assert_eq!(dec_bw_xor_ab, clear_bw_xor_ab, + "Error with ^ operation get {}, expect {}",dec_bw_xor_ab, clear_bw_xor_ab); + + // CMP_GTE --------------------------------------------------------- + // Generate random inputs value and compute expected result + let in_a = rng.gen_range(0..$user_type::MAX); + let in_b = rng.gen_range(0..$user_type::MAX); + + let clear_cmp_gte_ab = in_a >= in_b; + + // Encrypt input value + let fhe_a = $fhe_type::encrypt(in_a, cks); + let fhe_b = $fhe_type::encrypt(in_b, cks); + + // Triggered operation on HPU through hl_api + let fhe_cmp_gte_ab = fhe_a.ge(fhe_b); + + // Decrypt values + let dec_cmp_gte_ab: bool = fhe_cmp_gte_ab.decrypt(cks); + + // Display result and check + println!(" {} >= {} = fhe({}), clear({})", in_a, in_b, dec_cmp_gte_ab, clear_cmp_gte_ab); + + assert_eq!(dec_cmp_gte_ab, clear_cmp_gte_ab, + "Error with >= operation get {}, expect {}",dec_cmp_gte_ab, clear_cmp_gte_ab); + } + }; + }; +} + +fn main() { + use tfhe::core_crypto::commons::generators::DeterministicSeeder; + use tfhe::core_crypto::prelude::DefaultRandomGenerator; + use tfhe::prelude::*; + use tfhe::{set_server_key, ClientKey, CompressedServerKey, Config, FheUint8, *}; + use tfhe_hpu_backend::prelude::*; + + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + pub use clap::Parser; + /// Define CLI arguments + #[derive(clap::Parser, Debug, Clone, serde::Serialize)] + #[clap(long_about = "HPU example that shows the use of the HighLevelAPI.")] + pub struct Args { + // Fpga configuration ------------------------------------------------------ + /// 
Toml top-level configuration file + #[clap( + long, + value_parser, + default_value = "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + // Exec configuration ---------------------------------------------------- + /// Seed used for some rngs + #[clap(long, value_parser)] + pub seed: Option, + } + let args = Args::parse(); + println!("User Options: {args:?}"); + + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time() + // Build & register the subscriber + .init(); + + // Seeder for args randomization ------------------------------------------ + let mut rng: StdRng = if let Some(seed) = args.seed { + SeedableRng::seed_from_u64((seed & u64::MAX as u128) as u64) + } else { + SeedableRng::from_entropy() + }; + + // Instantiate HpuDevice -------------------------------------------------- + let hpu_device = HpuDevice::from_config(&args.config.expand()); + + // Generate keys ---------------------------------------------------------- + let config = Config::from_hpu_device(&hpu_device); + + // Force key seeder if seed specified by user + if let Some(seed) = args.seed { + let mut seeder = DeterministicSeeder::::new(Seed(seed)); + let shortint_engine = tfhe::shortint::engine::ShortintEngine::new_from_seeder(&mut seeder); + tfhe::shortint::engine::ShortintEngine::with_thread_local_mut(|engine| { + std::mem::replace(engine, shortint_engine) + }); + } + + let cks = ClientKey::generate(config); + let csks = CompressedServerKey::new(&cks); + + set_server_key((hpu_device, csks)); + + // Show 8bit capabilities -------------------------------------------------- + { + impl_hlapi_showcase!(FheUint8, u8); + hlapi_showcase(&cks, &mut 
rng); + } + + // Show 16bit capabilities ------------------------------------------------- + { + impl_hlapi_showcase!(FheUint16, u16); + hlapi_showcase(&cks, &mut rng); + } + + // Show 32bit capabilities ------------------------------------------------- + { + impl_hlapi_showcase!(FheUint32, u32); + hlapi_showcase(&cks, &mut rng); + } + + // Show 64bit capabilities ------------------------------------------------- + { + impl_hlapi_showcase!(FheUint64, u64); + hlapi_showcase(&cks, &mut rng); + } +} diff --git a/tfhe/examples/hpu/matmul.rs b/tfhe/examples/hpu/matmul.rs new file mode 100644 index 000000000..31b52acc0 --- /dev/null +++ b/tfhe/examples/hpu/matmul.rs @@ -0,0 +1,157 @@ +// tfhe +use tfhe::prelude::*; +// hpu +use crate::tfhe_hpu_backend::prelude::*; +use tfhe::{set_server_key, FheUint64, *}; +// misc +pub use clap::Parser; +use rand::Rng; + +fn main() { + // Register tracing subscriber that use env-filter + // Select verbosity with env_var: e.g. `RUST_LOG=Alu=trace` + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + // Display source code file paths + .with_file(false) + // Display source code line numbers + .with_line_number(false) + .without_time() + // Build & register the subscriber + .init(); + + println!("\n----------------------------------------------"); + println!("- hpu demo: matrix multiplication -"); + println!("----------------------------------------------"); + // This examples performs a matrix multiplication between matrix_a and matrix_b + // matrix_a as m rows and n columns + // matrix_b as n rows and p columns + // m=3, n=2 and p=2 can be set using CLI by adding: -- --m=3 --n=2 --p=2 + + /// Define CLI arguments + #[derive(clap::Parser, Debug, Clone, serde::Serialize)] + #[clap(long_about = "HPU example that shows the use of the HighLevelAPI.")] + pub struct Args { + #[clap( + long, + value_parser, + default_value = 
"${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml" + )] + pub config: ShellString, + + /// Number of rows in matrix A + #[clap(long, value_parser, default_value_t = 2)] + pub m: usize, + /// Number of columns in matrix A and Number of rows in matrix B + #[clap(long, value_parser, default_value_t = 2)] + pub n: usize, + /// Number of columns in matrix B + #[clap(long, value_parser, default_value_t = 2)] + pub p: usize, + } + let args = Args::parse(); + let hpu_device = HpuDevice::from_config(&args.config.expand()); + + println!("\n 1. Key generation"); + println!(" Generate client and server keys..."); + // println!(" -> targeting CPU"); + // let config = ConfigBuilder::default().build(); + // let (client_key, server_key) = generate_keys(config); + println!(" -> targeting HPU"); + let config = Config::from_hpu_device(&hpu_device); + let client_key = ClientKey::generate(config); + let server_key = CompressedServerKey::new(&client_key); + + // println!(" Upload keys-material on CPU..."); + // set_server_key(server_key); + println!(" Upload keys-material on HPU..."); + set_server_key((hpu_device, server_key)); + + println!("\n 2. Matrices definition"); + let random_matrix_a = (0..args.m) + .map(|_| { + (0..args.n) + .map(|_| rand::thread_rng().gen::()) + .collect::>() + }) + .collect::>>(); + let random_matrix_b = (0..args.n) + .map(|_| { + (0..args.p) + .map(|_| rand::thread_rng().gen::()) + .collect::>() + }) + .collect::>>(); + + println!("\n 3. Encrypting the two matrices"); + let encrypted_matrix_a: Vec> = random_matrix_a + .iter() + .map(|row| { + row.iter() + .map(|&val| FheUint64::encrypt(val, &client_key)) + .collect() + }) + .collect(); + + let encrypted_matrix_b: Vec> = random_matrix_b + .iter() + .map(|row| { + row.iter() + .map(|&val| FheUint64::encrypt(val, &client_key)) + .collect() + }) + .collect(); + + println!("\n 4. 
Triggering operations through hl_api"); + // Do a cartesian product over matrix_a rows and matrix_b cols + let fhe_result = (0..args.m) + .map(|i| { + (0..args.p) + .map(|j| { + (0..args.n).fold( + FheUint64::try_encrypt(0u64, &client_key).unwrap(), + |acc, k| acc + &encrypted_matrix_a[i][k] * &encrypted_matrix_b[k][j], + ) + }) + .collect::>() + }) + .collect::>>(); + + println!("\n 5. Wait for computation"); + fhe_result + .last() // last row + .expect("Compute over empty row matrix ") + .last() // last coef of the row + .expect("Compute over empty column matrix") + .wait(); + + println!("\n 6. Decrypting result"); + let dec_result = fhe_result + .iter() + .map(|row| { + row.iter() + .map(|x| x.decrypt(&client_key)) + .collect::>() + }) + .collect::>>(); + + println!("\n----------------------------------------------"); + println!("- checker: cleartext computation -"); + println!("----------------------------------------------"); + let clear_result = (0..args.m) + .map(|i| { + (0..args.p) + .map(|j| { + (0..args.n).fold(0_u64, |acc, k| { + acc.wrapping_add(random_matrix_a[i][k].wrapping_mul(random_matrix_b[k][j])) + }) + }) + .collect::>() + }) + .collect::>>(); + println!("\n> decrypted result {dec_result:?}"); + println!("> cleartext result {clear_result:?}"); + + assert!(clear_result == dec_result, "matrices are not the same"); +} diff --git a/tfhe/examples/sha256.rs b/tfhe/examples/sha256.rs index 20a27eb3b..9731e86b9 100644 --- a/tfhe/examples/sha256.rs +++ b/tfhe/examples/sha256.rs @@ -229,6 +229,11 @@ fn main() -> Result<(), std::io::Error> { }); set_server_key(server_key); } + #[cfg(feature = "hpu")] + (Device::Hpu, _) => { + println!("Hpu is not supported"); + std::process::exit(1); + } } println!("key gen end"); diff --git a/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs b/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs index 31807f5d1..6c512dfeb 100644 --- a/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs 
+++ b/tfhe/src/core_crypto/algorithms/lwe_bootstrap_key_conversion.rs @@ -295,6 +295,7 @@ pub fn par_convert_standard_lwe_bootstrap_key_to_fourier_128( input_bsk: &LweBootstrapKey, output_bsk: &mut NttLweBootstrapKey, + option: NttLweBootstrapKeyOption, ) where InputCont: Container, OutputCont: ContainerMut, @@ -355,9 +356,11 @@ pub fn convert_standard_lwe_bootstrap_key_to_ntt64( output_poly.as_mut_view(), input_poly, ); - ntt.plan.normalize(output_poly.as_mut()); } else { - ntt.forward_normalized(output_poly, input_poly); + ntt.forward(output_poly.as_mut_view(), input_poly); + } + if matches!(option, NttLweBootstrapKeyOption::Normalize) { + ntt.plan.normalize(output_poly.as_mut()); } } } @@ -365,6 +368,7 @@ pub fn convert_standard_lwe_bootstrap_key_to_ntt64( pub fn par_convert_standard_lwe_bootstrap_key_to_ntt64( input_bsk: &LweBootstrapKey, output_bsk: &mut NttLweBootstrapKey, + option: NttLweBootstrapKeyOption, ) where InputCont: Container + std::marker::Sync, OutputCont: ContainerMut, @@ -434,9 +438,11 @@ pub fn par_convert_standard_lwe_bootstrap_key_to_ntt64( output_poly.as_mut_view(), input_poly, ); - ntt.plan.normalize(output_poly.as_mut()); } else { - ntt.forward_normalized(output_poly, input_poly); + ntt.forward(output_poly.as_mut_view(), input_poly); + } + if matches!(option, NttLweBootstrapKeyOption::Normalize) { + ntt.plan.normalize(output_poly.as_mut()); } } }); diff --git a/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_bnf_pbs.rs b/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_bnf_pbs.rs index 5e0030767..d9bf04e88 100644 --- a/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_bnf_pbs.rs +++ b/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_bnf_pbs.rs @@ -92,8 +92,12 @@ use dyn_stack::{PodStack, SizeOverflow, StackReq}; /// ); /// /// // Use the conversion function (a memory optimized version also exists but is more complicated -/// // to use) to convert the standard 
bootstrapping key to the Fourier domain -/// convert_standard_lwe_bootstrap_key_to_ntt64(&std_bootstrapping_key, &mut ntt_bsk); +/// // to use) to convert the standard bootstrapping key to the Fourier domain. +/// convert_standard_lwe_bootstrap_key_to_ntt64( +/// &std_bootstrapping_key, +/// &mut ntt_bsk, +/// NttLweBootstrapKeyOption::Raw, +/// ); /// // We don't need the standard bootstrapping key anymore /// drop(std_bootstrapping_key); /// @@ -161,12 +165,13 @@ use dyn_stack::{PodStack, SizeOverflow, StackReq}; /// "Multiplication via PBS result is correct! Expected 6, got {pbs_multiplication_result}" /// ); /// ``` -pub fn blind_rotate_ntt64_bnf_assign( +pub fn blind_rotate_ntt64_bnf_assign( input: &LweCiphertext, lut: &mut GlweCiphertext, bsk: &NttLweBootstrapKey, ) where - InputCont: Container, + InputScalar: UnsignedInteger + CastInto, + InputCont: Container, OutputCont: ContainerMut, KeyCont: Container, { @@ -194,21 +199,22 @@ pub fn blind_rotate_ntt64_bnf_assign( /// a properly configured [`Ntt64View`] object and a `PodStack` used as a memory buffer having a /// capacity at least as large as the result of /// [`blind_rotate_ntt64_bnf_assign_mem_optimized_requirement`]. 
-pub fn blind_rotate_ntt64_bnf_assign_mem_optimized( +pub fn blind_rotate_ntt64_bnf_assign_mem_optimized( input: &LweCiphertext, lut: &mut GlweCiphertext, bsk: &NttLweBootstrapKey, ntt: Ntt64View<'_>, stack: &mut PodStack, ) where - InputCont: Container, + InputScalar: UnsignedInteger + CastInto, + InputCont: Container, OutputCont: ContainerMut, KeyCont: Container, { - fn implementation( + fn implementation>( bsk: NttLweBootstrapKeyView<'_, u64>, lut: GlweCiphertextMutView<'_, u64>, - lwe: &[u64], + lwe: &[InputScalar], ntt: Ntt64View<'_>, stack: &mut PodStack, ) { @@ -226,7 +232,7 @@ pub fn blind_rotate_ntt64_bnf_assign_mem_optimized( .into_chunks(poly_size) .map(PolynomialMutView::from_container), ) - .for_each(|(out, ntt_poly)| { + .for_each(|(out, mut ntt_poly)| { + // NB: Bnf implementation apply normalization on each bwd path + ntt.plan.normalize(ntt_poly.as_mut()); ntt.add_backward_on_power_of_two_modulus(power_of_two_modulus_width, out, ntt_poly); }); } diff --git a/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_pbs.rs b/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_pbs.rs index 76364e1d5..65e717020 100644 --- a/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_pbs.rs +++ b/tfhe/src/core_crypto/algorithms/lwe_programmable_bootstrapping/ntt64_pbs.rs @@ -99,8 +99,13 @@ use dyn_stack::{PodStack, SizeOverflow, StackReq}; /// ); /// /// // Use the conversion function (a memory optimized version also exists but is more complicated -/// // to use) to convert the standard bootstrapping key to the Fourier domain -/// convert_standard_lwe_bootstrap_key_to_ntt64(&std_bootstrapping_key, &mut ntt_bsk); +/// // to use) to convert the standard bootstrapping key to the Fourier domain. 
+/// // The obtained coefficient contains INtt normalization +/// convert_standard_lwe_bootstrap_key_to_ntt64( +/// &std_bootstrapping_key, +/// &mut ntt_bsk, +/// NttLweBootstrapKeyOption::Normalize, +/// ); /// // We don't need the standard bootstrapping key anymore /// drop(std_bootstrapping_key); /// @@ -354,8 +359,13 @@ pub fn blind_rotate_ntt64_assign_mem_optimized( /// ); /// /// // Use the conversion function (a memory optimized version also exists but is more complicated -/// // to use) to convert the standard bootstrapping key to the Fourier domain -/// convert_standard_lwe_bootstrap_key_to_ntt64(&std_bootstrapping_key, &mut ntt_bsk); +/// // to use) to convert the standard bootstrapping key to the Fourier domain. +/// // The obtained coefficient contains INtt normalization +/// convert_standard_lwe_bootstrap_key_to_ntt64( +/// &std_bootstrapping_key, +/// &mut ntt_bsk, +/// NttLweBootstrapKeyOption::Normalize, +/// ); /// // We don't need the standard bootstrapping key anymore /// drop(std_bootstrapping_key); /// diff --git a/tfhe/src/core_crypto/algorithms/test/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/algorithms/test/lwe_programmable_bootstrapping.rs index 257b988e8..0f0bc84f2 100644 --- a/tfhe/src/core_crypto/algorithms/test/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/algorithms/test/lwe_programmable_bootstrapping.rs @@ -784,7 +784,11 @@ fn lwe_encrypt_pbs_ntt64_decrypt_custom_mod(params: ClassicTestParams) { buffers.resize(stack_size); - par_convert_standard_lwe_bootstrap_key_to_ntt64(&bsk, &mut nbsk); + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &bsk, + &mut nbsk, + NttLweBootstrapKeyOption::Normalize, + ); drop(bsk); @@ -1079,7 +1083,11 @@ fn lwe_encrypt_pbs_ntt64_bnf_decrypt(params: ClassicTestParams) { buffers.resize(stack_size); - par_convert_standard_lwe_bootstrap_key_to_ntt64(&bsk, &mut nbsk); + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &bsk, + &mut nbsk, + NttLweBootstrapKeyOption::Raw, + ); 
drop(bsk); diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/lwe_hpu_noise.rs b/tfhe/src/core_crypto/algorithms/test/noise_distribution/lwe_hpu_noise.rs new file mode 100644 index 000000000..2270224bc --- /dev/null +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/lwe_hpu_noise.rs @@ -0,0 +1,637 @@ +use super::*; +use crate::core_crypto::commons::math::ntt::ntt64::Ntt64; +use crate::core_crypto::commons::test_tools::{torus_modular_diff, variance}; +use std::io; + +// This is 1 / 16 which is exactly representable in an f64 (even an f32) +// 1 / 32 is too strict and fails the tests +const RELATIVE_TOLERANCE: f64 = 0.0625; + +const NB_HPU_TESTS: usize = 5; +const NB_PBS: usize = 200; + +#[derive(Clone, Copy)] +pub struct HpuTestParams { + pub lwe_dimension: LweDimension, + pub glwe_dimension: GlweDimension, + pub polynomial_size: PolynomialSize, + pub lwe_noise_distribution: DynamicDistribution, + pub glwe_noise_distribution: DynamicDistribution, + pub pbs_base_log: DecompositionBaseLog, + pub pbs_level: DecompositionLevelCount, + pub ks_base_log: DecompositionBaseLog, + pub ks_level: DecompositionLevelCount, + pub message_modulus_log: CiphertextModulusLog, + pub ct_width: usize, + pub ksk_width: usize, + pub norm2: u64, + pub ntt_modulus: u64, +} +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_HPU_44_KS_21: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(742), + glwe_dimension: GlweDimension(2), + polynomial_size: PolynomialSize(1024), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 1.259_780_968_897_627_7e-5, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 2.2737367544323206e-13, + )), + pbs_base_log: DecompositionBaseLog(20), + pbs_level: DecompositionLevelCount(1), + ks_level: DecompositionLevelCount(7), + ks_base_log: DecompositionBaseLog(2), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 44, + ksk_width: 21, + 
norm2: 5, + ntt_modulus: 17592186028033, +}; + +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(786), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 5.314_123_935_599_821e-6, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 9.1881734381394e-16, + )), + pbs_base_log: DecompositionBaseLog(24), + pbs_level: DecompositionLevelCount(1), + ks_level: DecompositionLevelCount(8), + ks_base_log: DecompositionBaseLog(2), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 64, + ksk_width: 21, + norm2: 5, + ntt_modulus: 18446744069414584321, +}; + +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21_132_GAUSSIAN: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(804), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 5.963_599_673_924_788e-6, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 2.8452674713391114e-15, + )), + pbs_base_log: DecompositionBaseLog(23), + pbs_level: DecompositionLevelCount(1), + ks_level: DecompositionLevelCount(8), + ks_base_log: DecompositionBaseLog(2), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 64, + ksk_width: 21, + norm2: 5, + ntt_modulus: 18446744069414584321, +}; + +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21_132_TUNIFORM: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(839), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_t_uniform(4), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(17), + pbs_base_log: DecompositionBaseLog(23), + pbs_level: DecompositionLevelCount(1), + 
ks_level: DecompositionLevelCount(7), + ks_base_log: DecompositionBaseLog(2), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 64, + ksk_width: 21, + norm2: 5, + ntt_modulus: 18446744069414584321, +}; + +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_NATIVE_U64: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(742), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 0.000007069849454709433, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 0.00000000000000029403601535432533, + )), + pbs_base_log: DecompositionBaseLog(23), + pbs_level: DecompositionLevelCount(1), + ks_level: DecompositionLevelCount(5), + ks_base_log: DecompositionBaseLog(3), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 64, + ksk_width: 64, + norm2: 5, + ntt_modulus: 18446744069414584321, +}; + +#[allow(unused)] +pub const HPU_TEST_PARAMS_4_BITS_NATIVE_U64_132_BITS_GAUSSIAN: HpuTestParams = HpuTestParams { + lwe_dimension: LweDimension(841), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 3.149_667_468_577_243_5e-6, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 2.845267479601915e-15, + )), + pbs_base_log: DecompositionBaseLog(22), + pbs_level: DecompositionLevelCount(1), + ks_level: DecompositionLevelCount(5), + ks_base_log: DecompositionBaseLog(3), + message_modulus_log: CiphertextModulusLog(4), + ct_width: 64, + ksk_width: 64, + norm2: 5, + ntt_modulus: 18446744069414584321, +}; + +pub fn get_modulo_value(modulus: &CiphertextModulus) -> u128 { + if modulus.is_native_modulus() { + let converted: CiphertextModulus = modulus.try_to().unwrap(); + u128::cast_from(converted.get_custom_modulus()) + } else { + 
u128::cast_from(modulus.get_custom_modulus()) + } +} + +//fn lwe_noise_distribution_hpu + CastFrom>( +fn hpu_noise_distribution(params: HpuTestParams) { + let lwe_dimension = params.lwe_dimension; + let glwe_dimension = params.glwe_dimension; + let polynomial_size = params.polynomial_size; + let lwe_noise_distribution = params.lwe_noise_distribution; + let glwe_noise_distribution = params.glwe_noise_distribution; + let ciphertext_modulus = CiphertextModulus::try_new_power_of_2(params.ct_width).unwrap(); + let message_modulus_log = params.message_modulus_log; + let ks_decomp_base_log = params.ks_base_log; + let ks_decomp_level_count = params.ks_level; + let pbs_decomp_base_log = params.pbs_base_log; + let pbs_decomp_level_count = params.pbs_level; + let ksk_modulus = CiphertextModulus::try_new_power_of_2(params.ksk_width).unwrap(); + let ntt_modulus = CiphertextModulus::::new(params.ntt_modulus as u128); + + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + let ksk_encoding_with_padding = get_encoding_with_padding(ksk_modulus); + let expected_variance = match glwe_noise_distribution { + DynamicDistribution::Gaussian(_) => { + glwe_noise_distribution.gaussian_std_dev().get_variance() + } + DynamicDistribution::TUniform(tuniform) => Variance( + ((2.0 * (tuniform.bound_log2() as f64) + 1.0).exp2() + 1.0) / 6.0 + * (-2.0 * (params.ct_width as f64)).exp2(), + ), + }; + + let mut rsc = TestResources::new(); + + let msg_modulus = 1 << message_modulus_log.0; + let mut msg: u64 = msg_modulus; + let delta: u64 = encoding_with_padding / msg_modulus; + let ks_delta: u64 = ksk_encoding_with_padding / msg_modulus; + let norm2 = params.norm2; + + let num_samples = NB_PBS * NB_HPU_TESTS * (msg as usize); + let mut noise_samples = (0..4) + .map(|_| Vec::with_capacity(num_samples)) + .collect::>(); + println!("ciphertext_modulus {ciphertext_modulus:?} ksk_modulus {ksk_modulus:?} message_modulus_log {message_modulus_log:?} encoding_with_padding 
{encoding_with_padding } expected_variance {expected_variance:?} msg_modulus {msg_modulus} msg {msg} delta {delta}"); + + let f = |x: u64| x.wrapping_rem(msg_modulus); + + let accumulator = generate_programmable_bootstrap_glwe_lut( + polynomial_size, + glwe_dimension.to_glwe_size(), + msg_modulus.cast_into(), + ciphertext_modulus, + delta, + f, + ); + + assert!(check_encrypted_content_respects_mod( + &accumulator, + ciphertext_modulus + )); + + let mut lwe_sk = LweSecretKeyOwned::new_empty_key(0, lwe_dimension); + + let mut glwe_sk = GlweSecretKeyOwned::new_empty_key(0, glwe_dimension, polynomial_size); + + let mut blwe_sk = glwe_sk.clone().into_lwe_secret_key(); + let mut ksk_in_kskmod = LweKeyswitchKeyOwned::new( + 0, + ks_decomp_base_log, + ks_decomp_level_count, + blwe_sk.lwe_dimension(), + lwe_sk.lwe_dimension(), + ksk_modulus, + ); + + println!( + "n {:?} k {:?} N {:?} k*N {:?}", + lwe_sk.lwe_dimension(), + glwe_dimension, + polynomial_size, + blwe_sk.lwe_dimension() + ); + + // it includes variance of mod switch from KS modulus to 2N + let (exp_add_ks_variance, _exp_modswitch_variance) = match lwe_noise_distribution { + DynamicDistribution::Gaussian(_) => { + variance_formula::lwe_keyswitch::keyswitch_additive_variance_132_bits_security_gaussian( + glwe_dimension, + polynomial_size, + lwe_sk.lwe_dimension(), + ks_decomp_level_count, + ks_decomp_base_log, + get_modulo_value(&ksk_modulus) as f64, + get_modulo_value(&ciphertext_modulus) as f64, + ) + } + DynamicDistribution::TUniform(_) => { + variance_formula::lwe_keyswitch::keyswitch_additive_variance_132_bits_security_tuniform( + glwe_dimension, + polynomial_size, + lwe_sk.lwe_dimension(), + ks_decomp_level_count, + ks_decomp_base_log, + get_modulo_value(&ksk_modulus) as f64, + get_modulo_value(&ciphertext_modulus) as f64, + ) + } + }; + println!( + "KS additive theo variance: {:?} theo std_dev {:?} / {:?}", + exp_add_ks_variance.0, + exp_add_ks_variance.get_standard_dev(), + 
exp_add_ks_variance.get_log_standard_dev() + ); + let mut bsk = LweBootstrapKey::new( + 0, + glwe_dimension.to_glwe_size(), + polynomial_size, + pbs_decomp_base_log, + pbs_decomp_level_count, + lwe_dimension, + ciphertext_modulus, + ); + + let exp_pbs_variance = match lwe_noise_distribution { + DynamicDistribution::Gaussian(_) => { + variance_formula::lwe_programmable_bootstrap::pbs_variance_132_bits_security_gaussian( + lwe_dimension, + glwe_dimension, + polynomial_size, + pbs_decomp_level_count, + pbs_decomp_base_log, + get_modulo_value(&ciphertext_modulus) as f64, + get_modulo_value(&ntt_modulus) as f64, + ) + } + DynamicDistribution::TUniform(_) => { + variance_formula::lwe_programmable_bootstrap::pbs_variance_132_bits_security_tuniform( + lwe_dimension, + glwe_dimension, + polynomial_size, + pbs_decomp_level_count, + pbs_decomp_base_log, + get_modulo_value(&ciphertext_modulus) as f64, + get_modulo_value(&ntt_modulus) as f64, + ) + } + }; + println!( + "PBS theo variance: {:?} std_dev {:?}/{:?}", + exp_pbs_variance.0, + exp_pbs_variance.get_standard_dev(), + exp_pbs_variance.get_log_standard_dev() + ); + + let mut nbsk = NttLweBootstrapKeyOwned::::new( + 0, + bsk.input_lwe_dimension(), + bsk.glwe_size(), + bsk.polynomial_size(), + bsk.decomposition_base_log(), + bsk.decomposition_level_count(), + ntt_modulus, + ); + + let mut buffers = ComputationBuffers::new(); + + let ntt = Ntt64::new(ntt_modulus, nbsk.polynomial_size()); + let ntt = ntt.as_view(); + + let stack_size = programmable_bootstrap_ntt64_bnf_lwe_ciphertext_mem_optimized_requirement( + glwe_dimension.to_glwe_size(), + polynomial_size, + ntt, + ) + .unwrap() + .try_unaligned_bytes_required() + .unwrap(); + + buffers.resize(stack_size); + + while msg != 0 { + msg = msg.wrapping_sub(1); + for i in 0..NB_HPU_TESTS { + // re-generate keys + generate_binary_lwe_secret_key(&mut lwe_sk, &mut rsc.secret_random_generator); + generate_binary_glwe_secret_key(&mut glwe_sk, &mut rsc.secret_random_generator); + 
blwe_sk = glwe_sk.clone().into_lwe_secret_key(); + + // re-generate KSK + generate_lwe_keyswitch_key( + &blwe_sk, + &lwe_sk, + &mut ksk_in_kskmod, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + // re-generate BSK + par_generate_lwe_bootstrap_key( + &lwe_sk, + &glwe_sk, + &mut bsk, + glwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + nbsk = NttLweBootstrapKeyOwned::::new( + 0, + bsk.input_lwe_dimension(), + bsk.glwe_size(), + bsk.polynomial_size(), + bsk.decomposition_base_log(), + bsk.decomposition_level_count(), + ntt_modulus, + ); + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &bsk, + &mut nbsk, + NttLweBootstrapKeyOption::Raw, + ); + assert!(check_encrypted_content_respects_mod( + &*bsk, + ciphertext_modulus + )); + + // encrypt + let mut ct = + LweCiphertext::new(0, blwe_sk.lwe_dimension().to_lwe_size(), ciphertext_modulus); + let mut out_ks_ct = LweCiphertext::new(0, ksk_in_kskmod.output_lwe_size(), ksk_modulus); + + let plaintext = Plaintext(msg * delta); + + encrypt_lwe_ciphertext( + &blwe_sk, + &mut ct, + plaintext, + glwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &ct, + ciphertext_modulus + )); + + let decrypted = decrypt_lwe_ciphertext(&blwe_sk, &ct); + + let decoded = round_decode(decrypted.0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + + let torus_diff = torus_modular_diff(plaintext.0, decrypted.0, ciphertext_modulus); + noise_samples[0].push(torus_diff); + + for j in 0..NB_PBS { + // b = b - (Delta * msg) to have an encryption of 0 + lwe_ciphertext_plaintext_sub_assign(&mut ct, plaintext); + + assert!(check_encrypted_content_respects_mod( + &ct, + ciphertext_modulus + )); + // * norm2 + //lwe_ciphertext_cleartext_mul_assign(&mut ct, + // Cleartext(Scalar::cast_from(norm2))); + lwe_ciphertext_cleartext_mul_assign(&mut ct, Cleartext(norm2)); + + assert!(check_encrypted_content_respects_mod( + &ct, + ciphertext_modulus 
+ )); + + let decrypted_prodnorm2 = decrypt_lwe_ciphertext(&blwe_sk, &ct); + + let decode_prodnorm2 = round_decode(decrypted_prodnorm2.0, delta) % msg_modulus; + + let torus_diff = torus_modular_diff(0, decrypted_prodnorm2.0, ciphertext_modulus); + assert_eq!(0, decode_prodnorm2); + noise_samples[1].push(torus_diff); + // b = b + (Delta * msg) to have a noisy encryption of msg + lwe_ciphertext_plaintext_add_assign(&mut ct, plaintext); + + assert!(check_encrypted_content_respects_mod( + &ct, + ciphertext_modulus + )); + + // Compute key-switch + keyswitch_lwe_ciphertext(&ksk_in_kskmod, &ct, &mut out_ks_ct); + + assert!(check_encrypted_content_respects_mod( + &out_ks_ct, + ksk_modulus + )); + // Noise extraction and decryption check + // NB: After key-switch ciphertext is on ksk_modulus != ct_modulus + + let decrypted_after_ks = decrypt_lwe_ciphertext(&lwe_sk, &out_ks_ct); + + let decode_after_ks = round_decode(decrypted_after_ks.0, ks_delta) % msg_modulus; + + assert_eq!(msg, decode_after_ks); + + // do modulo switch on plaintext post KS only if necessary + let cm_f = get_modulo_value(&ciphertext_modulus); + let ksm_f = get_modulo_value(&ksk_modulus); + let torus_diff = if cm_f == ksm_f { + torus_modular_diff(plaintext.0, decrypted_after_ks.0, ciphertext_modulus) + } else { + let decrypted_after_ks_modswitched = + decrypted_after_ks.0 * ((cm_f / ksm_f) as u64); + torus_modular_diff( + plaintext.0, + decrypted_after_ks_modswitched, + ciphertext_modulus, + ) + }; + + noise_samples[2].push(torus_diff); + + // Compute PBS with NTT + programmable_bootstrap_ntt64_bnf_lwe_ciphertext_mem_optimized( + &out_ks_ct, + &mut ct, + &accumulator, + &nbsk, + ntt, + buffers.stack(), + ); + + assert!(check_encrypted_content_respects_mod( + &ct, + ciphertext_modulus + )); + + let decrypted_pbs = decrypt_lwe_ciphertext(&blwe_sk, &ct); + + let decoded_pbs = round_decode(decrypted_pbs.0, delta) % msg_modulus; + + assert_eq!(decoded_pbs, f(msg)); + let torus_diff = + 
torus_modular_diff(plaintext.0, decrypted_pbs.0, ciphertext_modulus); + println!("after pbs (msg={msg},test_nb={i}/{NB_HPU_TESTS},pbs_nb={j}/{NB_PBS}): plaintext {:?} post pbs {:?} torus_diff {:?}", plaintext.0, decrypted_pbs.0, torus_diff); + noise_samples[3].push(torus_diff); + } + } + } + + let encryption_variance = variance(&noise_samples[0]); + let bynorm2_variance = variance(&noise_samples[1]); + let after_ks_variance = variance(&noise_samples[2]); + let after_pbs_variance = variance(&noise_samples[3]); + println!( + "exp encrypt var {:?} encrypt var {:?} bynorm2 var {} after_ks_variance {} after_pbs_variance {:?}", + expected_variance.0, + encryption_variance.0, + bynorm2_variance.0, + after_ks_variance.0, + after_pbs_variance.0 + ); + // variance after *norm2 must be around (exp_pbs_variance)*(norm2**2) + // variance after KS must be around (exp_pbs_variance)*(norm2**2)+exp_add_ks_variance + // variance after PBS must be around (exp_pbs_variance) + let expected_bynorm2_variance = Variance(exp_pbs_variance.0 * (norm2 as f64).powf(2.0)); + let expected_after_ks_variance = Variance(expected_bynorm2_variance.0 + exp_add_ks_variance.0); + + let mut wtr = csv::Writer::from_writer(io::stdout()); + let _ = wtr.write_record([ + "data type", + "encrypt exp", + "encrypt", + "post *norm2", + "post KS", + "theo KS", + "post PBS", + "theo PBS", + ]); + let _ = wtr.write_record([ + "variances", + expected_variance.0.to_string().as_str(), + encryption_variance.0.to_string().as_str(), + bynorm2_variance.0.to_string().as_str(), + after_ks_variance.0.to_string().as_str(), + expected_after_ks_variance.0.to_string().as_str(), + after_pbs_variance.0.to_string().as_str(), + exp_pbs_variance.0.to_string().as_str(), + ]); + let _ = wtr.write_record([ + "std_dev", + expected_variance.get_standard_dev().0.to_string().as_str(), + encryption_variance + .get_standard_dev() + .0 + .to_string() + .as_str(), + bynorm2_variance.get_standard_dev().0.to_string().as_str(), + 
after_ks_variance.get_standard_dev().0.to_string().as_str(), + expected_after_ks_variance + .get_standard_dev() + .0 + .to_string() + .as_str(), + after_pbs_variance.get_standard_dev().0.to_string().as_str(), + exp_pbs_variance.get_standard_dev().0.to_string().as_str(), + ]); + let _ = wtr.write_record([ + "log2 std_dev + ct_w", + (expected_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (encryption_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (bynorm2_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (after_ks_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (expected_after_ks_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (after_pbs_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + (exp_pbs_variance.get_log_standard_dev().0 + params.ct_width as f64) + .to_string() + .as_str(), + ]); + + let var_pbs_abs_diff = (exp_pbs_variance.0 - after_pbs_variance.0).abs(); + let pbs_tolerance_thres = RELATIVE_TOLERANCE * exp_pbs_variance.0; + + let var_ksk_abs_diff = (expected_after_ks_variance.0 - after_ks_variance.0).abs(); + let ks_tolerance_thres = RELATIVE_TOLERANCE * expected_after_ks_variance.0; + + let var_bynorm2_abs_diff = (expected_bynorm2_variance.0 - bynorm2_variance.0).abs(); + let bynorm2_tolerance_thres = RELATIVE_TOLERANCE * expected_bynorm2_variance.0; + + let after_pbs_errbit = params.ct_width as f64 + after_pbs_variance.get_log_standard_dev().0; + let after_pbs_exp_errbit = params.ct_width as f64 + exp_pbs_variance.get_log_standard_dev().0; + let bynorm2_errbit = params.ct_width as f64 + bynorm2_variance.get_log_standard_dev().0; + let bynorm2_exp_errbit = + params.ct_width as f64 + expected_bynorm2_variance.get_log_standard_dev().0; + let after_ks_errbit = params.ct_width as f64 + 
after_ks_variance.get_log_standard_dev().0; + let after_ks_exp_errbit = + params.ct_width as f64 + expected_after_ks_variance.get_log_standard_dev().0; + assert!( + var_pbs_abs_diff < pbs_tolerance_thres, + "Absolute difference for after PBS is incorrect: {var_pbs_abs_diff} >= {pbs_tolerance_thres}, \ + got variance: {after_pbs_variance:?} - log2(str_dev): {after_pbs_errbit:?}, \ + expected variance: {exp_pbs_variance:?} - log2(std_dev): {after_pbs_exp_errbit:?}" + ); + assert!( + var_bynorm2_abs_diff < bynorm2_tolerance_thres, + "Absolute difference for after *norm2 in incorrect: {var_bynorm2_abs_diff} >= {bynorm2_tolerance_thres} \ + got variance: {bynorm2_variance:?} - log2(str_dev): {bynorm2_errbit:?}, \ + expected variance: {expected_bynorm2_variance:?} - log2(std_dev): {bynorm2_exp_errbit:?}" + ); + assert!( + (var_ksk_abs_diff < ks_tolerance_thres) || (after_ks_errbit < after_ks_exp_errbit && (after_ks_exp_errbit - after_ks_errbit < 1f64)), + "Absolute difference for after KS is incorrect: {var_ksk_abs_diff} >= {ks_tolerance_thres} or more than 1 bit away \ + got variance: {after_ks_variance:?} - log2(str_dev): {after_ks_errbit:?}, \ + expected variance: {expected_after_ks_variance:?} - log2(std_dev): {after_ks_exp_errbit:?}" + ); +} + +create_parameterized_test!(hpu_noise_distribution { + //HPU_TEST_PARAMS_4_BITS_NATIVE_U64, + //HPU_TEST_PARAMS_4_BITS_HPU_44_KS_21, + //HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21, + HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21_132_GAUSSIAN, + HPU_TEST_PARAMS_4_BITS_HPU_64_KS_21_132_TUNIFORM, + //HPU_TEST_PARAMS_4_BITS_NATIVE_U64_132_BITS_GAUSSIAN, +}); diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/mod.rs b/tfhe/src/core_crypto/algorithms/test/noise_distribution/mod.rs index c5bc4de0b..e77516ce4 100644 --- a/tfhe/src/core_crypto/algorithms/test/noise_distribution/mod.rs +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/mod.rs @@ -1,12 +1,14 @@ use super::*; mod lwe_encryption_noise; +mod lwe_hpu_noise; mod 
lwe_keyswitch_noise; // We are having crashes on aarch64 at the moment, problem is the code paths are not the same // between archs, so we disable those on the Apple M1 #[cfg(not(target_arch = "aarch64"))] mod lwe_multi_bit_programmable_bootstrapping_noise; mod lwe_programmable_bootstrapping_noise; +mod variance_formula; #[allow(clippy::excessive_precision)] pub const NOISE_TEST_PARAMS_4_BITS_NATIVE_U64_132_BITS_GAUSSIAN: ClassicTestParams = diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_keyswitch.rs b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_keyswitch.rs new file mode 100644 index 000000000..dcd42c687 --- /dev/null +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_keyswitch.rs @@ -0,0 +1,93 @@ +use crate::core_crypto::commons::dispersion::Variance; +use crate::core_crypto::commons::parameters::*; + +pub fn keyswitch_additive_variance_132_bits_security_gaussian( + input_glwe_dimension: GlweDimension, + input_polynomial_size: PolynomialSize, + output_lwe_dimension: LweDimension, + decomposition_level_count: DecompositionLevelCount, + decomposition_base_log: DecompositionBaseLog, + ksk_modulus: f64, + ct_modulus: f64, +) -> (Variance, Variance) { + let var_min = super::secure_noise::minimal_lwe_variance_for_132_bits_security_gaussian( + output_lwe_dimension, + ksk_modulus, + ); + let (var_ks, var_modswitch) = keyswitch_additive_variance_impl( + input_glwe_dimension.0 as f64, + input_polynomial_size.0 as f64, + output_lwe_dimension.0 as f64, + var_min.0, + decomposition_level_count.0 as f64, + decomposition_base_log.0 as i32, + ksk_modulus, + ct_modulus, + ); + (Variance(var_ks), Variance(var_modswitch)) +} + +pub fn keyswitch_additive_variance_132_bits_security_tuniform( + input_glwe_dimension: GlweDimension, + input_polynomial_size: PolynomialSize, + output_lwe_dimension: LweDimension, + decomposition_level_count: DecompositionLevelCount, + 
decomposition_base_log: DecompositionBaseLog, + ksk_modulus: f64, + ct_modulus: f64, +) -> (Variance, Variance) { + let var_min = super::secure_noise::minimal_lwe_variance_for_132_bits_security_tuniform( + output_lwe_dimension, + ksk_modulus, + ); + let (var_ks, var_modswitch) = keyswitch_additive_variance_impl( + input_glwe_dimension.0 as f64, + input_polynomial_size.0 as f64, + output_lwe_dimension.0 as f64, + var_min.0, + decomposition_level_count.0 as f64, + decomposition_base_log.0 as i32, + ksk_modulus, + ct_modulus, + ); + (Variance(var_ks), Variance(var_modswitch)) +} + +#[allow(clippy::too_many_arguments)] +pub fn keyswitch_additive_variance_impl( + input_glwe_dimension: f64, + input_polynomial_size: f64, + output_lwe_dimension: f64, + var_min: f64, + decomposition_level_count: f64, + decomposition_base_log: i32, + ksk_modulus: f64, + ct_modulus: f64, +) -> (f64, f64) { + //let decomposition_base = 2.0f64.powi(decomposition_base_log.0 as i32); + let pow2_2bl = 2.0f64.powi(2 * (decomposition_level_count as i32) * decomposition_base_log); + let ks_0 = ((input_glwe_dimension * input_polynomial_size) / 2.0) + * (1.0 / pow2_2bl + 2.0 * ct_modulus.powf(-2.0)) + / 12.0; + let ks_1 = (2.0 * ct_modulus.powf(-2.0) + ksk_modulus.powf(-2.0)) / 12.0; + let ks_2 = var_min + * (input_glwe_dimension * input_polynomial_size) + * decomposition_level_count + * (2.0f64.powi(2 * decomposition_base_log) + 2.0) + / 12.0; + + let var_modswitch = (1.0 + output_lwe_dimension / 2.0) + * ((2.0 * input_polynomial_size).powf(-2.0) + 2.0 * ksk_modulus.powf(-2.0)) + / 12.0; + + println!( + "KS ad var {:?} + {:?} + {:?} = {:?} / mod switch KS-2N {:?}", + ks_0, + ks_1, + ks_2, + ks_0 + ks_1 + ks_2, + var_modswitch + ); + //ks_0 + ks_1 + ks_2 + var_modswitch + (ks_0 + ks_1 + ks_2, var_modswitch) +} diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_programmable_bootstrap.rs 
b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_programmable_bootstrap.rs new file mode 100644 index 000000000..dd9ada8a1 --- /dev/null +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/lwe_programmable_bootstrap.rs @@ -0,0 +1,87 @@ +use crate::core_crypto::commons::dispersion::Variance; +use crate::core_crypto::commons::parameters::*; + +pub fn pbs_variance_132_bits_security_gaussian( + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + decomposition_level_count: DecompositionLevelCount, + decomposition_base_log: DecompositionBaseLog, + ciphertext_modulus: f64, + ntt_modulus: f64, +) -> Variance { + let var_min = super::secure_noise::minimal_glwe_variance_for_132_bits_security_gaussian( + glwe_dimension, + polynomial_size, + ciphertext_modulus, + ); + Variance(pbs_variance_impl( + lwe_dimension.0 as f64, + glwe_dimension.0 as f64, + polynomial_size.0 as f64, + var_min.0 as f64, + decomposition_level_count.0 as f64, + decomposition_base_log.0 as f64, + ciphertext_modulus, + ntt_modulus, + )) +} + +pub fn pbs_variance_132_bits_security_tuniform( + lwe_dimension: LweDimension, + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + decomposition_level_count: DecompositionLevelCount, + decomposition_base_log: DecompositionBaseLog, + ciphertext_modulus: f64, + ntt_modulus: f64, +) -> Variance { + let var_min = super::secure_noise::minimal_glwe_variance_for_132_bits_security_tuniform( + glwe_dimension, + polynomial_size, + ciphertext_modulus, + ); + Variance(pbs_variance_impl( + lwe_dimension.0 as f64, + glwe_dimension.0 as f64, + polynomial_size.0 as f64, + var_min.0 as f64, + decomposition_level_count.0 as f64, + decomposition_base_log.0 as f64, + ciphertext_modulus, + ntt_modulus, + )) +} + +#[allow(clippy::too_many_arguments)] +pub fn pbs_variance_impl( + lwe_dimension: f64, + glwe_dimension: f64, + polynomial_size: f64, + var_min: f64, + 
decomposition_level_count: f64, + decomposition_base_log: f64, + ciphertext_modulus: f64, + ntt_modulus: f64, +) -> f64 { + let pow2_2b = (2.0 * decomposition_base_log).exp2(); + let pow2_bl = (decomposition_level_count * decomposition_base_log).exp2(); + let ntt2q_factor = + ciphertext_modulus.powf(-2.0) + (ciphertext_modulus * ntt_modulus).powf(-2.0); + let q2ntt_factor = ntt_modulus.powf(-2.0) + 2.0 * (ciphertext_modulus * ntt_modulus).powf(-2.0); + let var_ntt_to_q = glwe_dimension * polynomial_size / 24.0 * ntt2q_factor + ntt2q_factor / 12.0; + let var_q_to_ntt = glwe_dimension * polynomial_size / 24.0 * q2ntt_factor + q2ntt_factor / 12.0; + let var_modswitch = (1.0 + (glwe_dimension * polynomial_size) / 2.0) + * (pow2_bl.powf(-2.0) + 2.0 * ciphertext_modulus.powf(-2.0)) + / 12.0; + let var_ext_product = decomposition_level_count + * (glwe_dimension + 1.0) + * polynomial_size + * ((pow2_2b / 12.0 + 1.0 / 6.0) + * (var_min + var_q_to_ntt * ciphertext_modulus.powf(2.0) * ntt_modulus.powf(-2.0))) + + var_modswitch / 2.0 + + var_ntt_to_q; + println!( + "PBS components var_modswitch {var_modswitch:?} var_ext_product {var_ext_product:?} lwe_dimension {lwe_dimension}"); + lwe_dimension * var_ext_product +} diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/mod.rs b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/mod.rs new file mode 100644 index 000000000..286657315 --- /dev/null +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/mod.rs @@ -0,0 +1,3 @@ +pub mod lwe_keyswitch; +pub mod lwe_programmable_bootstrap; +pub mod secure_noise; diff --git a/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/secure_noise.rs b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/secure_noise.rs new file mode 100644 index 000000000..eda496b52 --- /dev/null +++ b/tfhe/src/core_crypto/algorithms/test/noise_distribution/variance_formula/secure_noise.rs @@ 
-0,0 +1,70 @@ +use crate::core_crypto::commons::dispersion::Variance; +use crate::core_crypto::commons::parameters::*; + +pub fn minimal_glwe_variance_for_132_bits_security_gaussian( + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + modulus: f64, +) -> Variance { + let lwe_dimension = glwe_dimension.to_equivalent_lwe_dimension(polynomial_size); + minimal_lwe_variance_for_132_bits_security_gaussian(lwe_dimension, modulus) +} + +pub fn minimal_lwe_variance_for_132_bits_security_gaussian( + lwe_dimension: LweDimension, + modulus: f64, +) -> Variance { + Variance(minimal_variance_for_132_bits_security_gaussian_impl( + lwe_dimension.0 as f64, + modulus, + )) +} + +pub fn minimal_glwe_variance_for_132_bits_security_tuniform( + glwe_dimension: GlweDimension, + polynomial_size: PolynomialSize, + modulus: f64, +) -> Variance { + let lwe_dimension = glwe_dimension.to_equivalent_lwe_dimension(polynomial_size); + minimal_lwe_variance_for_132_bits_security_tuniform(lwe_dimension, modulus) +} + +pub fn minimal_lwe_variance_for_132_bits_security_tuniform( + lwe_dimension: LweDimension, + modulus: f64, +) -> Variance { + Variance(minimal_variance_for_132_bits_security_tuniform_impl( + lwe_dimension.0 as f64, + modulus, + )) +} + +pub fn minimal_variance_for_132_bits_security_gaussian_impl( + lwe_dimension: f64, + modulus: f64, +) -> f64 { + // 128b curve + //let slope2=-0.05139355742296919; + //let biais2=5.351862745098032; + // 132b curve + let slope2 = -0.04978291316526609; + let biais2 = 5.31469187675068; + let f = slope2 * lwe_dimension + biais2; + let g = 2.0 * (2.0 - modulus.log2().ceil()); + ((f + g) / 2.0 + (f - g).abs() / 2.0).exp2() +} + +pub fn minimal_variance_for_132_bits_security_tuniform_impl( + lwe_dimension: f64, + modulus: f64, +) -> f64 { + let log2_modulus = modulus.log2(); + let epsilon_var_log2 = 2.0 * (2.2 - log2_modulus); + let slope = -0.025167785; + let biais = 68.100671; + let min_bound = (slope * lwe_dimension + biais + (log2_modulus 
- 64.0)).ceil(); + let theoretical_secure_var_log2 = + (((2.0 * min_bound + 1.0).exp2() + 1.0) / 6.0).log2() - 2.0 * log2_modulus; + println!("log2_modulus: {log2_modulus:?} min_bound: {min_bound:?} theoretical_secure_var_log2: {theoretical_secure_var_log2:?}"); + f64::max(theoretical_secure_var_log2.exp2(), epsilon_var_log2.exp2()) +} diff --git a/tfhe/src/core_crypto/commons/math/decomposition/decomposer.rs b/tfhe/src/core_crypto/commons/math/decomposition/decomposer.rs index 0ebe606db..c55e0893b 100644 --- a/tfhe/src/core_crypto/commons/math/decomposition/decomposer.rs +++ b/tfhe/src/core_crypto/commons/math/decomposition/decomposer.rs @@ -224,6 +224,48 @@ where ) } + /// Generate an iterator over the terms of the decomposition of the input. + /// # Warning + /// This use unbalanced decomposition and shouldn't be used with one-level decomposition + /// The returned iterator yields the terms $\tilde{\theta}\_i$ in order of decreasing $i$. + /// + /// # Example + /// + /// ```rust + /// use tfhe::core_crypto::commons::math::decomposition::SignedDecomposer; + /// use tfhe::core_crypto::commons::numeric::UnsignedInteger; + /// use tfhe::core_crypto::commons::parameters::{DecompositionBaseLog, DecompositionLevelCount}; + /// let decomposer = + /// SignedDecomposer::::new(DecompositionBaseLog(4), DecompositionLevelCount(3)); + /// // 2147483647 == 2^31 - 1 and has a decomposition term == to half_basis + /// for term in decomposer.decompose(2147483647u32) { + /// assert!(1 <= term.level().0); + /// assert!(term.level().0 <= 3); + /// let signed_term = term.value().into_signed(); + /// let half_basis = 2i32.pow(4) / 2i32; + /// assert!( + /// -half_basis <= signed_term, + /// "{} <= {signed_term} failed", + /// -half_basis + /// ); + /// assert!( + /// signed_term <= half_basis, + /// "{signed_term} <= {half_basis} failed" + /// ); + /// } + /// assert_eq!(decomposer.decompose(1).count(), 3); + /// ``` + pub fn decompose_raw(&self, input: Scalar) -> 
SignedDecompositionIter { + // Note that there would be no sense of making the decomposition on an input which was + // not rounded to the closest representable first. We then perform it before decomposing. + SignedDecompositionIter::new( + self.closest_representable(input) + >> (Scalar::BITS - (self.level_count * self.base_log)), + DecompositionBaseLog(self.base_log), + DecompositionLevelCount(self.level_count), + ) + } + /// Recomposes a decomposed value by summing all the terms. /// /// If the input iterator yields $\tilde{\theta}\_i$, this returns diff --git a/tfhe/src/core_crypto/commons/math/ntt/ntt64.rs b/tfhe/src/core_crypto/commons/math/ntt/ntt64.rs index 607e7833d..54e60a3fa 100644 --- a/tfhe/src/core_crypto/commons/math/ntt/ntt64.rs +++ b/tfhe/src/core_crypto/commons/math/ntt/ntt64.rs @@ -108,6 +108,7 @@ impl Ntt64View<'_> { let mut standard = standard; let ntt = ntt.as_mut(); let standard = standard.as_mut(); + self.plan.inv(ntt); // autovectorize diff --git a/tfhe/src/core_crypto/commons/traits/create_from.rs b/tfhe/src/core_crypto/commons/traits/create_from.rs index f8d114c38..5936e3032 100644 --- a/tfhe/src/core_crypto/commons/traits/create_from.rs +++ b/tfhe/src/core_crypto/commons/traits/create_from.rs @@ -4,7 +4,7 @@ /// time. pub trait CreateFrom { /// Concrete type containing enough information to instantiate a new T. - type Metadata: Clone + Copy; + type Metadata: Clone; /// Instantiate a new T using the associated metadata type. 
fn create_from(from: T, meta: Self::Metadata) -> Self; diff --git a/tfhe/src/core_crypto/entities/ntt_lwe_bootstrap_key.rs b/tfhe/src/core_crypto/entities/ntt_lwe_bootstrap_key.rs index 7ad3a0094..bbba7e1b4 100644 --- a/tfhe/src/core_crypto/entities/ntt_lwe_bootstrap_key.rs +++ b/tfhe/src/core_crypto/entities/ntt_lwe_bootstrap_key.rs @@ -12,6 +12,17 @@ use crate::core_crypto::entities::polynomial_list::{PolynomialListMutView, Polyn use aligned_vec::ABox; use tfhe_versionable::Versionize; +/// Enum option for BootstrapKey conversion in the Ntt domain. +/// It enables to choose to embed Ntt normalization inside bootstrap_key or not. +/// +/// NB: Embed normalization inside BSK enable to use a denaturate version of the INtt without +/// normalisation and could save some computations on some architectures +#[derive(Debug, Clone, Copy)] +pub enum NttLweBootstrapKeyOption { + Raw, + Normalize, +} + #[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize, Versionize)] #[versionize(NttLweBootstrapKeyVersions)] pub struct NttLweBootstrapKey diff --git a/tfhe/src/core_crypto/hpu/algorithms/mod.rs b/tfhe/src/core_crypto/hpu/algorithms/mod.rs new file mode 100644 index 000000000..bffde9c47 --- /dev/null +++ b/tfhe/src/core_crypto/hpu/algorithms/mod.rs @@ -0,0 +1,2 @@ +pub mod modswitch; +pub mod order; diff --git a/tfhe/src/core_crypto/hpu/algorithms/modswitch.rs b/tfhe/src/core_crypto/hpu/algorithms/modswitch.rs new file mode 100644 index 000000000..be26d2c34 --- /dev/null +++ b/tfhe/src/core_crypto/hpu/algorithms/modswitch.rs @@ -0,0 +1,37 @@ +use crate::core_crypto::prelude::UnsignedInteger; +use tfhe_hpu_backend::prelude::HpuParameters; + +/// This function change information position in value +/// Move information bits from MSB to LSB +pub fn msb2lsb(params: &HpuParameters, data: Scalar) -> Scalar { + let ct_width = params.ntt_params.ct_width as usize; + let storage_width = Scalar::BITS; + data >> (storage_width - ct_width) +} + +/// This function 
change information position in value +/// Move information bits from LSB to MSB +pub fn lsb2msb(params: &HpuParameters, data: Scalar) -> Scalar { + let ct_width = params.ntt_params.ct_width as usize; + let storage_width = Scalar::BITS; + data << (storage_width - ct_width) +} + +/// This function change information position in container +/// Move information bits from MSB to LSB +pub fn msb2lsb_align(params: &HpuParameters, data: &mut [Scalar]) { + let ct_width = params.ntt_params.ct_width as usize; + let storage_width = Scalar::BITS; + for val in data.iter_mut() { + *val >>= storage_width - ct_width; + } +} +/// This function change information position in container +/// Move information bits from LSB to MSB +pub fn lsb2msb_align(params: &HpuParameters, data: &mut [Scalar]) { + let ct_width = params.ntt_params.ct_width as usize; + let storage_width = Scalar::BITS; + for val in data.iter_mut() { + *val <<= storage_width - ct_width; + } +} diff --git a/tfhe/src/core_crypto/hpu/algorithms/order.rs b/tfhe/src/core_crypto/hpu/algorithms/order.rs new file mode 100644 index 000000000..66d57215e --- /dev/null +++ b/tfhe/src/core_crypto/hpu/algorithms/order.rs @@ -0,0 +1,118 @@ +//! +//! Define polynomial ordering +//! And associated function that enable to translate from one ordering to the others +//! +//! Ordering is useful in HW to expose common structure in the computation. +//! Both Ntt architecture used reverse order as input +//! However, Wmm use an intermediate Network required by the BSK shuffling. 
+ +use crate::core_crypto::prelude::{DecompositionBaseLog, DecompositionLevelCount, UnsignedInteger}; + +#[derive(Debug, Clone)] +pub struct RadixBasis { + radix_lg: DecompositionBaseLog, + digits_nb: DecompositionLevelCount, +} + +impl RadixBasis { + pub fn new(radix: usize, digits_nb: usize) -> Self { + let radix_lg = radix.ilog2() as usize; + Self { + radix_lg: DecompositionBaseLog(radix_lg), + digits_nb: DecompositionLevelCount(digits_nb), + } + } + + pub fn radix_lg(&self) -> DecompositionBaseLog { + self.radix_lg + } + pub fn digits_nb(&self) -> DecompositionLevelCount { + self.digits_nb + } + + /// Convert an index expressed in Natural Order into 'pdrev' Order + /// Generalized pseudo reverse is: + /// * Nat_order from 0..rank + /// * Rev_order from rank..digits + pub fn idx_pdrev(&self, digits: usize, rank: usize, nat_val: usize) -> usize { + let mask = (1 << ((digits - rank) * self.radix_lg.0)) - 1; + let to_be_reversed = (nat_val >> (rank * self.radix_lg.0)) & mask; + let reversed = Self::new(1 << self.radix_lg.0, digits - rank).idx_rev(to_be_reversed); + + let to_be_zeroed = nat_val & (mask << (rank * self.radix_lg.0)); + let mut result = nat_val & !to_be_zeroed; + result |= reversed << (rank * self.radix_lg.0); + + result + } + + /// Convert an index expressed in 'pdrev' Order into Natural Order + #[inline] + pub fn idx_pdrev_inv(&self, digits: usize, rank: usize, pdrev_val: usize) -> usize { + self.idx_pdrev(digits, rank, pdrev_val) + } + + /// Convert an index expressed in Natural Order into `reverse` Order + pub fn idx_rev(&self, mut nat_val: usize) -> usize { + let mask = (1 << self.radix_lg.0) - 1; + let mut result = 0; + for i in (0..self.digits_nb.0).rev() { + result |= (nat_val & mask) << (i * self.radix_lg.0); + nat_val >>= self.radix_lg.0; + } + + result + } + + /// Convert an index expressed in 'reverse' Order into Natural Order + #[inline] + pub fn idx_rev_inv(&mut self, pdrev_val: usize) -> usize { + self.idx_rev(pdrev_val) + } +} + +/// 
Utility function to shuffle a polynomial in a reverse order +pub fn poly_order(dst: &mut [Scalar], src: &[Scalar], rb_conv: &RadixBasis, f: F) +where + Scalar: UnsignedInteger, + F: Fn(Scalar) -> Scalar, +{ + assert_eq!(src.len(), dst.len(), "Poly src/ dst length mismtach"); + assert_eq!( + src.len(), + ((1 << rb_conv.radix_lg().0) as usize).pow(rb_conv.digits_nb().0 as u32), + "Poly length mismtach with RadixBasis configuration" + ); + + for (idx, v) in dst.iter_mut().enumerate() { + let src_idx = rb_conv.idx_rev(idx); + *v = f(src[src_idx]); + } +} + +#[derive(Debug, Clone)] +pub struct PcgNetwork { + stage_nb: usize, + rb_conv: RadixBasis, +} + +impl PcgNetwork { + /// Create network instance from NttParameters + pub fn new(radix: usize, stg_nb: usize) -> Self { + Self { + stage_nb: stg_nb, + rb_conv: RadixBasis::new(radix, stg_nb), + } + } + + /// For a given position idx (in 0..N-1), at processing step delta_idx, + /// find the corresponding position idx (consider the input of the node) + pub fn get_pos_id(&mut self, delta_idx: usize, pos_idx: usize) -> usize { + let node_idx = pos_idx / (1 << self.rb_conv.radix_lg().0); + let rmn_idx = pos_idx % (1 << self.rb_conv.radix_lg().0); + let pdrev_idx = self + .rb_conv + .idx_pdrev(self.stage_nb - 1, delta_idx, node_idx); + pdrev_idx * (1 << self.rb_conv.radix_lg().0) + rmn_idx + } +} diff --git a/tfhe/src/core_crypto/hpu/entities/glwe_ciphertext.rs b/tfhe/src/core_crypto/hpu/entities/glwe_ciphertext.rs new file mode 100644 index 000000000..86bd384ea --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/glwe_ciphertext.rs @@ -0,0 +1,32 @@ +//! Module containing the definition of the HpuGlweCiphertext. 
+ +use tfhe_hpu_backend::prelude::*; + +use super::algorithms::{modswitch, order}; +use crate::core_crypto::prelude::*; + +impl CreateFrom> + for HpuGlweCiphertextOwned +{ + type Metadata = HpuParameters; + fn create_from(cpu_glwe: GlweCiphertextView<'_, Scalar>, meta: Self::Metadata) -> Self { + let mut hpu_glwe = Self::new(Scalar::ZERO, meta.clone()); + + let ntt_p = &meta.ntt_params; + let pbs_p = &meta.pbs_params; + + // NB: Glwe polynomial must be in reversed order + let rb_conv = order::RadixBasis::new(ntt_p.radix, ntt_p.stg_nb); + + // Put glwe in reverse order and align on lsb + std::iter::zip( + hpu_glwe.as_mut().chunks_mut(pbs_p.polynomial_size), + cpu_glwe.as_polynomial_list().iter(), + ) + .for_each(|(hw, cpu)| { + order::poly_order(hw, cpu.into_container(), &rb_conv, |x| x); + modswitch::msb2lsb_align(&meta, hw); + }); + hpu_glwe + } +} diff --git a/tfhe/src/core_crypto/hpu/entities/glwe_lookuptable.rs b/tfhe/src/core_crypto/hpu/entities/glwe_lookuptable.rs new file mode 100644 index 000000000..ced40559e --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/glwe_lookuptable.rs @@ -0,0 +1,134 @@ +//! Module containing the definition of the HpuGlweLookuptable. 
+ +use hpu_asm::PbsLut; +use tfhe_hpu_backend::prelude::*; + +use super::algorithms::{modswitch, order}; +use crate::core_crypto::commons::traits::*; +use crate::core_crypto::entities::*; +use crate::core_crypto::prelude::{CiphertextModulus, GlweDimension, PolynomialSize}; + +impl CreateFrom> + for HpuGlweLookuptableOwned +{ + type Metadata = HpuParameters; + fn create_from(cpu_glwe: GlweCiphertextView<'_, Scalar>, meta: Self::Metadata) -> Self { + let mut hpu_lut = Self::new(Scalar::ZERO, meta.clone()); + let ntt_p = &meta.ntt_params; + + // NB: Glwe polynomial must be in reversed order + let rb_conv = order::RadixBasis::new(ntt_p.radix, ntt_p.stg_nb); + + // Put glwe in reverse order and align on lsb + // Only handle Body since Lut is encoded as trivial Glwe + order::poly_order( + hpu_lut.as_mut(), + cpu_glwe.get_body().as_polynomial().into_container(), + &rb_conv, + |x| x, + ); + modswitch::msb2lsb_align(&meta, hpu_lut.as_mut()); + hpu_lut + } +} + +impl From> for GlweCiphertextOwned { + fn from(hpu_lut: HpuGlweLookuptableView<'_, u64>) -> Self { + let hpu_p = hpu_lut.params(); + let pbs_p = hpu_p.pbs_params; + + let mut cpu_glwe = Self::new( + 0, + GlweDimension(pbs_p.glwe_dimension).to_glwe_size(), + PolynomialSize(pbs_p.polynomial_size), + CiphertextModulus::try_new_power_of_2(pbs_p.ciphertext_width) + .expect("Invalid ciphertext width"), + ); + // NB: GlweLut polynomial is in reversed order + let rb_conv = order::RadixBasis::new(hpu_p.ntt_params.radix, hpu_p.ntt_params.stg_nb); + + // Put HpuLut back in standard order and align on msb + order::poly_order( + cpu_glwe.get_mut_body().as_mut_polynomial().into_container(), + // hpu_lut.as_view().into_container(), + hpu_lut.as_ref(), + &rb_conv, + |x| x, + ); + modswitch::lsb2msb_align( + hpu_p, + cpu_glwe.get_mut_body().as_mut_polynomial().into_container(), + ); + cpu_glwe + } +} + +pub fn create_hpu_lookuptable( + params: HpuParameters, + pbs: &hpu_asm::Pbs, +) -> HpuGlweLookuptableOwned { + // Create Glwe + let 
pbs_p = params.pbs_params; + let mut cpu_acc = GlweCiphertext::new( + 0, + GlweDimension(pbs_p.glwe_dimension).to_glwe_size(), + PolynomialSize(pbs_p.polynomial_size), + CiphertextModulus::try_new_power_of_2(pbs_p.ciphertext_width) + .expect("Invalid ciphertext width"), + ); + + // Zeroed mask + let mut cpu_acc_view = cpu_acc.as_mut_view(); + cpu_acc_view.get_mut_mask().as_mut().fill(0); + + // Populate body + // Modulus of the msg contained in the msg bits and operations buffer + let modulus_sup = 1 << (pbs_p.message_width + pbs_p.carry_width); + + // N/(p/2) = size of each block + let box_size = pbs_p.polynomial_size / modulus_sup; + + // Value of the shift we multiply our messages by + // NB: Tfhe-rs always align information in MSB whatever power_of_two modulus is used + // This is why we compute the encoding delta based on container width instead of + // real modulus width + let encode = |x: Cleartext| { + let cleartext_and_padding_width = pbs_p.message_width + pbs_p.carry_width + 1; + let delta = 1 << (u64::BITS - cleartext_and_padding_width as u32); + Plaintext(x.0.wrapping_mul(delta)) + }; + + let mut body = cpu_acc_view.get_mut_body(); + let body_u64 = body.as_mut(); + + let digits_params = hpu_asm::DigitParameters { + msg_w: params.pbs_params.message_width, + carry_w: params.pbs_params.carry_width, + }; + + let lut_nb = pbs.lut_nb() as usize; + + let single_function_sub_lut_size = (modulus_sup / lut_nb) * box_size; + + for (pos, function_sub_lut) in body_u64 + .chunks_mut(single_function_sub_lut_size) + .enumerate() + { + for (msg_value, sub_lut_box) in function_sub_lut.chunks_exact_mut(box_size).enumerate() { + let function_eval = pbs.fn_at(pos, &digits_params, msg_value) as u64; + sub_lut_box.fill(encode(Cleartext(function_eval)).0); + } + } + + let half_box_size = box_size / 2; + + // Negate the first half_box_size coefficients + for a_i in body_u64[0..half_box_size].iter_mut() { + *a_i = (*a_i).wrapping_neg(); + } + + // Rotate the accumulator + 
body_u64.rotate_left(half_box_size); + + HpuGlweLookuptableOwned::create_from(cpu_acc.as_view(), params) +} diff --git a/tfhe/src/core_crypto/hpu/entities/lwe_bootstrap_key.rs b/tfhe/src/core_crypto/hpu/entities/lwe_bootstrap_key.rs new file mode 100644 index 000000000..3659c8aef --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/lwe_bootstrap_key.rs @@ -0,0 +1,384 @@ +//! Module containing the definition of the HpuLweBootstrapKey. + +use tfhe_hpu_backend::prelude::*; + +use super::algorithms::order; +use crate::core_crypto::prelude::*; + +impl CreateFrom> for HpuLweBootstrapKeyOwned { + type Metadata = HpuParameters; + fn create_from(cpu_bsk: LweBootstrapKey<&[u64]>, meta: Self::Metadata) -> Self { + // Convert the LweBootstrapKey in Ntt domain + let mut ntt_bsk = NttLweBootstrapKeyOwned::::new( + 0_u64, + cpu_bsk.input_lwe_dimension(), + cpu_bsk.glwe_size(), + cpu_bsk.polynomial_size(), + cpu_bsk.decomposition_base_log(), + cpu_bsk.decomposition_level_count(), + CiphertextModulus::new(u64::from(&meta.ntt_params.prime_modulus) as u128), + ); + + // Conversion to ntt domain + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &cpu_bsk, + &mut ntt_bsk, + NttLweBootstrapKeyOption::Raw, + ); + + Self::create_from(ntt_bsk.as_view(), meta) + } +} + +/// Shuffle BSK for GF64 Ntt architecture +/// This architecture doesn't use an internal network; however, input polynomials are in a custom +/// order and not the bit-reversed one +fn shuffle_gf64( + ntt_bsk: &NttLweBootstrapKeyView, + params: &HpuParameters, + cut_w: &[u8], +) -> HpuLweBootstrapKeyOwned { + let mut hpu_bsk = HpuLweBootstrapKeyOwned::::new(0_u64, params.clone()); + + // Extract params inner values for ease of writing + let ntt_p = &params.ntt_params; + let pbs_p = &params.pbs_params; + let glwe_n = pbs_p.polynomial_size; + let glwe_kp1 = pbs_p.glwe_dimension + 1; + let pbs_l = pbs_p.pbs_level; + + // Recursive function used to define the expected polynomial order + fn bsk_order(cut_w: &[u8]) -> Vec { + if cut_w.len() 
== 1 { + (0..2_usize.pow(cut_w[0] as u32)).collect::>() + } else { + let coefs_left = 2_usize.pow(cut_w[0] as u32); + let sub_order = bsk_order(&cut_w[1..]); + + (0..coefs_left) + .flat_map(|j| { + sub_order + .iter() + .map(|idx| coefs_left * idx + j) + .collect::>() + }) + .collect::>() + } + } + + // Compute Gf64 polynomial order based on cut_w + let mut gf64_order = bsk_order(cut_w); + // gf64_idx must be expressed in bitreverse (to compensate the fact that ntt output is in + // bitreverse) + let rb_conv = order::RadixBasis::new(2, cut_w.iter().sum::() as usize); + for x in gf64_order.iter_mut() { + *x = rb_conv.idx_rev(*x); + } + + let mut wr_idx = 0; + for ggsw in ntt_bsk.as_view().into_ggsw_iter() { + // Arch dependent iterations + for glwe_idx in 0..glwe_kp1 { + for stg_iter in 0..ntt_p.stg_iter(glwe_n) { + for g_idx in 0..glwe_kp1 { + for l_idx in 0..pbs_l { + let p_view = GgswIndex { + s_dim: g_idx, + lvl_dim: l_idx, + glwe_dim: glwe_idx, + } + .poly_view(&ggsw); + + for p in 0..ntt_p.psi { + for r in 0..ntt_p.radix { + let c_idx = + stg_iter * ntt_p.psi * ntt_p.radix + ntt_p.radix * p + r; + hpu_bsk[wr_idx] = p_view[gf64_order[c_idx]]; + wr_idx += 1; + } + } + } + } + } + } + } + hpu_bsk +} + +/// UnShuffle BSK for GF64 Ntt architecture +fn unshuffle_gf64( + hpu_bsk: &HpuLweBootstrapKeyView, + cut_w: &[u8], +) -> NttLweBootstrapKeyOwned { + // Extract params inner values for ease of writing + let params = hpu_bsk.params(); + let ntt_p = &params.ntt_params; + let pbs_p = &params.pbs_params; + let glwe_n = pbs_p.polynomial_size; + let glwe_kp1 = pbs_p.glwe_dimension + 1; + let pbs_l = pbs_p.pbs_level; + + let mut ntt_bsk = NttLweBootstrapKeyOwned::new( + 0, + LweDimension(pbs_p.lwe_dimension), + GlweDimension(pbs_p.glwe_dimension).to_glwe_size(), + PolynomialSize(pbs_p.polynomial_size), + DecompositionBaseLog(pbs_p.pbs_base_log), + DecompositionLevelCount(pbs_p.pbs_level), + CiphertextModulus::new(u64::from(&hpu_bsk.params().ntt_params.prime_modulus) as u128), + ); 
+ + // Recursive function used to define the expected polynomial order + fn bsk_order(cut_w: &[u8]) -> Vec { + if cut_w.len() == 1 { + (0..2_usize.pow(cut_w[0] as u32)).collect::>() + } else { + let coefs_left = 2_usize.pow(cut_w[0] as u32); + let sub_order = bsk_order(&cut_w[1..]); + + (0..coefs_left) + .flat_map(|j| { + sub_order + .iter() + .map(|idx| coefs_left * idx + j) + .collect::>() + }) + .collect::>() + } + } + + // Compute Gf64 polynomial order based on cut_w + let mut gf64_order = bsk_order(cut_w); + // gf64_idx must be expressed in bitreverse (to compensate the fact that ntt output is in + // bitreverse + let rb_conv = order::RadixBasis::new(2, cut_w.iter().sum::() as usize); + for x in gf64_order.iter_mut() { + *x = rb_conv.idx_rev(*x); + } + + let mut rd_idx = 0; + for mut ggsw in ntt_bsk.as_mut_view().into_ggsw_iter() { + // Arch dependant iterations + for glwe_idx in 0..glwe_kp1 { + for stg_iter in 0..ntt_p.stg_iter(glwe_n) { + for g_idx in 0..glwe_kp1 { + for l_idx in 0..pbs_l { + let p_view = GgswIndex { + s_dim: g_idx, + lvl_dim: l_idx, + glwe_dim: glwe_idx, + } + .poly_mut_view(&mut ggsw); + + for p in 0..ntt_p.psi { + for r in 0..ntt_p.radix { + let c_idx = + stg_iter * ntt_p.psi * ntt_p.radix + ntt_p.radix * p + r; + p_view[gf64_order[c_idx]] = hpu_bsk[rd_idx]; + rd_idx += 1; + } + } + } + } + } + } + } + + ntt_bsk +} + +/// Shuffle BSK for Wmm Ntt architecture +/// These architectures used a network internally +/// With those architecture, the structural order and the iteration order differe and required a +/// custom Bsk layout +fn shuffle_wmm( + ntt_bsk: &NttLweBootstrapKeyView, + params: &HpuParameters, +) -> HpuLweBootstrapKeyOwned { + let mut hpu_bsk = HpuLweBootstrapKeyOwned::::new(0_u64, params.clone()); + + // Extract params inner values for ease of writing + let ntt_p = ¶ms.ntt_params; + let pbs_p = ¶ms.pbs_params; + let glwe_n = pbs_p.polynomial_size; + let glwe_kp1 = pbs_p.glwe_dimension + 1; + let pbs_l = pbs_p.pbs_level; + + // 
NB: Ntt output polynomial in bit reverse order in Ntt domain + // Hw expect ntt in reverse order. + // We currently use keep value as is but this must be modified when + // arch radix != 2 + assert_eq!( + 2, ntt_p.radix, + "Error: With radix !=2 bsk must be converted from bit-reverse in radix-reverse order" + ); + + // Instantiate Ntt network + let mut ntw = order::PcgNetwork::new(ntt_p.radix, ntt_p.stg_nb); + + let mut wr_idx = 0; + for ggsw in ntt_bsk.as_view().into_ggsw_iter() { + // Arch dependant iterations + for glwe_idx in 0..glwe_kp1 { + for stg_iter in 0..ntt_p.stg_iter(glwe_n) { + for g_idx in 0..glwe_kp1 { + for l_idx in 0..pbs_l { + let p_view = GgswIndex { + s_dim: g_idx, + lvl_dim: l_idx, + glwe_dim: glwe_idx, + } + .poly_view(&ggsw); + + for p in 0..ntt_p.psi { + for r in 0..ntt_p.radix { + let c_idx = + stg_iter * ntt_p.psi * ntt_p.radix + ntt_p.radix * p + r; + let c_id = ntw.get_pos_id(ntt_p.ls_delta(), c_idx); + hpu_bsk[wr_idx] = p_view[c_id]; + wr_idx += 1; + } + } + } + } + } + } + } + hpu_bsk +} + +/// UnShuffle BSK for Wmm Ntt architecture +fn unshuffle_wmm(hpu_bsk: &HpuLweBootstrapKeyView) -> NttLweBootstrapKeyOwned { + // Extract params inner values for ease of writing + let params = hpu_bsk.params(); + let ntt_p = ¶ms.ntt_params; + let pbs_p = ¶ms.pbs_params; + let glwe_n = pbs_p.polynomial_size; + let glwe_kp1 = pbs_p.glwe_dimension + 1; + let pbs_l = pbs_p.pbs_level; + + let mut ntt_bsk = NttLweBootstrapKeyOwned::new( + 0, + LweDimension(pbs_p.lwe_dimension), + GlweDimension(pbs_p.glwe_dimension).to_glwe_size(), + PolynomialSize(pbs_p.polynomial_size), + DecompositionBaseLog(pbs_p.pbs_base_log), + DecompositionLevelCount(pbs_p.pbs_level), + CiphertextModulus::new(u64::from(&hpu_bsk.params().ntt_params.prime_modulus) as u128), + ); + + // Instantiate Ntt network + let mut ntw = order::PcgNetwork::new(ntt_p.radix, ntt_p.stg_nb); + + let mut rd_idx = 0; + for mut ggsw in ntt_bsk.as_mut_view().into_ggsw_iter() { + // Arch dependant 
iterations + for glwe_idx in 0..glwe_kp1 { + for stg_iter in 0..ntt_p.stg_iter(glwe_n) { + for g_idx in 0..glwe_kp1 { + for l_idx in 0..pbs_l { + let p_view = GgswIndex { + s_dim: g_idx, + lvl_dim: l_idx, + glwe_dim: glwe_idx, + } + .poly_mut_view(&mut ggsw); + + for p in 0..ntt_p.psi { + for r in 0..ntt_p.radix { + let c_idx = + stg_iter * ntt_p.psi * ntt_p.radix + ntt_p.radix * p + r; + let c_id = ntw.get_pos_id(ntt_p.ls_delta(), c_idx); + p_view[c_id] = hpu_bsk[rd_idx]; + rd_idx += 1; + } + } + } + } + } + } + } + ntt_bsk +} + +/// Uploading BSK on HW required custom polynomial interleaving. +/// The following structure enable OutOfOrder access of GGSW polynomial to ease +/// the interleaving description +pub struct GgswIndex { + pub s_dim: usize, + pub lvl_dim: usize, + pub glwe_dim: usize, +} + +impl GgswIndex { + /// Ease out of order iteration over a Ggsw ciphertext. + /// This is useful for Bootstrapping key shuffling to match expected HW + /// order + pub fn poly_view<'a, Scalar: UnsignedInteger>( + self, + ggsw: &'a NttGgswCiphertextView, + ) -> &'a [Scalar] { + let decomp_level = ggsw.decomposition_level_count().0; + let row_cnt = ggsw.glwe_size().0; + let poly_cnt = ggsw.glwe_size().0; + + ggsw.as_ref() + .split_into(decomp_level) + .nth(self.lvl_dim) + .unwrap() + .split_into(row_cnt) + .nth(self.s_dim) + .unwrap() + .split_into(poly_cnt) + .nth(self.glwe_dim) + .unwrap() + } + + /// Ease out of order iteration over a mutable Ggsw ciphertext. 
+ /// This is useful for Bootstrapping key shuffling to match expected HW + /// order + pub fn poly_mut_view<'a, Scalar: UnsignedInteger>( + self, + ggsw: &'a mut NttGgswCiphertextMutView, + ) -> &'a mut [Scalar] { + let decomp_level = ggsw.decomposition_level_count().0; + let row_cnt = ggsw.glwe_size().0; + let poly_cnt = ggsw.glwe_size().0; + + ggsw.as_mut() + .split_into(decomp_level) + .nth(self.lvl_dim) + .unwrap() + .split_into(row_cnt) + .nth(self.s_dim) + .unwrap() + .split_into(poly_cnt) + .nth(self.glwe_dim) + .unwrap() + } +} + +impl<'a> CreateFrom> for HpuLweBootstrapKeyOwned { + type Metadata = HpuParameters; + fn create_from(cpu_bsk: NttLweBootstrapKeyView<'a, u64>, meta: Self::Metadata) -> Self { + match meta.ntt_params.core_arch.clone() { + // Shuffle required by GF64 Ntt without internal network + HpuNttCoreArch::GF64(cut_w) => shuffle_gf64(&cpu_bsk, &meta, &cut_w), + // Legacy shuffle required by WmmNtt with internal network + HpuNttCoreArch::WmmCompactPcg | HpuNttCoreArch::WmmUnfoldPcg => { + shuffle_wmm(&cpu_bsk, &meta) + } + } + } +} + +impl<'a> From> for NttLweBootstrapKeyOwned { + fn from(hpu_bsk: HpuLweBootstrapKeyView<'a, u64>) -> Self { + match hpu_bsk.params().ntt_params.core_arch.clone() { + // Shuffle required by GF64 Ntt without internal network + HpuNttCoreArch::GF64(cut_w) => unshuffle_gf64(&hpu_bsk, &cut_w), + // Legacy shuffle required by WmmNtt with internal network + HpuNttCoreArch::WmmCompactPcg | HpuNttCoreArch::WmmUnfoldPcg => unshuffle_wmm(&hpu_bsk), + } + } +} diff --git a/tfhe/src/core_crypto/hpu/entities/lwe_ciphertext.rs b/tfhe/src/core_crypto/hpu/entities/lwe_ciphertext.rs new file mode 100644 index 000000000..0b055dba2 --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/lwe_ciphertext.rs @@ -0,0 +1,86 @@ +//! Module containing the definition of the HpuLweCiphertext conversion traits. +//! +//! NB: LweCiphertext need to be: +//! * Sent to Hw -> Conversion from Cpu world to Hpu World +//! 
* Retrieved from Hw -> Conversion from Hpu world to Cpu World + +use tfhe_hpu_backend::prelude::*; + +use super::algorithms::{modswitch, order}; +use crate::core_crypto::commons::parameters::*; +use crate::core_crypto::commons::traits::*; +use crate::core_crypto::entities::*; + +impl CreateFrom> + for HpuLweCiphertextOwned +{ + type Metadata = HpuParameters; + fn create_from(cpu_lwe: LweCiphertextView<'_, Scalar>, meta: Self::Metadata) -> Self { + let mut hpu_lwe = Self::new(Scalar::ZERO, meta.clone()); + let ntt_p = &meta.ntt_params; + let pbs_p = &meta.pbs_params; + let poly_size = pbs_p.polynomial_size; + + // NB: lwe mask is view as polynomial and must be in reversed order + // Allocate translation buffer and reversed vector here + let rb_conv = order::RadixBasis::new(ntt_p.radix, ntt_p.stg_nb); + let lwe_len = hpu_lwe.len(); + // Copy lwe mask in reverse order and update alignment + cpu_lwe + .get_mask() + .as_ref() + .chunks(poly_size) + .enumerate() + .for_each(|(pid, poly)| { + for idx in 0..poly_size { + let dst_idx = pid * poly_size + idx; + let src_poly_idx = rb_conv.idx_rev(idx); + hpu_lwe[dst_idx] = modswitch::msb2lsb(&meta, poly[src_poly_idx]); + } + }); + // Add body + hpu_lwe[lwe_len - 1] = modswitch::msb2lsb(&meta, *cpu_lwe.get_body().data); + + hpu_lwe + } +} + +#[allow(clippy::fallible_impl_from)] +impl From> + for LweCiphertextOwned +{ + fn from(hpu_lwe: HpuLweCiphertextView<'_, Scalar>) -> Self { + // NB: HPU only handle Big Lwe over it's boundaries + let ntt_p = &hpu_lwe.params().ntt_params; + let pbs_p = &hpu_lwe.params().pbs_params; + let poly_size = pbs_p.polynomial_size; + + let mut cpu_lwe = Self::new( + Scalar::ZERO, + LweSize(hpu_lwe.len()), + CiphertextModulus::try_new_power_of_2(pbs_p.ciphertext_width).unwrap(), + ); + + // Reverse Glwe back to natural order + // Allocate translation buffer and reversed vector here + let rb_conv = order::RadixBasis::new(ntt_p.radix, ntt_p.stg_nb); + let lwe_len = hpu_lwe.len(); + // Copy lwe mask in 
reverse order and update alignment + cpu_lwe + .get_mut_mask() + .as_mut() + .chunks_mut(poly_size) + .enumerate() + .for_each(|(pid, poly)| { + for (idx, coeff) in poly.iter_mut().enumerate().take(poly_size) { + let src_poly_idx = rb_conv.idx_rev(idx); + let src_idx = pid * poly_size + src_poly_idx; + *coeff = modswitch::lsb2msb(hpu_lwe.params(), hpu_lwe[src_idx]); + } + }); + // Add body + *cpu_lwe.get_mut_body().data = modswitch::lsb2msb(hpu_lwe.params(), hpu_lwe[lwe_len - 1]); + + cpu_lwe + } +} diff --git a/tfhe/src/core_crypto/hpu/entities/lwe_keyswitch_key.rs b/tfhe/src/core_crypto/hpu/entities/lwe_keyswitch_key.rs new file mode 100644 index 000000000..7771606f3 --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/lwe_keyswitch_key.rs @@ -0,0 +1,262 @@ +//! Module containing the definition of the HpuGlweCiphertext. + +use tfhe_hpu_backend::prelude::*; + +use super::algorithms::order; +use crate::core_crypto::prelude::*; + +impl CreateFrom> for HpuLweKeyswitchKeyOwned +where + Scalar: UnsignedInteger + CastInto, +{ + type Metadata = HpuParameters; + fn create_from(cpu_ksk: LweKeyswitchKeyView<'_, Scalar>, meta: Self::Metadata) -> Self { + let mut hpu_ksk = Self::new(0, meta.clone()); + + // Allocate radix_basis converter + let rb_conv = order::RadixBasis::new(meta.ntt_params.radix, meta.ntt_params.stg_nb); + + // Extract params inner values for ease of writing + let pbs_p = &meta.pbs_params; + let lwe_k = pbs_p.lwe_dimension; + let glwe_k = pbs_p.glwe_dimension; + let glwe_n = pbs_p.polynomial_size; + let ks_p = &meta.ks_params; + + // View KsK as a polyhedral with rectangles faces. + // Front face is a rectangle of size (N*Glwe_k)x(Lwe_k + 1) + // Depth is ksk_level + // -------------- + // ksk / / / + // lvl / / / | + // -------------- | Y Z + // |s|lwe_k +1 | | | / + //glwe_k|l| | | | / + // * N |i| | / |/ |c| |/ / ------> X -e------------ + // + // Ksk is sliced in one slot face over x. + // This slice is then decomposed in rectancles lby*lbz. 
+ // These rectangle are iterated in natural order. + // Within this rectangle lbZ coefs are merged in one 64b coefs + // and iterated over y dim. + // Furthermore it's possible that ksk polyhedron isn't a multiple of lbx/lby/lbz. + // Incomplete rectangle are then extend with xx and iterate as usual + + let mut hw_idx = 0; + for outer_x in (0..lwe_k + 1).step_by(ks_p.lbx) { + for inner_x in 0..ks_p.lbx { + // -> Iterate over Slices + let raw_x = outer_x + inner_x; + let abs_x = if raw_x < (lwe_k + 1) { + Some(raw_x) + } else { + None + }; + + for outer_y in (0..(glwe_k * glwe_n)).step_by(ks_p.lby) { + for outer_z in (0..pbs_p.ks_level).step_by(ks_p.lbz) { + // -> Iterate over rectangles lby*lbz + for inner_y in 0..ks_p.lby { + let raw_y = outer_y + inner_y; + let abs_y = if raw_y < (glwe_k * glwe_n) { + // Hw-order expect y-dim to be in bitreverse + // Compute it inflight + // NB: raw_y represent the index over Y in [0; glwe_k*glwe_n] and + // the bitreverse must be only + // applied over glwe_n + // -> split raw_y in poly_y, coef_y and bitreverse only the coef_y + let poly_y = raw_y / glwe_n; + let coef_y = raw_y % glwe_n; + let brev_coef_y = rb_conv.idx_rev(coef_y); + let abs_y = poly_y * glwe_n + brev_coef_y; + Some(abs_y) + } else { + None + }; + + let pack_z: u64 = (0..ks_p.lbz).fold(0, |acc, inner_z| { + let raw_z = outer_z + inner_z; + let abs_z = if raw_z < pbs_p.ks_level { + Some(raw_z) + } else { + None + }; + let cur_coef = match (abs_x, abs_y, abs_z) { + (Some(x), Some(y), Some(z)) => { + *KskIndex { x, y, z }.coef_view(&cpu_ksk) + } + _ => Scalar::ZERO, /* At least one dimension overflow + * -> return 0 */ + }; + // NB: In Sw, the information is kept in MSB, but Hw required them + // in LSB Handle bit alignment + let coef_ralign = { + let coef_orig: u64 = cur_coef.cast_into(); + coef_orig >> (Scalar::BITS - ks_p.width) + }; + // println!("@{inner_z} => 0x{acc:x} [0x{coef_rounded_ralign:x}]"); + acc + (coef_ralign << (inner_z * ks_p.width)) + }); + 
hpu_ksk[hw_idx] = pack_z; + hw_idx += 1; + } + } + } + } + } + hpu_ksk + } +} + +/// Shuffling KSK in HW order required custom coefs interleaving. +/// The following structure enable OutOfOrder access of KSK coefs to ease +/// the interleaving description +/// Abstract tfhe-rs view from hw view (i.e polyhedron) +#[derive(Debug)] +struct KskIndex { + pub x: usize, + pub y: usize, + pub z: usize, +} + +impl KskIndex { + /// Ease out of order iteration over a ksk coefs. + fn coef_view<'a, Scalar: UnsignedInteger>( + self, + ksk: &'a LweKeyswitchKeyView, + ) -> &'a Scalar { + let decomp_level = ksk.decomposition_level_count().0; + let in_lwe_elem = ksk.input_key_lwe_dimension().0; + // NB: Decomposition is in reverse order in tfhe-rs (i.e MSB to LSB) + // -> However, inversion is already handled during keyswitching key generation + // Ksk coefs is order as follow (from outer dim to inner dim): + // * input_lwe_key_dim + // * decomp_lvl + // * out_lwe_key_size + &ksk.as_ref() + .split_into(in_lwe_elem) + .nth(self.y) + .unwrap() + .split_into(decomp_level) + .nth(self.z) + .unwrap()[self.x] + } + + /// Ease out of order mutable iteration over a ksk coefs. 
+ fn coef_mut_view<'a, Scalar: UnsignedInteger>( + self, + ksk: &'a mut LweKeyswitchKeyMutView, + ) -> &'a mut Scalar { + let decomp_level = ksk.decomposition_level_count().0; + let in_lwe_elem = ksk.input_key_lwe_dimension().0; + // NB: Decomposition is in reverse order in tfhe-rs (i.e MSB to LSB) + // -> However, inversion is already handled during keyswitching key generation + // Ksk coefs is order as follow (from outer dim to inner dim): + // * input_lwe_key_dim + // * decomp_lvl + // * out_lwe_key_size + &mut ksk + .as_mut() + .split_into(in_lwe_elem) + .nth(self.y) + .unwrap() + .split_into(decomp_level) + .nth(self.z) + .unwrap()[self.x] + } +} + +impl<'a, Scalar> From> for LweKeyswitchKeyOwned +where + Scalar: UnsignedInteger + CastFrom, +{ + fn from(hpu_ksk: HpuLweKeyswitchKeyView<'a, u64>) -> Self { + let pbs_p = &hpu_ksk.params().pbs_params; + let ks_p = &hpu_ksk.params().ks_params; + + let mut cpu_ksk = Self::new( + Scalar::ZERO, + DecompositionBaseLog(pbs_p.ks_base_log), + DecompositionLevelCount(pbs_p.ks_level), + LweDimension(pbs_p.glwe_dimension * pbs_p.polynomial_size), + LweDimension(pbs_p.lwe_dimension), + CiphertextModulus::new(1_u128 << ks_p.width), + ); + + // Unshuffle Keyswitch key from Hw order to Cpu order + + // Allocate radix_basis converter + let params = hpu_ksk.params(); + let rb_conv = order::RadixBasis::new(params.ntt_params.radix, params.ntt_params.stg_nb); + + // Extract params inner values for ease of writing + let pbs_p = ¶ms.pbs_params; + let lwe_k = pbs_p.lwe_dimension; + let glwe_k = pbs_p.glwe_dimension; + let glwe_n = pbs_p.polynomial_size; + let ks_p = ¶ms.ks_params; + + // Revert transformation made in FromWith + let mut hw_idx = 0; + for outer_x in (0..lwe_k + 1).step_by(ks_p.lbx) { + for inner_x in 0..ks_p.lbx { + // -> Iterate over Slices + let raw_x = outer_x + inner_x; + let abs_x = if raw_x < (lwe_k + 1) { + Some(raw_x) + } else { + None + }; + + for outer_y in (0..(glwe_k * glwe_n)).step_by(ks_p.lby) { + for 
outer_z in (0..pbs_p.ks_level).step_by(ks_p.lbz) { + // -> Iterate over rectangles lby*lbz + for inner_y in 0..ks_p.lby { + let raw_y = outer_y + inner_y; + let abs_y = if raw_y < (glwe_k * glwe_n) { + // Hw-order expect y-dim to be in bitreverse + // Compute it inflight + // NB: raw_y represent the index over Y in [0; glwe_k*glwe_n] and + // the bitreverse must be only + // applied over glwe_n + // -> split raw_y in poly_y, coef_y and bitreverse only the coef_y + let poly_y = raw_y / glwe_n; + let coef_y = raw_y % glwe_n; + let brev_coef_y = rb_conv.idx_rev(coef_y); + let abs_y = poly_y * glwe_n + brev_coef_y; + Some(abs_y) + } else { + None + }; + + // Unpack over Z dimension + (0..ks_p.lbz).for_each(|inner_z| { + let raw_z = outer_z + inner_z; + let abs_z = if raw_z < pbs_p.ks_level { + Some(raw_z) + } else { + None + }; + + if let (Some(x), Some(y), Some(z)) = (abs_x, abs_y, abs_z) { + let mut cpu_ksk_view = cpu_ksk.as_mut_view(); + let cpu_coef = + KskIndex { x, y, z }.coef_mut_view(&mut cpu_ksk_view); + let hpu_val = (hpu_ksk[hw_idx] >> (inner_z * ks_p.width)) + & ((1_u64 << ks_p.width) - 1); + // Cpu expect value MSB Align + *cpu_coef = + Scalar::cast_from(hpu_val << (Scalar::BITS - ks_p.width)); + } + // Otherwise, at least one dimension overflow, it's padded with 0 in + // the Hw view => Skipped + }); + hw_idx += 1; + } + } + } + } + } + cpu_ksk + } +} diff --git a/tfhe/src/core_crypto/hpu/entities/mod.rs b/tfhe/src/core_crypto/hpu/entities/mod.rs new file mode 100644 index 000000000..c5af582af --- /dev/null +++ b/tfhe/src/core_crypto/hpu/entities/mod.rs @@ -0,0 +1,10 @@ +use super::algorithms; + +// Export tfhe-hpu-backend type for use external crate +pub use tfhe_hpu_backend::prelude::*; + +pub mod glwe_ciphertext; +pub mod glwe_lookuptable; +pub mod lwe_bootstrap_key; +pub mod lwe_ciphertext; +pub mod lwe_keyswitch_key; diff --git a/tfhe/src/core_crypto/hpu/mod.rs b/tfhe/src/core_crypto/hpu/mod.rs new file mode 100644 index 000000000..59bb298be --- 
/dev/null +++ b/tfhe/src/core_crypto/hpu/mod.rs @@ -0,0 +1,3 @@ +pub mod algorithms; +pub mod entities; +pub use entities::*; diff --git a/tfhe/src/core_crypto/mod.rs b/tfhe/src/core_crypto/mod.rs index a15ef7c01..aaeb6aa58 100644 --- a/tfhe/src/core_crypto/mod.rs +++ b/tfhe/src/core_crypto/mod.rs @@ -20,6 +20,10 @@ pub mod fft_impl; #[cfg(feature = "gpu")] pub mod gpu; + +#[cfg(feature = "hpu")] +pub mod hpu; + #[cfg(test)] pub mod keycache; diff --git a/tfhe/src/high_level_api/array/dynamic/booleans.rs b/tfhe/src/high_level_api/array/dynamic/booleans.rs index ff8354105..3f3b0ade8 100644 --- a/tfhe/src/high_level_api/array/dynamic/booleans.rs +++ b/tfhe/src/high_level_api/array/dynamic/booleans.rs @@ -113,6 +113,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } @@ -140,6 +144,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } @@ -174,6 +182,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } diff --git a/tfhe/src/high_level_api/array/dynamic/signed.rs b/tfhe/src/high_level_api/array/dynamic/signed.rs index 8f3d7b800..4bd0831ce 100644 --- a/tfhe/src/high_level_api/array/dynamic/signed.rs +++ b/tfhe/src/high_level_api/array/dynamic/signed.rs @@ -195,6 +195,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", 
crate::high_level_api::errors::UninitializedServerKey); } @@ -222,6 +226,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } @@ -346,6 +354,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } diff --git a/tfhe/src/high_level_api/array/dynamic/unsigned.rs b/tfhe/src/high_level_api/array/dynamic/unsigned.rs index 149e03706..b8d6eae10 100644 --- a/tfhe/src/high_level_api/array/dynamic/unsigned.rs +++ b/tfhe/src/high_level_api/array/dynamic/unsigned.rs @@ -202,6 +202,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } @@ -229,6 +233,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } @@ -353,6 +361,10 @@ where Some(Device::CudaGpu) => { panic!("Not supported by Cuda devices") } + #[cfg(feature = "hpu")] + Some(Device::Hpu) => { + panic!("Not supported by Hpu devices") + } None => { panic!("{}", crate::high_level_api::errors::UninitializedServerKey); } diff --git a/tfhe/src/high_level_api/array/mod.rs b/tfhe/src/high_level_api/array/mod.rs index fb46e3e4f..8099e1ea0 100644 --- a/tfhe/src/high_level_api/array/mod.rs +++ b/tfhe/src/high_level_api/array/mod.rs @@ -382,6 +382,10 @@ pub fn fhe_uint_array_eq(lhs: &[FheUint], rhs: &[FheUint] let result = 
gpu_key.key.key.all_eq_slices(&tmp_lhs, &tmp_rhs, streams); FheBool::new(result, gpu_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support Array yet.") + } }) } @@ -422,6 +426,10 @@ pub fn fhe_uint_array_contains_sub_slice( .contains_sub_slice(&tmp_lhs, &tmp_pattern, streams); FheBool::new(result, gpu_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support Array yet.") + } }) } @@ -461,6 +469,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda does not support FheBool dot product") } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support FheBool dot product") + } }) } diff --git a/tfhe/src/high_level_api/booleans/base.rs b/tfhe/src/high_level_api/booleans/base.rs index db05aac24..523e0b2ab 100644 --- a/tfhe/src/high_level_api/booleans/base.rs +++ b/tfhe/src/high_level_api/booleans/base.rs @@ -25,6 +25,11 @@ use std::borrow::Borrow; use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign}; use tfhe_versionable::Versionize; +#[cfg(feature = "hpu")] +use crate::integer::hpu::ciphertext::HpuRadixCiphertext; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::*; + /// The FHE boolean data type. 
/// /// # Example @@ -122,6 +127,8 @@ impl FheBool { InnerBoolean::Cpu(ct) => ct.into_raw_parts(), #[cfg(feature = "gpu")] InnerBoolean::Cuda(_) => unreachable!(), + #[cfg(feature = "hpu")] + InnerBoolean::Hpu(_) => unreachable!(), } } @@ -225,6 +232,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda does not support if_then_else with clear input") } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support if_then_else with clear input") + } }) } } @@ -272,6 +283,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda does not support if_then_else with clear input") } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support if_then_else with clear input") + } }) } } @@ -319,6 +334,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda does not support if_then_else with clear input") } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support if_then_else with clear input") + } }) } } @@ -366,6 +385,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda does not support if_then_else with clear input") } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support if_then_else with clear input") + } }) } } @@ -395,6 +418,10 @@ impl ScalarIfThenElse<&Self, &Self> for FheBool { let boolean_inner = CudaBooleanBlock(inner); (InnerBoolean::Cuda(boolean_inner), cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support if_then_else with clear input") + } }); Self::new(ciphertext, tag) } @@ -432,6 +459,30 @@ where FheUint::new(inner, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_then = ct_then.ciphertext.on_hpu(device); + let hpu_else = ct_else.ciphertext.on_hpu(device); + let hpu_cond = self.ciphertext.on_hpu(device); + + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_IF_THEN_ELSE; + ( + 
asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_then.clone(), hpu_else.clone(), hpu_cond.clone()], + &[], + ) + .pop() + .unwrap(); + FheUint::new(hpu_result, device.tag.clone()) + } }) } } @@ -465,6 +516,10 @@ impl IfThenElse> for FheBool { FheInt::new(inner, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support signed integers") + } }) } } @@ -492,6 +547,10 @@ impl IfThenElse for FheBool { let boolean_inner = CudaBooleanBlock(inner); (InnerBoolean::Cuda(boolean_inner), cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bool if then else") + } }); Self::new(ciphertext, tag) } @@ -550,6 +609,10 @@ where let ciphertext = InnerBoolean::Cuda(inner); Self::new(ciphertext, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support FheBool::eq") + } }) } @@ -592,6 +655,10 @@ where let ciphertext = InnerBoolean::Cuda(inner); Self::new(ciphertext, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support FheBool::ne") + } }) } } @@ -636,6 +703,10 @@ impl FheEq for FheBool { ); (InnerBoolean::Cuda(inner), cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support FheBool::eq with a bool") + } }); Self::new(ciphertext, tag) } @@ -679,6 +750,10 @@ impl FheEq for FheBool { ); (InnerBoolean::Cuda(inner), cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support FheBool::ne with a bool") + } }); Self::new(ciphertext, tag) } @@ -759,6 +834,10 @@ where cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitand (&)") + } }); FheBool::new(ciphertext, tag) } @@ -843,6 +922,10 @@ where cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitor (|)") + } }); FheBool::new(ciphertext, tag) } @@ -927,6 +1010,10 @@ where cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitxor (^)") + } }); FheBool::new(ciphertext, tag) } @@ -1003,6 +1090,10 @@ impl BitAnd for &FheBool { cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("hpu does not bitand (&) with a bool") + } }); FheBool::new(ciphertext, tag) } @@ -1079,6 +1170,10 @@ impl BitOr for &FheBool { cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("hpu does not bitor (|) with a bool") + } }); FheBool::new(ciphertext, tag) } @@ -1155,6 +1250,10 @@ impl BitXor for &FheBool { cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("hpu does not bitxor (^) with a bool") + } }); FheBool::new(ciphertext, tag) } @@ -1353,6 +1452,10 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitand assign (&=)") + } }); } } @@ -1396,6 +1499,10 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitor assign (|=)") + } }); } } @@ -1439,6 +1546,10 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitxor assign (^=)") + } }); } } @@ -1476,6 +1587,10 @@ impl BitAndAssign for FheBool { streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitand assign (&=) with a bool") + } }); } } @@ -1513,6 +1628,10 @@ impl BitOrAssign for 
FheBool { streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitor assign (|=) with a bool") + } }); } } @@ -1550,6 +1669,10 @@ impl BitXorAssign for FheBool { streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitor assign (^=) with a bool") + } }); } } @@ -1619,6 +1742,10 @@ impl std::ops::Not for &FheBool { cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitnot (!)") + } }); FheBool::new(ciphertext, tag) } diff --git a/tfhe/src/high_level_api/booleans/encrypt.rs b/tfhe/src/high_level_api/booleans/encrypt.rs index c339a2b8d..696f707ec 100644 --- a/tfhe/src/high_level_api/booleans/encrypt.rs +++ b/tfhe/src/high_level_api/booleans/encrypt.rs @@ -101,6 +101,10 @@ impl FheTryTrivialEncrypt for FheBool { )); (ct, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support trivial encryption") + } }); Ok(Self::new(ciphertext, tag)) } diff --git a/tfhe/src/high_level_api/booleans/inner.rs b/tfhe/src/high_level_api/booleans/inner.rs index 49b22edb3..56bbc74de 100644 --- a/tfhe/src/high_level_api/booleans/inner.rs +++ b/tfhe/src/high_level_api/booleans/inner.rs @@ -12,11 +12,22 @@ use crate::Device; use serde::{Deserializer, Serializer}; use tfhe_versionable::{Unversionize, UnversionizeError, Versionize, VersionizeOwned}; +#[cfg(feature = "hpu")] +use crate::high_level_api::keys::HpuTaggedDevice; +#[cfg(feature = "gpu")] +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; +#[cfg(feature = "gpu")] +use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; +#[cfg(feature = "hpu")] +use crate::integer::hpu::ciphertext::HpuRadixCiphertext; + /// Enum that manages the current inner representation of a boolean. 
pub(in crate::high_level_api) enum InnerBoolean { Cpu(BooleanBlock), #[cfg(feature = "gpu")] Cuda(crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock), + #[cfg(feature = "hpu")] + Hpu(HpuRadixCiphertext), } impl Clone for InnerBoolean { @@ -27,6 +38,8 @@ impl Clone for InnerBoolean { Self::Cuda(inner) => { with_thread_local_cuda_streams(|streams| Self::Cuda(inner.duplicate(streams))) } + #[cfg(feature = "hpu")] + Self::Hpu(inner) => Self::Hpu(inner.clone()), } } } @@ -39,6 +52,8 @@ impl serde::Serialize for InnerBoolean { Self::Cpu(cpu_ct) => cpu_ct.serialize(serializer), #[cfg(feature = "gpu")] Self::Cuda(_) => self.on_cpu().serialize(serializer), + #[cfg(feature = "hpu")] + Self::Hpu(_) => self.on_cpu().serialize(serializer), } } } @@ -57,9 +72,7 @@ impl<'de> serde::Deserialize<'de> for InnerBoolean { // Only CPU data are serialized so we only versionize the CPU type. #[derive(serde::Serialize, serde::Deserialize)] #[cfg_attr(dylint_lib = "tfhe_lints", allow(serialize_without_versionize))] -pub(crate) struct InnerBooleanVersionOwned( - ::VersionedOwned, -); +pub(crate) struct InnerBooleanVersionOwned(::VersionedOwned); impl Versionize for InnerBoolean { type Versioned<'vers> = InnerBooleanVersionedOwned; @@ -85,7 +98,7 @@ impl Unversionize for InnerBoolean { fn unversionize(versioned: Self::VersionedOwned) -> Result { match versioned { InnerBooleanVersionedOwned::V0(v0) => { - let mut unversioned = Self::Cpu(crate::integer::BooleanBlock::unversionize(v0.0)?); + let mut unversioned = Self::Cpu(BooleanBlock::unversionize(v0.0)?); unversioned.move_to_device_of_server_key_if_set(); Ok(unversioned) } @@ -106,12 +119,21 @@ impl From for } } +#[cfg(feature = "hpu")] +impl From for InnerBoolean { + fn from(value: HpuRadixCiphertext) -> Self { + Self::Hpu(value) + } +} + impl InnerBoolean { pub(crate) fn current_device(&self) -> Device { match self { Self::Cpu(_) => Device::Cpu, #[cfg(feature = "gpu")] Self::Cuda(_) => Device::CudaGpu, + #[cfg(feature = 
"hpu")] + Self::Hpu(_) => Device::Hpu, } } @@ -126,6 +148,8 @@ impl InnerBoolean { MaybeCloned::Cloned(ct.to_boolean_block(streams)) }) } + #[cfg(feature = "hpu")] + Self::Hpu(ct) => MaybeCloned::Cloned(ct.to_boolean_block()), } } @@ -135,32 +159,38 @@ impl InnerBoolean { pub(crate) fn on_gpu( &self, streams: &CudaStreams, - ) -> MaybeCloned<'_, crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext> { - match self { - Self::Cpu(ct) => with_thread_local_cuda_streams(|streams| { - let ct_as_radix = crate::integer::RadixCiphertext::from(vec![ct.0.clone()]); - let cuda_ct = - crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext::from_radix_ciphertext( - &ct_as_radix, - streams, - ); - MaybeCloned::Cloned(cuda_ct) - }), - #[cfg(feature = "gpu")] - Self::Cuda(ct) => { - if ct.gpu_indexes() == streams.gpu_indexes() { - MaybeCloned::Borrowed(ct.as_ref()) - } else { - MaybeCloned::Cloned(ct.duplicate(streams).0) + ) -> MaybeCloned<'_, CudaUnsignedRadixCiphertext> { + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_radix = match self { + Self::Cuda(gpu_radix) => { + if gpu_radix.gpu_indexes() == streams.gpu_indexes() { + return MaybeCloned::Borrowed(&gpu_radix.0); } + return MaybeCloned::Cloned(gpu_radix.duplicate(streams).0); } - } + _ => self.on_cpu(), + }; + + let gpu_radix = CudaBooleanBlock::from_boolean_block(&cpu_radix, streams); + MaybeCloned::Cloned(gpu_radix.0) + } + + #[cfg(feature = "hpu")] + pub(crate) fn on_hpu(&self, device: &HpuTaggedDevice) -> MaybeCloned<'_, HpuRadixCiphertext> { + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_radix = match self { + Self::Hpu(hpu_radix) => return MaybeCloned::Borrowed(hpu_radix), + _ => self.on_cpu(), + }; + + let hpu_ct = HpuRadixCiphertext::from_boolean_ciphertext(&cpu_radix, &device.device); + MaybeCloned::Cloned(hpu_ct) } pub(crate) fn as_cpu_mut(&mut self) -> &mut BooleanBlock { match self { Self::Cpu(block) => block, - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", 
feature = "hpu"))] _ => { self.move_to_device(Device::Cpu); self.as_cpu_mut() @@ -170,84 +200,94 @@ impl InnerBoolean { #[cfg(feature = "gpu")] #[track_caller] - pub(crate) fn as_gpu_mut( - &mut self, - streams: &CudaStreams, - ) -> &mut crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext { + pub(crate) fn as_gpu_mut(&mut self, streams: &CudaStreams) -> &mut CudaUnsignedRadixCiphertext { use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; + let cpu_radix = if let Self::Cuda(cuda_ct) = self { + if cuda_ct.gpu_indexes() != streams.gpu_indexes() { + *cuda_ct = cuda_ct.duplicate(streams); + } + return &mut cuda_ct.0; + } else { + self.on_cpu() + }; + + let cuda_ct = CudaBooleanBlock::from_boolean_block(&cpu_radix, streams); + *self = Self::Cuda(cuda_ct); + let Self::Cuda(cuda_ct) = self else { + unreachable!() + }; + &mut cuda_ct.0 + } + + #[cfg(feature = "gpu")] + pub(crate) fn into_cpu(self) -> BooleanBlock { match self { - Self::Cpu(cpu_ct) => { - let ct_as_radix = crate::integer::RadixCiphertext::from(vec![cpu_ct.0.clone()]); - let cuda_ct = crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct_as_radix, streams); - let cuda_ct = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_ct.ciphertext); - *self = Self::Cuda(cuda_ct); - let Self::Cuda(cuda_ct) = self else { - unreachable!() - }; - &mut cuda_ct.0 - } - Self::Cuda(cuda_ct) => { - if cuda_ct.gpu_indexes() != streams.gpu_indexes() { - *cuda_ct = cuda_ct.duplicate(streams); - } - &mut cuda_ct.0 + Self::Cpu(cpu_ct) => cpu_ct, + #[cfg(feature = "gpu")] + Self::Cuda(ct) => { + with_thread_local_cuda_streams_for_gpu_indexes(ct.gpu_indexes(), |streams| { + ct.to_boolean_block(streams) + }) } + #[cfg(feature = "hpu")] + Self::Hpu(hpu_ct) => hpu_ct.to_boolean_block(), } } #[cfg(feature = "gpu")] - pub(crate) fn into_gpu( - self, - streams: &CudaStreams, - ) -> crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock { - match self { - Self::Cpu(cpu_ct) 
=> with_thread_local_cuda_streams(|streams| { - crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock::from_boolean_block( - &cpu_ct, streams, - ) - }), - Self::Cuda(ct) => ct.move_to_stream(streams), - } + pub(crate) fn into_gpu(self, streams: &CudaStreams) -> CudaBooleanBlock { + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_bool = match self { + Self::Cuda(gpu_bool) => return gpu_bool.move_to_stream(streams), + _ => self.into_cpu(), + }; + CudaBooleanBlock::from_boolean_block(&cpu_bool, streams) } #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn move_to_device(&mut self, device: Device) { - match (&self, device) { - (Self::Cpu(_), Device::Cpu) => { - // Nothing to do, we already are on the correct device - } + pub(crate) fn move_to_device(&mut self, target_device: Device) { + let current_device = self.current_device(); + + if current_device == target_device { #[cfg(feature = "gpu")] - (Self::Cuda(cuda_ct), Device::CudaGpu) => { - // We are on a GPU, but it may not be the correct one - let new = with_thread_local_cuda_streams(|streams| { - if cuda_ct.gpu_indexes() == streams.gpu_indexes() { - None - } else { - Some(cuda_ct.duplicate(streams)) + // We may not be on the correct Cuda device + if let Self::Cuda(cuda_ct) = self { + with_thread_local_cuda_streams(|streams| { + if cuda_ct.gpu_indexes() != streams.gpu_indexes() { + *cuda_ct = cuda_ct.duplicate(streams); } - }); - if let Some(ct) = new { - *self = Self::Cuda(ct); - } + }) + } + return; + } + + // The logic is that the common device is the CPU, all other devices + // know how to transfer from and to CPU. 
+ + // So we first transfer to CPU + let cpu_ct = self.on_cpu(); + + // Then we can transfer the desired device + match target_device { + Device::Cpu => { + *self = Self::Cpu((*cpu_ct).clone()); } #[cfg(feature = "gpu")] - (Self::Cpu(ct), Device::CudaGpu) => { + Device::CudaGpu => { let new_inner = with_thread_local_cuda_streams(|streams| { crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock::from_boolean_block( - ct, - streams, + &cpu_ct, streams, ) }); *self = Self::Cuda(new_inner); } - #[cfg(feature = "gpu")] - (Self::Cuda(ct), Device::Cpu) => { - let new_inner = - with_thread_local_cuda_streams_for_gpu_indexes(ct.gpu_indexes(), |streams| { - ct.to_boolean_block(streams) - }); - *self = Self::Cpu(new_inner); + #[cfg(feature = "hpu")] + Device::Hpu => { + let hpu_ct = global_state::with_thread_local_hpu_device(|device| { + HpuRadixCiphertext::from_boolean_ciphertext(&cpu_ct, &device.device) + }); + *self = Self::Hpu(hpu_ct); } } } diff --git a/tfhe/src/high_level_api/booleans/oprf.rs b/tfhe/src/high_level_api/booleans/oprf.rs index 270ef554a..f3a2836ed 100644 --- a/tfhe/src/high_level_api/booleans/oprf.rs +++ b/tfhe/src/high_level_api/booleans/oprf.rs @@ -13,7 +13,7 @@ use tfhe_csprng::seeders::Seed; impl FheBool { /// Generates an encrypted boolean /// taken uniformly using the given seed. - /// The encryted value is oblivious to the server. + /// The encrypted value is oblivious to the server. /// It can be useful to make server random generation deterministic.
/// /// ```rust @@ -53,6 +53,10 @@ impl FheBool { cuda_key.tag.clone(), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support random bool generation") + } }); Self::new(ciphertext, tag) } diff --git a/tfhe/src/high_level_api/booleans/squashed_noise.rs b/tfhe/src/high_level_api/booleans/squashed_noise.rs index 3587680f0..70b9916b7 100644 --- a/tfhe/src/high_level_api/booleans/squashed_noise.rs +++ b/tfhe/src/high_level_api/booleans/squashed_noise.rs @@ -106,8 +106,8 @@ impl InnerSquashedNoiseBoolean { (Self::Cpu(_), Device::Cpu) => { // Nothing to do, we already are on the correct device } - #[cfg(feature = "gpu")] - _ => panic!("Cuda devices do not support noise squashing yet"), + #[cfg(any(feature = "gpu", feature = "hpu"))] + _ => panic!("Cuda/Hpu devices do not support noise squashing yet"), } } @@ -180,6 +180,10 @@ impl SquashNoise for FheBool { InternalServerKey::Cuda(_) => Err(crate::error!( "Cuda devices do not support noise squashing yet" )), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + Err(crate::error!("Hpu devices do not support noise squashing")) + } }) } } diff --git a/tfhe/src/high_level_api/compact_list.rs b/tfhe/src/high_level_api/compact_list.rs index 3a71ec403..6415b4a9c 100644 --- a/tfhe/src/high_level_api/compact_list.rs +++ b/tfhe/src/high_level_api/compact_list.rs @@ -199,7 +199,7 @@ impl CompactCiphertextList { inner, tag: self.tag.clone(), }), - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "hpu"))] Some(_) => Err(crate::Error::new("Expected a CPU server key".to_string())), }) } @@ -314,7 +314,7 @@ mod zk { inner: expander, tag: self.tag.clone(), }), - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "hpu"))] Some(_) => Err(crate::Error::new("Expected a CPU server key".to_string())), }) } @@ -346,7 +346,7 @@ mod zk { inner: expander, tag: self.tag.clone(), }), - #[cfg(feature = "gpu")] + #[cfg(any(feature = "gpu", feature = "hpu"))] Some(_) 
=> Err(crate::Error::new("Expected a CPU server key".to_string())), }) } diff --git a/tfhe/src/high_level_api/compressed_ciphertext_list.rs b/tfhe/src/high_level_api/compressed_ciphertext_list.rs index 45849923c..cc18c6029 100644 --- a/tfhe/src/high_level_api/compressed_ciphertext_list.rs +++ b/tfhe/src/high_level_api/compressed_ciphertext_list.rs @@ -26,7 +26,7 @@ use crate::integer::gpu::ciphertext::CudaRadixCiphertext; use crate::named::Named; use crate::prelude::{CiphertextList, Tagged}; use crate::shortint::Ciphertext; -use crate::{FheBool, FheInt, FheUint, Tag}; +use crate::{Device, FheBool, FheInt, FheUint, Tag}; impl HlCompressible for FheUint { fn compress_into(self, messages: &mut Vec<(ToBeCompressed, DataKind)>) { @@ -42,6 +42,10 @@ impl HlCompressible for FheUint { let kind = DataKind::Unsigned(blocks.info.blocks.len()); messages.push((ToBeCompressed::Cuda(blocks), kind)); } + #[cfg(feature = "hpu")] + crate::high_level_api::integers::unsigned::RadixCiphertext::Hpu(_) => { + panic!("HPU does not support compression"); + } } } } @@ -74,6 +78,8 @@ impl HlCompressible for FheBool { let kind = DataKind::Boolean; messages.push((ToBeCompressed::Cuda(cuda_bool.0.ciphertext), kind)); } + #[cfg(feature = "hpu")] + InnerBoolean::Hpu(_) => panic!("HPU does not support compression"), } } } @@ -209,6 +215,10 @@ impl CompressedCiphertextListBuilder { } }) } + #[cfg(feature = "hpu")] + Some(InternalServerKey::Hpu(_)) => Err(crate::Error::new( + "Hpu does not support compression".to_string(), + )), None => Err(UninitializedServerKey.into()), }) } @@ -255,38 +265,45 @@ impl InnerCompressedCiphertextList { } } - fn move_to_device(&mut self, device: crate::Device) { - let new_value = match (&self, device) { - (Self::Cpu(_), crate::Device::Cpu) => None, + #[allow(clippy::needless_pass_by_ref_mut)] + fn move_to_device(&mut self, target_device: Device) { + let current_device = self.current_device(); + + if current_device == target_device { #[cfg(feature = "gpu")] - 
(Self::Cuda(cuda_ct), crate::Device::CudaGpu) => { + // We may not be on the correct Cuda device + if let Self::Cuda(cuda_ct) = self { with_thread_local_cuda_streams(|streams| { - if cuda_ct.gpu_indexes() == streams.gpu_indexes() { - None - } else { - Some(Self::Cuda(cuda_ct.duplicate(streams))) + if cuda_ct.gpu_indexes() != streams.gpu_indexes() { + *cuda_ct = cuda_ct.duplicate(streams); } }) } - #[cfg(feature = "gpu")] - (Self::Cuda(cuda_ct), crate::Device::Cpu) => { - let cpu_ct = with_thread_local_cuda_streams_for_gpu_indexes( - cuda_ct.gpu_indexes(), - |streams| cuda_ct.to_compressed_ciphertext_list(streams), - ); - Some(Self::Cpu(cpu_ct)) + return; + } + + // The logic is that the common device is the CPU, all other devices + // know how to transfer from and to CPU. + + // So we first transfer to CPU + let cpu_ct = self.on_cpu(); + + // Then we can transfer the desired device + match target_device { + Device::Cpu => { + let _ = cpu_ct; } #[cfg(feature = "gpu")] - (Self::Cpu(cpu_ct), crate::Device::CudaGpu) => { - let cuda_ct = with_thread_local_cuda_streams(|streams| { + Device::CudaGpu => { + let new_inner = with_thread_local_cuda_streams(|streams| { cpu_ct.to_cuda_compressed_ciphertext_list(streams) }); - Some(Self::Cuda(cuda_ct)) + *self = Self::Cuda(new_inner); + } + #[cfg(feature = "hpu")] + Device::Hpu => { + panic!("HPU does not support compression"); } - }; - - if let Some(v) = new_value { - *self = v; } } @@ -468,6 +485,10 @@ impl CiphertextList for CompressedCiphertextList { } ct }), + #[cfg(feature = "hpu")] + Some(InternalServerKey::Hpu(_)) => { + panic!("HPU does not support compression"); + } None => Err(UninitializedServerKey.into()), }) } diff --git a/tfhe/src/high_level_api/config.rs b/tfhe/src/high_level_api/config.rs index d5cabaf53..a8ede6b54 100644 --- a/tfhe/src/high_level_api/config.rs +++ b/tfhe/src/high_level_api/config.rs @@ -13,6 +13,13 @@ pub struct Config { } impl Config { + #[cfg(feature = "hpu")] + pub fn 
from_hpu_device(hpu_device: &tfhe_hpu_backend::prelude::HpuDevice) -> Self { + let pbs_params = + crate::shortint::parameters::KeySwitch32PBSParameters::from(hpu_device.params()); + ConfigBuilder::with_custom_parameters(pbs_params).build() + } + pub fn public_key_encryption_parameters( &self, ) -> Result diff --git a/tfhe/src/high_level_api/global_state.rs b/tfhe/src/high_level_api/global_state.rs index f4f3e9a3d..e4e0fb1d4 100644 --- a/tfhe/src/high_level_api/global_state.rs +++ b/tfhe/src/high_level_api/global_state.rs @@ -129,11 +129,7 @@ pub(in crate::high_level_api) fn device_of_internal_keys() -> Option crate::Device::Cpu, - #[cfg(feature = "gpu")] - InternalServerKey::Cuda(_) => crate::Device::CudaGpu, - }) + cell.as_ref().map(InternalServerKey::device) }) } @@ -146,6 +142,8 @@ pub(in crate::high_level_api) fn tag_of_internal_server_key() -> crate::Result cpu_key.tag.clone(), #[cfg(feature = "gpu")] InternalServerKey::Cuda(cuda_key) => cuda_key.tag.clone(), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(hpu_device) => hpu_device.tag.clone(), }) }) } @@ -162,13 +160,15 @@ where .as_ref() .ok_or(UninitializedServerKey) .unwrap_display(); - match key { - InternalServerKey::Cpu(key) => func(key), - #[cfg(feature = "gpu")] - InternalServerKey::Cuda(_) => { - panic!("Cpu key requested but only cuda key is available") - } - } + #[allow(irrefutable_let_patterns, reason = "It depends on hardware features")] + let InternalServerKey::Cpu(cpu_key) = key + else { + panic!( + "Cpu key requested but only the key for {:?} is available", + key.device() + ) + }; + func(cpu_key) }) } @@ -185,12 +185,13 @@ where .as_ref() .ok_or(UninitializedServerKey) .unwrap_display(); - match key { - InternalServerKey::Cuda(key) => func(key), - InternalServerKey::Cpu(_) => { - panic!("Cuda key requested but only cpu key is available") - } - } + let InternalServerKey::Cuda(cuda_key) = key else { + panic!( + "Cuda key requested but only the key for {:?} is available", + key.device() + ) +
}; + func(cuda_key) }) } @@ -307,3 +308,27 @@ mod gpu { } } } + +#[cfg(feature = "hpu")] +pub(in crate::high_level_api) use hpu::with_thread_local_hpu_device; + +#[cfg(feature = "hpu")] +mod hpu { + use super::*; + + use crate::high_level_api::keys::HpuTaggedDevice; + + use super::INTERNAL_KEYS; + + pub(in crate::high_level_api) fn with_thread_local_hpu_device(func: F) -> R + where + F: FnOnce(&HpuTaggedDevice) -> R, + { + INTERNAL_KEYS.with_borrow(|keys| { + let Some(InternalServerKey::Hpu(device)) = keys else { + panic!("Hpu device was requested but it is not available") + }; + func(device) + }) + } +} diff --git a/tfhe/src/high_level_api/integers/oprf.rs b/tfhe/src/high_level_api/integers/oprf.rs index 1c40f5e70..1fb33dec2 100644 --- a/tfhe/src/high_level_api/integers/oprf.rs +++ b/tfhe/src/high_level_api/integers/oprf.rs @@ -6,6 +6,7 @@ use crate::high_level_api::keys::InternalServerKey; #[cfg(feature = "gpu")] use crate::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext}; use crate::{FheInt, Seed}; + impl FheUint { /// Generates an encrypted unsigned integer /// taken uniformly in its full range using the given seed. 
@@ -50,6 +51,10 @@ impl FheUint { Self::new(d_ct, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } /// Generates an encrypted `num_block` blocks unsigned integer @@ -99,6 +104,10 @@ impl FheUint { ); Self::new(d_ct, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -148,6 +157,10 @@ impl FheInt { Self::new(d_ct, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -199,6 +212,10 @@ impl FheInt { ); Self::new(d_ct, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/signed/base.rs b/tfhe/src/high_level_api/integers/signed/base.rs index 675b19e94..3d989607c 100644 --- a/tfhe/src/high_level_api/integers/signed/base.rs +++ b/tfhe/src/high_level_api/integers/signed/base.rs @@ -204,6 +204,10 @@ where .abs(&*self.ciphertext.on_gpu(streams), streams); Self::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -240,6 +244,10 @@ where .is_even(&*self.ciphertext.on_gpu(streams), streams); FheBool::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -276,6 +284,10 @@ where .is_odd(&*self.ciphertext.on_gpu(streams), streams); FheBool::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -321,6 +333,10 @@ where ); crate::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature 
= "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -366,6 +382,10 @@ where ); crate::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -411,6 +431,10 @@ where ); crate::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -456,6 +480,10 @@ where ); crate::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -493,6 +521,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support count_ones yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -530,6 +562,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support count_zeros yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -577,6 +613,10 @@ where ); crate::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -634,6 +674,10 @@ where FheBool::new(is_ok, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -708,6 +752,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support reverse yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -755,6 +803,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support if_then_else 
yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet."); + } }) } @@ -816,6 +868,10 @@ where ); Self::new(new_ciphertext, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -860,6 +916,10 @@ where ); Self::new(new_ciphertext, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -907,6 +967,10 @@ where ); Self::new(inner, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/signed/encrypt.rs b/tfhe/src/high_level_api/integers/signed/encrypt.rs index d4e284fd5..446331d15 100644 --- a/tfhe/src/high_level_api/integers/signed/encrypt.rs +++ b/tfhe/src/high_level_api/integers/signed/encrypt.rs @@ -121,6 +121,8 @@ where ); Ok(Self::new(inner, cuda_key.tag.clone())) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => panic!("Hpu does not currently support signed operation"), }) } } diff --git a/tfhe/src/high_level_api/integers/signed/inner.rs b/tfhe/src/high_level_api/integers/signed/inner.rs index 7b24f6d0f..85a98cde1 100644 --- a/tfhe/src/high_level_api/integers/signed/inner.rs +++ b/tfhe/src/high_level_api/integers/signed/inner.rs @@ -240,12 +240,15 @@ impl SignedRadixCiphertext { } #[cfg(feature = "gpu")] (Self::Cuda(ct), Device::Cpu) => { - let new_inner = - with_thread_local_cuda_streams_for_gpu_indexes(ct.gpu_indexes(), |streams| { - ct.to_signed_radix_ciphertext(streams) - }); + let new_inner = with_thread_local_cuda_streams(|streams| { + ct.to_signed_radix_ciphertext(streams) + }); *self = Self::Cpu(new_inner); } + #[cfg(feature = "hpu")] + (_, Device::Hpu) => { + panic!("Hpu device do not support signed integer yet",) + } } } diff --git 
a/tfhe/src/high_level_api/integers/signed/ops.rs b/tfhe/src/high_level_api/integers/signed/ops.rs index ebac40a4d..f388aaee0 100644 --- a/tfhe/src/high_level_api/integers/signed/ops.rs +++ b/tfhe/src/high_level_api/integers/signed/ops.rs @@ -106,6 +106,10 @@ where Self::new(inner, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -152,6 +156,10 @@ where ); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -198,6 +206,10 @@ where ); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -255,6 +267,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -294,6 +310,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -359,6 +379,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -398,6 +422,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -437,6 +465,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -476,6 +508,10 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + 
#[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -561,6 +597,10 @@ where FheInt::::new(r, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -640,6 +680,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -683,6 +727,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -726,6 +774,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -767,6 +819,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -808,6 +864,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -849,6 +909,10 @@ generic_integer_impl_operation!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -899,6 +963,10 @@ generic_integer_impl_operation!( .div(&*lhs.ciphertext.on_gpu(streams), &*rhs.ciphertext.on_gpu(streams), streams); FheInt::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) 
=> { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -950,6 +1018,10 @@ generic_integer_impl_operation!( .rem(&*lhs.ciphertext.on_gpu(streams), &*rhs.ciphertext.on_gpu(streams), streams); FheInt::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1063,6 +1135,10 @@ generic_integer_impl_shift_rotate!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1107,6 +1183,10 @@ generic_integer_impl_shift_rotate!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1151,6 +1231,10 @@ generic_integer_impl_shift_rotate!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1195,6 +1279,10 @@ generic_integer_impl_shift_rotate!( FheInt::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1247,6 +1335,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1294,6 +1386,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1341,6 +1437,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1386,6 +1486,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation 
yet.") + } }) } } @@ -1431,6 +1535,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1476,6 +1584,10 @@ where ); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1528,6 +1640,10 @@ where *cuda_lhs = cuda_result; }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1580,6 +1696,10 @@ where *cuda_lhs = cuda_result; }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1635,6 +1755,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1689,6 +1813,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1744,6 +1872,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1799,6 +1931,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1870,6 +2006,10 @@ where .neg(&*self.ciphertext.on_gpu(streams), streams); FheInt::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1939,6 +2079,10 @@ where .bitnot(&*self.ciphertext.on_gpu(streams), streams); FheInt::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs 
b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs index 840d1c443..097cc2834 100644 --- a/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs +++ b/tfhe/src/high_level_api/integers/signed/overflowing_ops.rs @@ -64,6 +64,10 @@ where FheBool::new(overflow, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -160,6 +164,10 @@ where FheBool::new(overflow, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -294,6 +302,10 @@ where FheBool::new(overflow, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -389,6 +401,10 @@ where FheBool::new(overflow, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -477,6 +493,10 @@ where InternalServerKey::Cuda(_) => { todo!("Cuda devices do not support signed integer"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/signed/scalar_ops.rs b/tfhe/src/high_level_api/integers/signed/scalar_ops.rs index 6dc5aa41c..00713c02d 100644 --- a/tfhe/src/high_level_api/integers/signed/scalar_ops.rs +++ b/tfhe/src/high_level_api/integers/signed/scalar_ops.rs @@ -64,6 +64,10 @@ where Self::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -112,6 +116,10 @@ where Self::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -159,6 
+167,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -200,6 +212,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -246,6 +262,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -286,6 +306,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -326,6 +350,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -366,6 +394,10 @@ where FheBool::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -422,6 +454,10 @@ macro_rules! generic_integer_impl_scalar_div_rem { <$concrete_type>::new(r, cuda_key.tag.clone()), ) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -463,6 +499,10 @@ macro_rules! define_scalar_rotate_shifts { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -492,6 +532,10 @@ macro_rules! 
define_scalar_rotate_shifts { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -521,6 +565,10 @@ macro_rules! define_scalar_rotate_shifts { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -550,6 +598,10 @@ macro_rules! define_scalar_rotate_shifts { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -576,6 +628,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_left_shift_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -602,6 +658,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_right_shift_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -628,6 +688,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_rotate_left_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -654,6 +718,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_rotate_right_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -769,6 +837,10 @@ macro_rules! 
define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -798,6 +870,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -827,6 +903,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -857,6 +937,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -886,6 +970,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -916,6 +1004,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -945,6 +1037,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -975,6 +1071,10 @@ macro_rules! define_scalar_ops { }); SignedRadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1021,6 +1121,10 @@ macro_rules! 
define_scalar_ops { SignedRadixCiphertext::Cuda(result) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1115,6 +1219,10 @@ macro_rules! define_scalar_ops { .scalar_add_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1146,6 +1254,10 @@ macro_rules! define_scalar_ops { .scalar_sub_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1173,6 +1285,10 @@ macro_rules! define_scalar_ops { .scalar_mul_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1201,6 +1317,10 @@ macro_rules! define_scalar_ops { .scalar_bitand_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1228,6 +1348,10 @@ macro_rules! define_scalar_ops { .scalar_bitor_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1254,6 +1378,10 @@ macro_rules! define_scalar_ops { .scalar_bitxor_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1278,7 +1406,11 @@ macro_rules! 
define_scalar_ops { let cuda_lhs = lhs.ciphertext.as_gpu_mut(streams); let cuda_result = cuda_key.pbs_key().signed_scalar_div(&cuda_lhs, rhs, streams); *cuda_lhs = cuda_result; - }) + }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1303,7 +1435,11 @@ macro_rules! define_scalar_ops { let cuda_lhs = lhs.ciphertext.as_gpu_mut(streams); let cuda_result = cuda_key.pbs_key().signed_scalar_rem(&cuda_lhs, rhs, streams); *cuda_lhs = cuda_result; - }) + }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, diff --git a/tfhe/src/high_level_api/integers/signed/squashed_noise.rs b/tfhe/src/high_level_api/integers/signed/squashed_noise.rs index 7459014ea..a15e51c7e 100644 --- a/tfhe/src/high_level_api/integers/signed/squashed_noise.rs +++ b/tfhe/src/high_level_api/integers/signed/squashed_noise.rs @@ -114,8 +114,8 @@ impl InnerSquashedNoiseSignedRadixCiphertext { (Self::Cpu(_), Device::Cpu) => { // Nothing to do, we already are on the correct device } - #[cfg(feature = "gpu")] - _ => panic!("Cuda devices do not support noise squashing yet"), + #[cfg(any(feature = "gpu", feature = "hpu"))] + _ => panic!("Cuda/Hpu devices do not support noise squashing yet"), } } @@ -199,6 +199,10 @@ impl SquashNoise for FheInt { InternalServerKey::Cuda(_) => Err(crate::error!( "Cuda devices do not support noise squashing yet" )), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + Err(crate::error!("Hpu devices do not support noise squashing")) + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/base.rs b/tfhe/src/high_level_api/integers/unsigned/base.rs index 972ff5423..9aec3a358 100644 --- a/tfhe/src/high_level_api/integers/unsigned/base.rs +++ b/tfhe/src/high_level_api/integers/unsigned/base.rs @@ -9,7 +9,7 @@ use crate::high_level_api::global_state::with_thread_local_cuda_streams; use 
crate::high_level_api::integers::signed::{FheInt, FheIntId}; use crate::high_level_api::integers::IntegerId; use crate::high_level_api::keys::InternalServerKey; -use crate::high_level_api::traits::Tagged; +use crate::high_level_api::traits::{FheWait, Tagged}; use crate::high_level_api::{global_state, Device}; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; #[cfg(feature = "gpu")] @@ -25,6 +25,11 @@ use crate::GpuIndex; use crate::{FheBool, ServerKey, Tag}; use std::marker::PhantomData; +#[cfg(feature = "hpu")] +use crate::high_level_api::traits::{FheHpu, HpuHandle}; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::*; + #[derive(Debug)] pub enum GenericIntegerBlockError { NumberOfBlocks(usize, usize), @@ -147,6 +152,56 @@ where } } +impl FheWait for FheUint +where + Id: FheUintId, +{ + fn wait(&self) { + self.ciphertext.wait() + } +} + +#[cfg(feature = "hpu")] +impl FheHpu for FheUint +where + Id: FheUintId, +{ + fn iop_exec(iop: &hpu_asm::AsmIOpcode, src: HpuHandle<&Self>) -> HpuHandle { + use crate::integer::hpu::ciphertext::HpuRadixCiphertext; + global_state::with_thread_local_hpu_device(|device| { + let mut srcs = Vec::new(); + for n in src.native.iter() { + srcs.push(n.ciphertext.on_hpu(device).clone()); + } + for b in src.boolean.iter() { + srcs.push(b.ciphertext.on_hpu(device).clone()); + } + + let (opcode, proto) = { + ( + iop.opcode(), + &iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap as they are just Arc + let hpu_res = HpuRadixCiphertext::exec(proto, opcode, &srcs, &src.imm); + HpuHandle { + native: hpu_res + .iter() + .filter(|x| !x.0.is_boolean()) + .map(|x| Self::new(x.clone(), device.tag.clone())) + .collect::>(), + boolean: hpu_res + .iter() + .filter(|x| x.0.is_boolean()) + .map(|x| FheBool::new(x.clone(), device.tag.clone())) + .collect::>(), + imm: Vec::new(), + } + }) + } +} + impl FheUint where Id: FheUintId, @@ -217,12 +272,12 @@ where /// slice is empty 
#[cfg(feature = "gpu")] pub fn gpu_indexes(&self) -> &[GpuIndex] { + #[allow(clippy::match_wildcard_for_single_variants)] match &self.ciphertext { - RadixCiphertext::Cpu(_) => &[], RadixCiphertext::Cuda(cuda_ct) => cuda_ct.gpu_indexes(), + _ => &[], } } - /// Returns a FheBool that encrypts `true` if the value is even /// /// # Example @@ -256,6 +311,10 @@ where .is_even(&*self.ciphertext.on_gpu(streams), streams); FheBool::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -292,6 +351,10 @@ where .is_odd(&*self.ciphertext.on_gpu(streams), streams); FheBool::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -430,6 +493,10 @@ where ); super::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -475,6 +542,10 @@ where ); super::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -520,6 +591,10 @@ where ); super::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -565,6 +640,10 @@ where ); super::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -602,6 +681,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support count_ones yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -639,6 +722,10 @@ where InternalServerKey::Cuda(_) => { 
panic!("Cuda devices do not support count_zeros yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -686,6 +773,10 @@ where ); super::FheUint32::new(result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -743,6 +834,10 @@ where FheBool::new(is_ok, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -826,6 +921,10 @@ where Err(crate::Error::new("Output type does not have enough bits to represent all possible output values".to_string())) } }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -902,6 +1001,10 @@ where Err(crate::Error::new("Output type does not have enough bits to represent all possible output values".to_string())) } }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -938,6 +1041,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support reverse yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -984,6 +1091,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support if_then_else yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet."); + } }) } @@ -1023,6 +1134,10 @@ where cuda_key.key.key.carry_modulus, cuda_key.key.key.message_modulus, ), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }); // Check number of blocks @@ -1109,6 +1224,10 @@ where ); Self::new(casted, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1153,6 +1272,10 @@ where ); Self::new(casted, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1197,6 +1320,10 @@ where ); Self::new(inner, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/encrypt.rs b/tfhe/src/high_level_api/integers/unsigned/encrypt.rs index 593e580e6..a67ff9839 100644 --- a/tfhe/src/high_level_api/integers/unsigned/encrypt.rs +++ b/tfhe/src/high_level_api/integers/unsigned/encrypt.rs @@ -123,6 +123,10 @@ where ); Ok(Self::new(inner, cuda_key.tag.clone())) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support trivial encryption") + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/inner.rs b/tfhe/src/high_level_api/integers/unsigned/inner.rs index 2ca75142f..b2789752d 100644 --- a/tfhe/src/high_level_api/integers/unsigned/inner.rs +++ b/tfhe/src/high_level_api/integers/unsigned/inner.rs @@ -7,16 +7,24 @@ use crate::high_level_api::global_state; use crate::high_level_api::global_state::{ with_thread_local_cuda_streams, with_thread_local_cuda_streams_for_gpu_indexes, }; +#[cfg(feature = "hpu")] +use crate::high_level_api::keys::HpuTaggedDevice; #[cfg(feature = "gpu")] use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext}; +#[cfg(feature = "hpu")] +use crate::integer::hpu::ciphertext::HpuRadixCiphertext; use crate::Device; use serde::{Deserializer, Serializer}; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::*; use tfhe_versionable::{Unversionize, UnversionizeError, Versionize, VersionizeOwned}; pub(crate) enum RadixCiphertext { Cpu(crate::integer::RadixCiphertext), 
#[cfg(feature = "gpu")] Cuda(crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext), + #[cfg(feature = "hpu")] + Hpu(HpuRadixCiphertext), } impl From for RadixCiphertext { @@ -32,6 +40,13 @@ impl From for Radi } } +#[cfg(feature = "hpu")] +impl From for RadixCiphertext { + fn from(value: HpuRadixCiphertext) -> Self { + Self::Hpu(value) + } +} + impl Clone for RadixCiphertext { fn clone(&self) -> Self { match self { @@ -40,6 +55,24 @@ impl Clone for RadixCiphertext { Self::Cuda(inner) => { with_thread_local_cuda_streams(|streams| Self::Cuda(inner.duplicate(streams))) } + #[cfg(feature = "hpu")] + Self::Hpu(inner) => { + // NB: Hpu backend flavors behave differently regarding memory. + // Some of them have duplicated memory on the Host with a sync mechanism, + // but that is not the case for all of them. + // To prevent special cases, all "deep" clones are made on the HPU side + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_MEMCPY; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + let deep_clone = HpuRadixCiphertext::exec(proto, opcode, &[inner.clone()], &[]) + .pop() + .expect("IOP_MEMCPY must return 1 operand"); + Self::Hpu(deep_clone) + } } } } @@ -107,11 +140,23 @@ impl Unversionize for RadixCiphertext { } impl RadixCiphertext { + pub(crate) fn wait(&self) { + match self { + Self::Cpu(_) => {} + #[cfg(feature = "gpu")] + Self::Cuda(_) => {} + #[cfg(feature = "hpu")] + Self::Hpu(hpu_ct) => hpu_ct.0.wait(), + } + } + pub(crate) fn current_device(&self) -> Device { match self { Self::Cpu(_) => Device::Cpu, #[cfg(feature = "gpu")] Self::Cuda(_) => Device::CudaGpu, + #[cfg(feature = "hpu")] + Self::Hpu(_) => Device::Hpu, } } @@ -121,11 +166,14 @@ impl RadixCiphertext { match self { Self::Cpu(ct) => MaybeCloned::Borrowed(ct), #[cfg(feature = "gpu")] - Self::Cuda(ct) => { - with_thread_local_cuda_streams_for_gpu_indexes(ct.gpu_indexes(), |streams| { - let cpu_ct = ct.to_radix_ciphertext(streams); - 
MaybeCloned::Cloned(cpu_ct) - }) + Self::Cuda(ct) => with_thread_local_cuda_streams(|streams| { + let cpu_ct = ct.to_radix_ciphertext(streams); + MaybeCloned::Cloned(cpu_ct) + }), + #[cfg(feature = "hpu")] + Self::Hpu(hpu_ct) => { + let cpu_inner = hpu_ct.to_radix_ciphertext(); + MaybeCloned::Cloned(cpu_inner) } } } @@ -136,30 +184,38 @@ impl RadixCiphertext { pub(crate) fn on_gpu( &self, streams: &CudaStreams, - ) -> MaybeCloned<'_, crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext> { - match self { - Self::Cpu(ct) => with_thread_local_cuda_streams(|streams| { - let ct = - crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext::from_radix_ciphertext( - ct, streams, - ); - MaybeCloned::Cloned(ct) - }), - #[cfg(feature = "gpu")] - Self::Cuda(ct) => { - if ct.gpu_indexes() == streams.gpu_indexes() { - MaybeCloned::Borrowed(ct) - } else { - MaybeCloned::Cloned(ct.duplicate(streams)) + ) -> MaybeCloned<'_, CudaUnsignedRadixCiphertext> { + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_radix = match self { + Self::Cuda(gpu_radix) => { + if gpu_radix.gpu_indexes() == streams.gpu_indexes() { + return MaybeCloned::Borrowed(gpu_radix); } + return MaybeCloned::Cloned(gpu_radix.duplicate(streams)); } - } + _ => self.on_cpu(), + }; + + let gpu_radix = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cpu_radix, streams); + MaybeCloned::Cloned(gpu_radix) + } + + #[cfg(feature = "hpu")] + pub(crate) fn on_hpu(&self, device: &HpuTaggedDevice) -> MaybeCloned<'_, HpuRadixCiphertext> { + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_radix = match self { + Self::Hpu(hpu_radix) => return MaybeCloned::Borrowed(hpu_radix), + _ => self.on_cpu(), + }; + + let hpu_ct = HpuRadixCiphertext::from_radix_ciphertext(&cpu_radix, &device.device); + MaybeCloned::Cloned(hpu_ct) } pub(crate) fn as_cpu_mut(&mut self) -> &mut crate::integer::RadixCiphertext { match self { Self::Cpu(radix_ct) => radix_ct, - #[cfg(feature = "gpu")] + #[cfg(any(feature = 
"gpu", feature = "hpu"))] _ => { self.move_to_device(Device::Cpu); self.as_cpu_mut() @@ -168,25 +224,36 @@ impl RadixCiphertext { } #[cfg(feature = "gpu")] - pub(crate) fn as_gpu_mut( - &mut self, - streams: &CudaStreams, - ) -> &mut crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext { - match self { - Self::Cpu(cpu_ct) => { - let cuda_ct = crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext::from_radix_ciphertext(cpu_ct, streams); - *self = Self::Cuda(cuda_ct); - let Self::Cuda(cuda_ct) = self else { - unreachable!() - }; - cuda_ct - } - Self::Cuda(cuda_ct) => { - if cuda_ct.gpu_indexes() != streams.gpu_indexes() { - *cuda_ct = cuda_ct.duplicate(streams); - } - cuda_ct + pub(crate) fn as_gpu_mut(&mut self, streams: &CudaStreams) -> &mut CudaUnsignedRadixCiphertext { + let cpu_radix = if let Self::Cuda(cuda_ct) = self { + if cuda_ct.gpu_indexes() != streams.gpu_indexes() { + *cuda_ct = cuda_ct.duplicate(streams); } + return cuda_ct; + } else { + self.on_cpu() + }; + + let cuda_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cpu_radix, streams); + *self = Self::Cuda(cuda_ct); + let Self::Cuda(cuda_ct) = self else { + unreachable!() + }; + cuda_ct + } + + #[cfg(feature = "hpu")] + pub(crate) fn as_hpu_mut(&mut self, device: &HpuTaggedDevice) -> &mut HpuRadixCiphertext { + if let Self::Hpu(radix_ct) = self { + radix_ct + } else { + let cpu_ct = self.on_cpu(); + let hpu_ct = HpuRadixCiphertext::from_radix_ciphertext(&cpu_ct, &device.device); + *self = Self::Hpu(hpu_ct); + let Self::Hpu(hpu_ct) = self else { + unreachable!() + }; + hpu_ct } } @@ -199,55 +266,74 @@ impl RadixCiphertext { ct.to_radix_ciphertext(streams) }) } + #[cfg(feature = "hpu")] + Self::Hpu(hpu_ct) => hpu_ct.to_radix_ciphertext(), } } #[cfg(feature = "gpu")] pub(crate) fn into_gpu(self, streams: &CudaStreams) -> CudaUnsignedRadixCiphertext { - match self { - Self::Cpu(cpu_ct) => { - CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cpu_ct, streams) - } - Self::Cuda(ct) 
=> ct.move_to_stream(streams), + #[allow(clippy::match_wildcard_for_single_variants)] + let cpu_radix = match self { + Self::Cuda(gpu_radix) => return gpu_radix.move_to_stream(streams), + _ => self.into_cpu(), + }; + CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cpu_radix, streams) + } + + #[cfg(feature = "hpu")] + pub(crate) fn into_hpu(self, device: &HpuTaggedDevice) -> HpuRadixCiphertext { + if let Self::Hpu(radix_ct) = self { + radix_ct + } else { + let cpu_ct = self.on_cpu(); + HpuRadixCiphertext::from_radix_ciphertext(&cpu_ct, &device.device) } } #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn move_to_device(&mut self, device: Device) { - match (&self, device) { - (Self::Cpu(_), Device::Cpu) => { - // Nothing to do, we already are on the correct device - } + pub(crate) fn move_to_device(&mut self, target_device: Device) { + let current_device = self.current_device(); + + if current_device == target_device { #[cfg(feature = "gpu")] - (Self::Cuda(cuda_ct), Device::CudaGpu) => { - // We are on a GPU, but it may not be the correct one - let new = with_thread_local_cuda_streams(|streams| { - if cuda_ct.gpu_indexes() == streams.gpu_indexes() { - None - } else { - Some(cuda_ct.duplicate(streams)) + // We may not be on the correct Cuda device + if let Self::Cuda(cuda_ct) = self { + with_thread_local_cuda_streams(|streams| { + if cuda_ct.gpu_indexes() != streams.gpu_indexes() { + *cuda_ct = cuda_ct.duplicate(streams); } - }); - if let Some(ct) = new { - *self = Self::Cuda(ct); - } + }) + } + return; + } + + // The logic is that the common device is the CPU, all other devices + // know how to transfer from and to CPU. 
+ + // So we first transfer to CPU + let cpu_ct = self.on_cpu(); + + // Then we can transfer to the desired device + match target_device { + Device::Cpu => { + let _ = cpu_ct; } #[cfg(feature = "gpu")] - (Self::Cpu(ct), Device::CudaGpu) => { + Device::CudaGpu => { let new_inner = with_thread_local_cuda_streams(|streams| { crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext::from_radix_ciphertext( - ct, streams, + &cpu_ct, streams, ) }); *self = Self::Cuda(new_inner); } - #[cfg(feature = "gpu")] - (Self::Cuda(ct), Device::Cpu) => { - let new_inner = - with_thread_local_cuda_streams_for_gpu_indexes(ct.gpu_indexes(), |streams| { - ct.to_radix_ciphertext(streams) - }); - *self = Self::Cpu(new_inner); + #[cfg(feature = "hpu")] + Device::Hpu => { + let hpu_ct = global_state::with_thread_local_hpu_device(|device| { + HpuRadixCiphertext::from_radix_ciphertext(&cpu_ct, &device.device) + }); + *self = Self::Hpu(hpu_ct); } } } diff --git a/tfhe/src/high_level_api/integers/unsigned/ops.rs b/tfhe/src/high_level_api/integers/unsigned/ops.rs index 1efe9cce8..dd7678372 100644 --- a/tfhe/src/high_level_api/integers/unsigned/ops.rs +++ b/tfhe/src/high_level_api/integers/unsigned/ops.rs @@ -1,6 +1,7 @@ // Ask clippy not to worry about this // this is the pattern we use for the macros #![allow(clippy::redundant_closure_call)] + use super::inner::RadixCiphertext; #[cfg(feature = "gpu")] use crate::high_level_api::details::MaybeCloned; @@ -17,12 +18,16 @@ use crate::high_level_api::traits::{ }; #[cfg(feature = "gpu")] use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext; +#[cfg(feature = "hpu")] +use crate::integer::hpu::ciphertext::HpuRadixCiphertext; use crate::{FheBool, FheUint}; use std::borrow::Borrow; use std::ops::{ Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Div, DivAssign, Mul, MulAssign, Neg, Not, Rem, RemAssign, Shl, ShlAssign, Shr, ShrAssign, Sub, SubAssign, }; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::*; 
impl std::iter::Sum for FheUint where @@ -93,6 +98,15 @@ where }); Self::new(inner, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let mut iter = iter; + let mut result = iter.next().unwrap().ciphertext.into_hpu(device); + for o in iter { + result += o.ciphertext.into_hpu(device); + } + Self::new(result, device.tag.clone()) + } }) } } @@ -186,6 +200,21 @@ where Self::new(inner, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let mut iter = iter; + let first = iter.next().unwrap().ciphertext.on_hpu(device); + + let Some(second) = iter.next() else { + return Self::new(first.clone(), device.tag.clone()); + }; + + let mut result = &*first + &*second.ciphertext.on_hpu(device); + for o in iter { + result += &*o.ciphertext.on_hpu(device); + } + Self::new(result, device.tag.clone()) + } }) } } @@ -232,6 +261,10 @@ where ); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -278,6 +311,10 @@ where ); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -335,6 +372,28 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_EQ; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } @@ -374,6 +433,28 @@ where ); 
FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_NEQ; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } } @@ -439,6 +520,28 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_LT; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } @@ -478,6 +581,28 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_LTE; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } @@ -517,6 +642,28 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + 
#[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_GT; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } @@ -556,6 +703,28 @@ where ); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + let (opcode, proto) = { + let asm_iop = &hpu_asm::iop::IOP_CMP_GTE; + ( + asm_iop.opcode(), + &asm_iop.format().expect("Unspecified IOP format").proto, + ) + }; + // These clones are cheap are they are just Arc + let hpu_result = HpuRadixCiphertext::exec( + proto, + opcode, + &[hpu_lhs.clone(), hpu_rhs.clone()], + &[], + ) + .pop() + .unwrap(); + FheBool::new(hpu_result, device.tag.clone()) + } }) } } @@ -642,6 +811,10 @@ where FheUint::::new(inner_result.1, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -723,6 +896,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs + &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -766,6 +945,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = 
rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs - &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -809,6 +994,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs * &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -850,6 +1041,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs & &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -891,6 +1088,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs | &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -932,6 +1135,12 @@ generic_integer_impl_operation!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = lhs.ciphertext.on_hpu(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + FheUint::new(&*hpu_lhs ^ &*hpu_rhs, device.tag.clone()) + } }) } }, @@ -982,6 +1191,10 @@ generic_integer_impl_operation!( .div(&*lhs.ciphertext.on_gpu(streams), &*rhs.ciphertext.on_gpu(streams), streams); FheUint::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1033,6 +1246,10 @@ generic_integer_impl_operation!( .rem(&*lhs.ciphertext.on_gpu(streams), &*rhs.ciphertext.on_gpu(streams), streams); FheUint::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1146,6 +1363,10 @@ generic_integer_impl_shift_rotate!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1190,6 +1411,10 @@ generic_integer_impl_shift_rotate!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1234,6 +1459,10 @@ generic_integer_impl_shift_rotate!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1278,6 +1507,10 @@ generic_integer_impl_shift_rotate!( FheUint::new(inner_result, cuda_key.tag.clone()) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -1328,6 +1561,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + *hpu_lhs += &*hpu_rhs; + } }) } } @@ -1373,6 +1612,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + *hpu_lhs -= &*hpu_rhs; + } }) } } @@ -1418,6 +1663,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + *hpu_lhs *= &*hpu_rhs; + } }) } } @@ -1461,6 +1712,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = 
rhs.ciphertext.on_hpu(device); + *hpu_lhs &= &*hpu_rhs; + } }) } } @@ -1504,6 +1761,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + *hpu_lhs |= &*hpu_rhs; + } }) } } @@ -1547,6 +1810,12 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let hpu_lhs = self.ciphertext.as_hpu_mut(device); + let hpu_rhs = rhs.ciphertext.on_hpu(device); + *hpu_lhs ^= &*hpu_rhs; + } }) } } @@ -1595,6 +1864,10 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1643,6 +1916,10 @@ where streams, ); }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1698,6 +1975,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1752,6 +2033,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1807,6 +2092,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1862,6 +2151,10 @@ where ); }); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -1941,6 +2234,10 @@ where .neg(&*self.ciphertext.on_gpu(streams), streams); FheUint::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -2010,6 +2307,10 @@ where .bitnot(&*self.ciphertext.on_gpu(streams), streams); FheUint::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support bitnot (operator `!`)") + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/overflowing_ops.rs b/tfhe/src/high_level_api/integers/unsigned/overflowing_ops.rs index 9b2e274d4..20aa6dbf7 100644 --- a/tfhe/src/high_level_api/integers/unsigned/overflowing_ops.rs +++ b/tfhe/src/high_level_api/integers/unsigned/overflowing_ops.rs @@ -64,6 +64,10 @@ where FheBool::new(inner_result.1, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -160,6 +164,10 @@ where FheBool::new(inner_result.1, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -296,6 +304,10 @@ where FheBool::new(inner_result.1, cuda_key.tag.clone()), ) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -384,6 +396,10 @@ where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support overflowing_add yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -471,6 +487,10 @@ where InternalServerKey::Cuda(_) => { todo!("Cuda devices do not support overflowing_mul"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/scalar_ops.rs b/tfhe/src/high_level_api/integers/unsigned/scalar_ops.rs index 6f93356ef..c86e727ce 100644 --- a/tfhe/src/high_level_api/integers/unsigned/scalar_ops.rs +++ b/tfhe/src/high_level_api/integers/unsigned/scalar_ops.rs @@ -68,6 +68,10 @@ where .scalar_eq(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -107,6 +111,10 @@ where .scalar_ne(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -152,6 +160,10 @@ where .scalar_lt(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -191,6 +203,10 @@ where .scalar_le(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -230,6 +246,10 @@ where .scalar_gt(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -269,6 +289,10 @@ where .scalar_ge(&*self.ciphertext.on_gpu(streams), rhs, streams); FheBool::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -316,6 +340,10 @@ where .scalar_max(&*self.ciphertext.on_gpu(streams), rhs, streams); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -363,6 +391,10 @@ where .scalar_min(&*self.ciphertext.on_gpu(streams), rhs, streams); Self::new(inner_result, cuda_key.tag.clone()) }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -413,6 +445,10 @@ 
where InternalServerKey::Cuda(_) => { panic!("Cuda devices do not support bitslice yet"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -504,6 +540,10 @@ macro_rules! generic_integer_impl_scalar_div_rem { <$concrete_type>::new(r, cuda_key.tag.clone()) ) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } } }) } @@ -680,6 +720,10 @@ macro_rules! define_scalar_rotate_shifts { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -709,6 +753,10 @@ macro_rules! define_scalar_rotate_shifts { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -738,6 +786,10 @@ macro_rules! define_scalar_rotate_shifts { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -767,6 +819,10 @@ macro_rules! define_scalar_rotate_shifts { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -793,6 +849,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_left_shift_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -819,6 +879,10 @@ macro_rules! 
define_scalar_rotate_shifts { .scalar_right_shift_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -845,6 +909,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_rotate_left_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -871,6 +939,10 @@ macro_rules! define_scalar_rotate_shifts { .scalar_rotate_right_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -987,6 +1059,13 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.on_hpu(device); + let rhs = u128::try_from(rhs).unwrap(); + + RadixCiphertext::Hpu(&*lhs + rhs) + } }) } }, @@ -1016,6 +1095,13 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.on_hpu(device); + let rhs = u128::try_from(rhs).unwrap(); + + RadixCiphertext::Hpu(&*lhs - rhs) + } }) } }, @@ -1045,6 +1131,13 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.on_hpu(device); + let rhs = u128::try_from(rhs).unwrap(); + + RadixCiphertext::Hpu(&*lhs * rhs) + } }) } }, @@ -1075,6 +1168,10 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1104,6 +1201,10 @@ macro_rules! 
define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1134,6 +1235,10 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1163,6 +1268,10 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1193,6 +1302,10 @@ macro_rules! define_scalar_ops { }); RadixCiphertext::Cuda(inner_result) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1238,6 +1351,10 @@ macro_rules! define_scalar_ops { RadixCiphertext::Cuda(result) }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1332,6 +1449,13 @@ macro_rules! define_scalar_ops { .scalar_add_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.as_hpu_mut(device); + let rhs = u128::try_from(rhs).unwrap(); + + *lhs += rhs; + } }) } }, @@ -1363,6 +1487,13 @@ macro_rules! define_scalar_ops { .scalar_sub_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.as_hpu_mut(device); + let rhs = u128::try_from(rhs).unwrap(); + + *lhs -= rhs; + } }) } }, @@ -1390,6 +1521,13 @@ macro_rules! 
define_scalar_ops { .scalar_mul_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(device) => { + let lhs = lhs.ciphertext.as_hpu_mut(device); + let rhs = u128::try_from(rhs).unwrap(); + + *lhs *= rhs; + } }) } }, @@ -1418,6 +1556,10 @@ macro_rules! define_scalar_ops { .scalar_bitand_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1445,6 +1587,10 @@ macro_rules! define_scalar_ops { .scalar_bitor_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1471,6 +1617,10 @@ macro_rules! define_scalar_ops { .scalar_bitxor_assign(lhs.ciphertext.as_gpu_mut(streams), rhs, streams); }) } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1495,7 +1645,11 @@ macro_rules! define_scalar_ops { let cuda_lhs = lhs.ciphertext.as_gpu_mut(streams); let cuda_result = cuda_key.pbs_key().scalar_div(&cuda_lhs, rhs, streams); *cuda_lhs = cuda_result; - }) + }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, @@ -1520,7 +1674,11 @@ macro_rules! 
define_scalar_ops { let cuda_lhs = lhs.ciphertext.as_gpu_mut(streams); let cuda_result = cuda_key.pbs_key().scalar_rem(&cuda_lhs, rhs, streams); *cuda_lhs = cuda_result; - }) + }), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + panic!("Hpu does not support this operation yet.") + } }) } }, diff --git a/tfhe/src/high_level_api/integers/unsigned/squashed_noise.rs b/tfhe/src/high_level_api/integers/unsigned/squashed_noise.rs index 8ec971ee9..d03d50131 100644 --- a/tfhe/src/high_level_api/integers/unsigned/squashed_noise.rs +++ b/tfhe/src/high_level_api/integers/unsigned/squashed_noise.rs @@ -109,8 +109,8 @@ impl InnerSquashedNoiseRadixCiphertext { (Self::Cpu(_), Device::Cpu) => { // Nothing to do, we already are on the correct device } - #[cfg(feature = "gpu")] - _ => panic!("Cuda devices do not support noise squashing yet"), + #[cfg(any(feature = "gpu", feature = "hpu"))] + _ => panic!("Cuda/Hpu devices do not support noise squashing yet"), } } @@ -194,6 +194,10 @@ impl SquashNoise for FheUint { InternalServerKey::Cuda(_) => Err(crate::error!( "Cuda devices do not support noise squashing yet" )), + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_device) => { + Err(crate::error!("Hpu devices do not support noise squashing")) + } }) } } diff --git a/tfhe/src/high_level_api/integers/unsigned/tests/cpu.rs b/tfhe/src/high_level_api/integers/unsigned/tests/cpu.rs index e4d06d3c9..6a41700bf 100644 --- a/tfhe/src/high_level_api/integers/unsigned/tests/cpu.rs +++ b/tfhe/src/high_level_api/integers/unsigned/tests/cpu.rs @@ -74,12 +74,48 @@ fn test_uint64_quickstart() { super::test_case_uint64_quickstart(&client_key); } +#[test] +fn test_uint32_arith() { + let client_key = setup_default_cpu(); + super::test_case_uint32_arith(&client_key); +} + +#[test] +fn test_uint32_arith_assign() { + let client_key = setup_default_cpu(); + super::test_case_uint32_arith_assign(&client_key); +} + +#[test] +fn test_uint32_scalar_arith() { + let client_key = 
setup_default_cpu(); + super::test_case_uint32_scalar_arith(&client_key); +} + +#[test] +fn test_uint32_scalar_arith_assign() { + let client_key = setup_default_cpu(); + super::test_case_uint32_scalar_arith_assign(&client_key); +} + +#[test] +fn test_uint32_clone() { + let client_key = setup_default_cpu(); + super::test_case_clone(&client_key); +} + #[test] fn test_uint8_compare() { let client_key = setup_default_cpu(); super::test_case_uint8_compare(&client_key); } +#[test] +fn test_uint8_compare_scalar() { + let client_key = setup_default_cpu(); + super::test_case_uint8_compare_scalar(&client_key); +} + #[test] fn test_uint32_shift() { let client_key = setup_default_cpu(); @@ -104,6 +140,18 @@ fn test_uint32_bitwise() { super::test_case_uint32_bitwise(&client_key); } +#[test] +fn test_uint32_bitwise_assign() { + let client_key = setup_default_cpu(); + super::test_case_uint32_bitwise_assign(&client_key); +} + +#[test] +fn test_uint32_scalar_bitwise() { + let client_key = setup_default_cpu(); + super::test_case_uint32_scalar_bitwise(&client_key); +} + #[test] fn test_uint32_rotate() { let client_key = setup_default_cpu(); diff --git a/tfhe/src/high_level_api/integers/unsigned/tests/gpu.rs b/tfhe/src/high_level_api/integers/unsigned/tests/gpu.rs index 0fc510370..16ee8238f 100644 --- a/tfhe/src/high_level_api/integers/unsigned/tests/gpu.rs +++ b/tfhe/src/high_level_api/integers/unsigned/tests/gpu.rs @@ -91,6 +91,18 @@ fn test_uint32_bitwise_gpu_multibit() { super::test_case_uint32_bitwise(&client_key); } +#[test] +fn test_uint32_scalar_bitwise_gpu() { + let client_key = setup_default_gpu(); + super::test_case_uint32_scalar_bitwise(&client_key); +} + +#[test] +fn test_uint32_scalar_bitwise_gpu_multibit() { + let client_key = setup_gpu(Some(PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS)); + super::test_case_uint32_scalar_bitwise(&client_key); +} + #[test] fn test_if_then_else_gpu() { let client_key = setup_default_gpu(); diff --git 
a/tfhe/src/high_level_api/integers/unsigned/tests/hpu.rs b/tfhe/src/high_level_api/integers/unsigned/tests/hpu.rs new file mode 100644 index 000000000..a802118e6 --- /dev/null +++ b/tfhe/src/high_level_api/integers/unsigned/tests/hpu.rs @@ -0,0 +1,84 @@ +use std::sync::LazyLock; +use tfhe_hpu_backend::prelude::HpuDevice; + +use crate::{set_server_key, ClientKey, CompressedServerKey, Config}; + +fn setup_hpu(hpu_device_cfg_path: &str) -> ClientKey { + let hpu_device = HpuDevice::from_config(hpu_device_cfg_path); + + let config = Config::from_hpu_device(&hpu_device); + + // Generate Keys + let cks = ClientKey::generate(config); + let csks = CompressedServerKey::new(&cks); + + set_server_key((hpu_device, csks)); + + cks +} + +static HPU_CLIENT_KEY: LazyLock = LazyLock::new(|| { + let config_name = std::env::var("HPU_CONFIG").unwrap(); + let backend_dir = std::env::var("HPU_BACKEND_DIR").unwrap(); + let config_path = format!("{backend_dir}/config_store/{config_name}/hpu_config.toml"); + + setup_hpu(&config_path) +}); + +fn setup_default_hpu() -> ClientKey { + HPU_CLIENT_KEY.clone() +} + +#[test] +fn test_uint8_quickstart_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint8_quickstart(&client_key); +} + +#[test] +fn test_uint64_quickstart_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint64_quickstart(&client_key); +} + +#[test] +fn test_uint8_compare_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint8_compare(&client_key); +} + +#[test] +fn test_uint32_bitwise() { + let client_key = setup_default_hpu(); + super::test_case_uint32_bitwise(&client_key); +} + +#[test] +fn test_uint32_arith_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint32_arith(&client_key); +} + +#[test] +fn test_uint32_arith_assign_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint32_arith_assign(&client_key); +} + +#[test] +fn test_uint32_scalar_arith_hpu() { + let client_key = setup_default_hpu(); 
+ super::test_case_uint32_scalar_arith(&client_key); +} + +#[test] +fn test_uint32_scalar_arith_assign_hpu() { + let client_key = setup_default_hpu(); + super::test_case_uint32_scalar_arith_assign(&client_key); +} + +#[test] +fn test_uint32_clone_hpu() { + let client_key = setup_default_hpu(); + super::test_case_clone(&client_key); +} diff --git a/tfhe/src/high_level_api/integers/unsigned/tests/mod.rs b/tfhe/src/high_level_api/integers/unsigned/tests/mod.rs index 42005cc30..6c5758f09 100644 --- a/tfhe/src/high_level_api/integers/unsigned/tests/mod.rs +++ b/tfhe/src/high_level_api/integers/unsigned/tests/mod.rs @@ -7,6 +7,8 @@ use rand::{thread_rng, Rng}; mod cpu; #[cfg(feature = "gpu")] pub(crate) mod gpu; +#[cfg(feature = "hpu")] +mod hpu; fn test_case_uint8_quickstart(client_key: &ClientKey) { let clear_a = 27u8; @@ -58,6 +60,43 @@ fn test_case_uint64_quickstart(cks: &ClientKey) { assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); } +fn test_case_clone(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); + + let a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + let b = FheUint32::try_encrypt(clear_b, cks).unwrap(); + + let c = &a + &b; + + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); + + // We expect clones to be full clones and not some incremented ref-count + let mut cloned_a = a.clone(); + + let decrypted: u32 = cloned_a.decrypt(cks); + assert_eq!(decrypted, clear_a); + let decrypted: u32 = b.decrypt(cks); + assert_eq!(decrypted, clear_b); + + let c = &cloned_a + &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); + + cloned_a += &b; + + let decrypted: u32 = cloned_a.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); + let decrypted: u32 = b.decrypt(cks); + assert_eq!(decrypted, clear_b); + let decrypted: u32 = a.decrypt(cks); + assert_eq!(decrypted, clear_a); + let decrypted: u32 = 
b.decrypt(cks); + assert_eq!(decrypted, clear_b); +} + fn test_case_uint8_trivial(client_key: &ClientKey) { let a = FheUint8::try_encrypt_trivial(234u8).unwrap(); @@ -65,6 +104,94 @@ fn test_case_uint8_trivial(client_key: &ClientKey) { assert_eq!(clear, 234); } +fn test_case_uint32_arith(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); + + let a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + let b = FheUint32::try_encrypt(clear_b, cks).unwrap(); + + let c = &a + &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); + + let c = &a - &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_sub(clear_b)); + + let c = &a * &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_mul(clear_b)); +} + +fn test_case_uint32_arith_assign(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let mut clear_a = rng.gen::(); + let clear_b = rng.gen::(); + + let mut a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + let b = FheUint32::try_encrypt(clear_b, cks).unwrap(); + + a += &b; + clear_a = clear_a.wrapping_add(clear_b); + let decrypted: u32 = a.decrypt(cks); + assert_eq!(decrypted, clear_a); + + a -= &b; + let decrypted: u32 = a.decrypt(cks); + clear_a = clear_a.wrapping_sub(clear_b); + assert_eq!(decrypted, clear_a); + + a *= &b; + let decrypted: u32 = a.decrypt(cks); + clear_a = clear_a.wrapping_mul(clear_b); + assert_eq!(decrypted, clear_a); +} + +fn test_case_uint32_scalar_arith(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); + + let a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + + let c = &a + clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_add(clear_b)); + + let c = &a - clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_sub(clear_b)); + + let c = &a * 
clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a.wrapping_mul(clear_b)); +} + +fn test_case_uint32_scalar_arith_assign(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let mut clear_a = rng.gen::(); + let clear_b = rng.gen::(); + + let mut a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + + a += clear_b; + clear_a = clear_a.wrapping_add(clear_b); + let decrypted: u32 = a.decrypt(cks); + assert_eq!(decrypted, clear_a); + + a -= clear_b; + let decrypted: u32 = a.decrypt(cks); + clear_a = clear_a.wrapping_sub(clear_b); + assert_eq!(decrypted, clear_a); + + a *= clear_b; + let decrypted: u32 = a.decrypt(cks); + clear_a = clear_a.wrapping_mul(clear_b); + assert_eq!(decrypted, clear_a); +} + fn test_case_uint256_trivial(client_key: &ClientKey) { let clear_a = U256::from(u128::MAX); let a = FheUint256::try_encrypt_trivial(clear_a).unwrap(); @@ -74,97 +201,101 @@ fn test_case_uint256_trivial(client_key: &ClientKey) { #[allow(clippy::eq_op)] fn test_case_uint8_compare(client_key: &ClientKey) { - let clear_a = 27u8; - let clear_b = 128u8; + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); let a = FheUint8::encrypt(clear_a, client_key); let b = FheUint8::encrypt(clear_b, client_key); - // Test comparing encrypted with encrypted - { - let result = &a.eq(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a == clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.eq(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a == clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.eq(&a); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a == clear_a; - assert_eq!(decrypted_result, clear_result); + let result = &a.eq(&a); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a == clear_a; + assert_eq!(decrypted_result, clear_result); - let 
result = &a.ne(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a != clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.ne(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a != clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.ne(&a); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a != clear_a; - assert_eq!(decrypted_result, clear_result); + let result = &a.ne(&a); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a != clear_a; + assert_eq!(decrypted_result, clear_result); - let result = &a.le(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a <= clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.le(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a <= clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.lt(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a < clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.lt(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a < clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.ge(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a >= clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.ge(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a >= clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.gt(&b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a > clear_b; - assert_eq!(decrypted_result, clear_result); - } + let result = &a.gt(&b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a > clear_b; + assert_eq!(decrypted_result, clear_result); +} - // Test 
comparing encrypted with clear - { - let result = &a.eq(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a == clear_b; - assert_eq!(decrypted_result, clear_result); +#[allow(clippy::eq_op)] +fn test_case_uint8_compare_scalar(client_key: &ClientKey) { + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); - let result = &a.eq(clear_a); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a == clear_a; - assert_eq!(decrypted_result, clear_result); + let a = FheUint8::encrypt(clear_a, client_key); - let result = &a.ne(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a != clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.eq(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a == clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.ne(clear_a); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a != clear_a; - assert_eq!(decrypted_result, clear_result); + let result = &a.eq(clear_a); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a == clear_a; + assert_eq!(decrypted_result, clear_result); - let result = &a.le(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a <= clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.ne(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a != clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.lt(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a < clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.ne(clear_a); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a != clear_a; + assert_eq!(decrypted_result, clear_result); - let result = 
&a.ge(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a >= clear_b; - assert_eq!(decrypted_result, clear_result); + let result = &a.le(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a <= clear_b; + assert_eq!(decrypted_result, clear_result); - let result = &a.gt(clear_b); - let decrypted_result = result.decrypt(client_key); - let clear_result = clear_a > clear_b; - assert_eq!(decrypted_result, clear_result); - } + let result = &a.lt(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a < clear_b; + assert_eq!(decrypted_result, clear_result); + + let result = &a.ge(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a >= clear_b; + assert_eq!(decrypted_result, clear_result); + + let result = &a.gt(clear_b); + let decrypted_result = result.decrypt(client_key); + let clear_result = clear_a > clear_b; + assert_eq!(decrypted_result, clear_result); } fn test_case_uint32_shift(cks: &ClientKey) { @@ -226,65 +357,72 @@ fn test_case_uint32_bitwise(cks: &ClientKey) { let a = FheUint32::try_encrypt(clear_a, cks).unwrap(); let b = FheUint32::try_encrypt(clear_b, cks).unwrap(); - // encrypted bitwise - { - let c = &a | &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a | clear_b); + let c = &a | &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a | clear_b); - let c = &a & &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a & clear_b); + let c = &a & &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a & clear_b); - let c = &a ^ &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a ^ clear_b); + let c = &a ^ &b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a ^ clear_b); +} - let mut c = a.clone(); - c |= &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a | 
clear_b); +fn test_case_uint32_bitwise_assign(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let mut clear_a = rng.gen::(); + let clear_b = rng.gen::(); - let mut c = a.clone(); - c &= &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a & clear_b); + let mut a = FheUint32::try_encrypt(clear_a, cks).unwrap(); + let b = FheUint32::try_encrypt(clear_b, cks).unwrap(); - let mut c = a.clone(); - c ^= &b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a ^ clear_b); - } + a &= &b; + clear_a &= clear_b; + let decrypted: u32 = a.decrypt(cks); + assert_eq!(decrypted, clear_a); - // clear bitwise - { - let c = &a | b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a | clear_b); + a |= &b; + let decrypted: u32 = a.decrypt(cks); + clear_a |= clear_b; + assert_eq!(decrypted, clear_a); - let c = &a & clear_b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a & clear_b); + a ^= &b; + let decrypted: u32 = a.decrypt(cks); + clear_a ^= clear_b; + assert_eq!(decrypted, clear_a); +} - let c = &a ^ clear_b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a ^ clear_b); +fn test_case_uint32_scalar_bitwise(cks: &ClientKey) { + let mut rng = rand::thread_rng(); + let clear_a = rng.gen::(); + let clear_b = rng.gen::(); - let mut c = a.clone(); - c |= clear_b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a | clear_b); + let a = FheUint32::try_encrypt(clear_a, cks).unwrap(); - let mut c = a.clone(); - c &= clear_b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a & clear_b); + let c = &a & clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a & clear_b); - let mut c = a; - c ^= clear_b; - let decrypted: u32 = c.decrypt(cks); - assert_eq!(decrypted, clear_a ^ clear_b); - } + let c = &a ^ clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a ^ clear_b); + + let mut c = a.clone(); + 
c |= clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a | clear_b); + + let mut c = a.clone(); + c &= clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a & clear_b); + + let mut c = a; + c ^= clear_b; + let decrypted: u32 = c.decrypt(cks); + assert_eq!(decrypted, clear_a ^ clear_b); } fn test_case_uint32_rotate(cks: &ClientKey) { diff --git a/tfhe/src/high_level_api/keys/mod.rs b/tfhe/src/high_level_api/keys/mod.rs index c803b4557..cd2e633cc 100644 --- a/tfhe/src/high_level_api/keys/mod.rs +++ b/tfhe/src/high_level_api/keys/mod.rs @@ -12,6 +12,8 @@ pub use key_switching_key::KeySwitchingKey; pub use public::{CompactPublicKey, CompressedCompactPublicKey, CompressedPublicKey, PublicKey}; #[cfg(feature = "gpu")] pub use server::CudaServerKey; +#[cfg(feature = "hpu")] +pub(in crate::high_level_api) use server::HpuTaggedDevice; pub(crate) use server::InternalServerKey; pub use server::{CompressedServerKey, ServerKey}; diff --git a/tfhe/src/high_level_api/keys/server.rs b/tfhe/src/high_level_api/keys/server.rs index 26bb0703e..c0dd431e8 100644 --- a/tfhe/src/high_level_api/keys/server.rs +++ b/tfhe/src/high_level_api/keys/server.rs @@ -1,3 +1,7 @@ +#[cfg(feature = "hpu")] +pub(in crate::high_level_api) use hpu::HpuTaggedDevice; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::HpuDevice; use tfhe_versionable::Versionize; use super::ClientKey; @@ -18,7 +22,7 @@ use crate::prelude::Tagged; use crate::shortint::MessageModulus; #[cfg(feature = "gpu")] use crate::GpuIndex; -use crate::Tag; +use crate::{Device, Tag}; use std::sync::Arc; /// Key of the server @@ -391,6 +395,20 @@ pub enum InternalServerKey { Cpu(ServerKey), #[cfg(feature = "gpu")] Cuda(CudaServerKey), + #[cfg(feature = "hpu")] + Hpu(HpuTaggedDevice), +} + +impl InternalServerKey { + pub(crate) fn device(&self) -> Device { + match self { + Self::Cpu(_) => Device::Cpu, + #[cfg(feature = "gpu")] + Self::Cuda(_) => Device::CudaGpu, + #[cfg(feature = 
"hpu")] + Self::Hpu(_) => Device::Hpu, + } + } } impl std::fmt::Debug for InternalServerKey { @@ -399,6 +417,8 @@ impl std::fmt::Debug for InternalServerKey { Self::Cpu(_) => f.debug_tuple("Cpu").finish(), #[cfg(feature = "gpu")] Self::Cuda(_) => f.debug_tuple("Cuda").finish(), + #[cfg(feature = "hpu")] + Self::Hpu(_) => f.debug_tuple("Hpu").finish(), } } } @@ -415,6 +435,29 @@ impl From for InternalServerKey { } } +#[cfg(feature = "hpu")] +mod hpu { + use super::*; + + pub struct HpuTaggedDevice { + // The device holds the keys (there can only be one set of keys) + // So we attach the tag to it instead of the key + pub(in crate::high_level_api) device: Box, + pub(in crate::high_level_api) tag: Tag, + } + + impl From<(HpuDevice, CompressedServerKey)> for InternalServerKey { + fn from((device, csks): (HpuDevice, CompressedServerKey)) -> Self { + let CompressedServerKey { integer_key, tag } = csks; + crate::integer::hpu::init_device(&device, integer_key.key).expect("Invalid key"); + Self::Hpu(HpuTaggedDevice { + device: Box::new(device), + tag, + }) + } + } +} + use crate::high_level_api::keys::inner::IntegerServerKeyConformanceParams; impl ParameterSetConformant for ServerKey { diff --git a/tfhe/src/high_level_api/mod.rs b/tfhe/src/high_level_api/mod.rs index 04413acdd..4e4c60f37 100644 --- a/tfhe/src/high_level_api/mod.rs +++ b/tfhe/src/high_level_api/mod.rs @@ -164,6 +164,8 @@ pub enum Device { Cpu, #[cfg(feature = "gpu")] CudaGpu, + #[cfg(feature = "hpu")] + Hpu, } #[derive(FromRepr, Copy, Clone, PartialEq, Eq, Debug)] diff --git a/tfhe/src/high_level_api/prelude.rs b/tfhe/src/high_level_api/prelude.rs index f4fa28772..9ad9c1dea 100644 --- a/tfhe/src/high_level_api/prelude.rs +++ b/tfhe/src/high_level_api/prelude.rs @@ -8,10 +8,12 @@ //! 
``` pub use crate::high_level_api::traits::{ BitSlice, CiphertextList, DivRem, FheDecrypt, FheEncrypt, FheEq, FheKeyswitch, FheMax, FheMin, - FheOrd, FheTrivialEncrypt, FheTryEncrypt, FheTryTrivialEncrypt, IfThenElse, OverflowingAdd, - OverflowingMul, OverflowingSub, RotateLeft, RotateLeftAssign, RotateRight, RotateRightAssign, - ScalarIfThenElse, SquashNoise, Tagged, + FheOrd, FheTrivialEncrypt, FheTryEncrypt, FheTryTrivialEncrypt, FheWait, IfThenElse, + OverflowingAdd, OverflowingMul, OverflowingSub, RotateLeft, RotateLeftAssign, RotateRight, + RotateRightAssign, ScalarIfThenElse, SquashNoise, Tagged, }; +#[cfg(feature = "hpu")] +pub use crate::high_level_api::traits::{FheHpu, HpuHandle}; pub use crate::conformance::ParameterSetConformant; pub use crate::core_crypto::prelude::{CastFrom, CastInto}; diff --git a/tfhe/src/high_level_api/strings/ascii/comp.rs b/tfhe/src/high_level_api/strings/ascii/comp.rs index 3fdf8e8ab..643821fbb 100644 --- a/tfhe/src/high_level_api/strings/ascii/comp.rs +++ b/tfhe/src/high_level_api/strings/ascii/comp.rs @@ -18,6 +18,10 @@ impl FheEq<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings eq"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -33,6 +37,10 @@ impl FheEq<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ne"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -48,6 +56,10 @@ impl FheEq<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings eq"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -61,6 +73,10 @@ impl FheEq<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ne"); } + #[cfg(feature = 
"hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -78,6 +94,10 @@ impl FheOrd<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings lt"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -93,6 +113,10 @@ impl FheOrd<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings le"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -108,6 +132,10 @@ impl FheOrd<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings gt"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -123,6 +151,10 @@ impl FheOrd<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ge"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -138,6 +170,10 @@ impl FheOrd<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings lt"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -151,6 +187,10 @@ impl FheOrd<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings le"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -164,6 +204,10 @@ impl FheOrd<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings gt"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } @@ -177,6 +221,10 @@ impl FheOrd<&ClearString> for 
FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ge"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -216,6 +264,10 @@ impl FheEqIgnoreCase for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings eq_ignore_case"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } @@ -255,6 +307,10 @@ impl FheEqIgnoreCase for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings eq_ignore_case"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("Hpu does not support this operation yet.") + } }) } } diff --git a/tfhe/src/high_level_api/strings/ascii/contains.rs b/tfhe/src/high_level_api/strings/ascii/contains.rs index 02870865d..c89f2aff3 100644 --- a/tfhe/src/high_level_api/strings/ascii/contains.rs +++ b/tfhe/src/high_level_api/strings/ascii/contains.rs @@ -40,6 +40,10 @@ impl FheStringMatching<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings contains"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support contains"); + } }) } @@ -77,6 +81,10 @@ impl FheStringMatching<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings starts_with"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support starts_with"); + } }) } @@ -114,6 +122,10 @@ impl FheStringMatching<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ends_with"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support ends_with"); + } }) } } @@ -131,6 +143,10 @@ impl FheStringMatching<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings contains"); } + 
#[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support contains"); + } }) } @@ -146,6 +162,10 @@ impl FheStringMatching<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings starts_with"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support starts_with"); + } }) } @@ -161,6 +181,10 @@ impl FheStringMatching<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings ends_with"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support ends_with"); + } }) } } diff --git a/tfhe/src/high_level_api/strings/ascii/find.rs b/tfhe/src/high_level_api/strings/ascii/find.rs index adf425b6b..57348f248 100644 --- a/tfhe/src/high_level_api/strings/ascii/find.rs +++ b/tfhe/src/high_level_api/strings/ascii/find.rs @@ -46,6 +46,10 @@ impl FheStringFind<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings find"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings find"); + } }) } @@ -89,6 +93,10 @@ impl FheStringFind<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings rfind"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings rfind"); + } }) } } @@ -132,6 +140,10 @@ impl FheStringFind<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings find"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings find"); + } }) } @@ -173,6 +185,10 @@ impl FheStringFind<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings rfind"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings rfind"); + } }) } } diff --git 
a/tfhe/src/high_level_api/strings/ascii/mod.rs b/tfhe/src/high_level_api/strings/ascii/mod.rs index 6da06db2d..54863612a 100644 --- a/tfhe/src/high_level_api/strings/ascii/mod.rs +++ b/tfhe/src/high_level_api/strings/ascii/mod.rs @@ -377,6 +377,8 @@ impl<'a> FheTryTrivialEncrypt> for FheAsciiString { } #[cfg(feature = "gpu")] Some(InternalServerKey::Cuda(_)) => Err(crate::error!("CUDA does not support string")), + #[cfg(feature = "hpu")] + Some(InternalServerKey::Hpu(_)) => Err(crate::error!("Hpu does not support string")), None => Err(UninitializedServerKey.into()), }) } diff --git a/tfhe/src/high_level_api/strings/ascii/no_pattern.rs b/tfhe/src/high_level_api/strings/ascii/no_pattern.rs index e898a2465..f9dd468b3 100644 --- a/tfhe/src/high_level_api/strings/ascii/no_pattern.rs +++ b/tfhe/src/high_level_api/strings/ascii/no_pattern.rs @@ -105,6 +105,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings len"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings len"); + } }) } @@ -148,7 +152,11 @@ impl FheAsciiString { } #[cfg(feature = "gpu")] InternalServerKey::Cuda(_) => { - panic!("gpu does not support strings len"); + panic!("gpu does not support strings is_empty"); + } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings is_empty"); } }) } @@ -180,6 +188,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings to_lowercase"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings to_lowercase"); + } }) } @@ -210,6 +222,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings to_uppercase"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings to_uppercase"); + } }) } @@ -246,6 +262,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does 
not support strings concatenating"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings concatenating"); + } }) } } @@ -304,6 +324,10 @@ impl FheStringRepeat for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings repeat"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings repeat"); + } }) } } @@ -351,6 +375,10 @@ impl FheStringRepeat<(FheUint16, u16)> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings repeat"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings repeat"); + } }) } } diff --git a/tfhe/src/high_level_api/strings/ascii/replace.rs b/tfhe/src/high_level_api/strings/ascii/replace.rs index acbabc9ad..7f2ad8918 100644 --- a/tfhe/src/high_level_api/strings/ascii/replace.rs +++ b/tfhe/src/high_level_api/strings/ascii/replace.rs @@ -45,6 +45,10 @@ impl FheStringReplace<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replace"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replace"); + } }) } } @@ -87,6 +91,10 @@ impl FheStringReplace<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replace"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replace"); + } }) } } @@ -125,6 +133,10 @@ impl FheStringReplaceN<&Self, u16> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replacen"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replacen"); + } }) } } @@ -145,6 +157,10 @@ impl FheStringReplaceN<&Self, (FheUint16, u16)> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replacen"); } + #[cfg(feature = "hpu")] + 
InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replacen"); + } }) } } @@ -183,6 +199,10 @@ impl FheStringReplaceN<&ClearString, u16> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replacen"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replacen"); + } }) } } @@ -203,6 +223,11 @@ impl FheStringReplaceN<&ClearString, (FheUint16, u16)> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strings replacen"); } + + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings replacen"); + } }) } } diff --git a/tfhe/src/high_level_api/strings/ascii/strip.rs b/tfhe/src/high_level_api/strings/ascii/strip.rs index 53047f123..e97a4b4e0 100644 --- a/tfhe/src/high_level_api/strings/ascii/strip.rs +++ b/tfhe/src/high_level_api/strings/ascii/strip.rs @@ -49,6 +49,10 @@ impl FheStringStrip<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strip_prefix"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings strip_prefix"); + } }) } @@ -95,6 +99,10 @@ impl FheStringStrip<&Self> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strip_suffix"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings string_suffix"); + } }) } } @@ -143,6 +151,10 @@ impl FheStringStrip<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strip_prefix"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support strings strip_prefix"); + } }) } @@ -189,6 +201,10 @@ impl FheStringStrip<&ClearString> for FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support strip_suffix"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support 
strings strip_suffix"); + } }) } } diff --git a/tfhe/src/high_level_api/strings/ascii/trim.rs b/tfhe/src/high_level_api/strings/ascii/trim.rs index a0dd59b57..84909c64d 100644 --- a/tfhe/src/high_level_api/strings/ascii/trim.rs +++ b/tfhe/src/high_level_api/strings/ascii/trim.rs @@ -32,6 +32,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support trim_start"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support trim_start"); + } }) } @@ -64,6 +68,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support trim_end"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support trim_end"); + } }) } @@ -96,6 +104,10 @@ impl FheAsciiString { InternalServerKey::Cuda(_) => { panic!("gpu does not support trim"); } + #[cfg(feature = "hpu")] + InternalServerKey::Hpu(_) => { + panic!("hpu does not support trim"); + } }) } } diff --git a/tfhe/src/high_level_api/tests/tags_on_entities.rs b/tfhe/src/high_level_api/tests/tags_on_entities.rs index 24a75031e..ff46180e9 100644 --- a/tfhe/src/high_level_api/tests/tags_on_entities.rs +++ b/tfhe/src/high_level_api/tests/tags_on_entities.rs @@ -215,6 +215,10 @@ fn test_tag_propagation( set_server_key(sks); } + #[cfg(feature = "hpu")] + Device::Hpu => { + todo!() + } } // Check encrypting regular ct with client key diff --git a/tfhe/src/high_level_api/traits.rs b/tfhe/src/high_level_api/traits.rs index 6dfa69385..3fbf577ce 100644 --- a/tfhe/src/high_level_api/traits.rs +++ b/tfhe/src/high_level_api/traits.rs @@ -219,3 +219,27 @@ pub trait SizeOnGpu { pub trait AddAssignSizeOnGpu { fn get_add_assign_size_on_gpu(&self, amount: Rhs) -> u64; } + +/// Trait used to have a generic way of waiting Hw accelerator result +pub trait FheWait { + fn wait(&self); +} + +/// Struct used to have a generic way of starting custom Hpu IOp +#[cfg(feature = "hpu")] +pub struct HpuHandle { + pub native: Vec, + pub boolean: Vec, 
+ pub imm: Vec, +} + +#[cfg(feature = "hpu")] +pub trait FheHpu +where + Self: Sized, +{ + fn iop_exec( + iop: &tfhe_hpu_backend::prelude::hpu_asm::AsmIOpcode, + src: HpuHandle<&Self>, + ) -> HpuHandle; +} diff --git a/tfhe/src/integer/bigint/static_unsigned.rs b/tfhe/src/integer/bigint/static_unsigned.rs index 927f6f8e8..b060ec21f 100644 --- a/tfhe/src/integer/bigint/static_unsigned.rs +++ b/tfhe/src/integer/bigint/static_unsigned.rs @@ -484,3 +484,15 @@ impl Numeric for StaticUnsignedBigInt { impl UnsignedNumeric for StaticUnsignedBigInt { type NumericSignedType = super::static_signed::StaticSignedBigInt; } + +impl TryFrom> for u128 { + type Error = &'static str; + + fn try_from(value: StaticUnsignedBigInt) -> Result { + if N > 2 && value.0[2..].iter().any(|e| *e != 0) { + Err("Value is too big to be converted to u128") + } else { + Ok(Self::cast_from(value)) + } + } +} diff --git a/tfhe/src/integer/hpu/ciphertext/mod.rs b/tfhe/src/integer/hpu/ciphertext/mod.rs new file mode 100644 index 000000000..2d96d6783 --- /dev/null +++ b/tfhe/src/integer/hpu/ciphertext/mod.rs @@ -0,0 +1,240 @@ +use hpu_asm::iop::*; +use tfhe_hpu_backend::prelude::*; + +use crate::core_crypto::prelude::{CreateFrom, LweCiphertextOwned}; +use crate::integer::{BooleanBlock, RadixCiphertext}; +use crate::shortint::ciphertext::{Degree, NoiseLevel}; +use crate::shortint::parameters::KeySwitch32PBSParameters; +use crate::shortint::{AtomicPatternKind, Ciphertext}; + +/// Simple wrapper over HpuVar +/// Add method to convert from/to cpu radix ciphertext +#[derive(Clone)] +pub struct HpuRadixCiphertext(pub(crate) HpuVarWrapped); + +impl HpuRadixCiphertext { + fn new(hpu_var: HpuVarWrapped) -> Self { + Self(hpu_var) + } + + /// Create a Hpu Radix ciphertext based on a Cpu one. 
+ /// + /// No transfer with FPGA will occur until an operation on the HpuRadixCiphertext is requested + pub fn from_radix_ciphertext(cpu_ct: &RadixCiphertext, device: &HpuDevice) -> Self { + let params = device.params().clone(); + + let hpu_ct = cpu_ct + .blocks + .iter() + .map(|blk| HpuLweCiphertextOwned::create_from(blk.ct.as_view(), params.clone())) + .collect::>(); + + Self(device.new_var_from(hpu_ct, VarMode::Native)) + } + + /// Create a Cpu radix ciphertext copy from a Hpu one. + pub fn to_radix_ciphertext(&self) -> RadixCiphertext { + // NB: We clone the inner part of HpuRadixCiphertext but it is not costly since + // it's wrapped inside an Arc + let hpu_ct = self.0.clone().into_ct(); + let cpu_ct = hpu_ct + .into_iter() + .map(|ct| { + let pbs_p = KeySwitch32PBSParameters::from(ct.params()); + let cpu_ct = LweCiphertextOwned::from(ct.as_view()); + // Hpu output clean ciphertext without carry + Ciphertext::new( + cpu_ct, + Degree::new(pbs_p.message_modulus.0 - 1), + NoiseLevel::NOMINAL, + pbs_p.message_modulus, + pbs_p.carry_modulus, + AtomicPatternKind::KeySwitch32, + ) + }) + .collect::>(); + RadixCiphertext { blocks: cpu_ct } + } + + /// Create a Hpu boolean ciphertext based on a Cpu one. 
+ /// + /// No transfer with FPGA will occur until an operation on the HpuRadixCiphertext is requested + pub fn from_boolean_ciphertext(cpu_ct: &BooleanBlock, device: &HpuDevice) -> Self { + let params = device.params().clone(); + + let hpu_ct = vec![HpuLweCiphertextOwned::create_from( + cpu_ct.0.ct.as_view(), + params, + )]; + Self(device.new_var_from(hpu_ct, VarMode::Bool)) + } + + /// Create a Cpu boolean block from a Hpu one + /// + /// # Panics + /// + /// This function panic if the underlying RadixCiphertext does not encrypt 0 or 1 + pub fn to_boolean_block(&self) -> BooleanBlock { + assert!( + self.0.is_boolean(), + "Error try to extract boolean value from invalid ciphertext" + ); + let mut boolean_ct = self + .to_radix_ciphertext() + .blocks + .into_iter() + .next() + .unwrap(); + boolean_ct.degree = Degree::new(1); + BooleanBlock::new_unchecked(boolean_ct) + } +} + +// Use to easily build HpuCmd exec request directly on HpuRadixCiphertext +impl std::ops::Deref for HpuRadixCiphertext { + type Target = HpuVarWrapped; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl HpuRadixCiphertext { + pub fn exec( + proto: &IOpProto, + opcode: IOpcode, + rhs_ct: &[Self], + rhs_imm: &[HpuImm], + ) -> Vec { + let rhs_var = rhs_ct.iter().map(|x| x.0.clone()).collect::>(); + let res_var = HpuCmd::exec(proto, opcode, &rhs_var, rhs_imm); + res_var.into_iter().map(Self::new).collect::>() + } + + pub fn exec_assign(proto: &IOpProto, opcode: IOpcode, rhs_ct: &[Self], rhs_imm: &[HpuImm]) { + let rhs_var = rhs_ct.iter().map(|x| x.0.clone()).collect::>(); + HpuCmd::exec_assign(proto, opcode, &rhs_var, rhs_imm) + } +} + +// Below we map common Hpu operation to std::ops rust trait ------------------- +#[macro_export] +/// Easily map an Hpu operation to std::ops rust trait +macro_rules! map_ct_ct { + ($hpu_op: ident -> $rust_op: literal) => { + ::paste::paste! 
{ + impl std::ops::[<$rust_op:camel>] for HpuRadixCiphertext { + type Output = Self; + + fn [<$rust_op:lower>](self, rhs: Self) -> Self::Output { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + let res = HpuCmd::exec(proto, opcode, &[self.0, rhs.0], &[]); + Self::Output::new(res[0].clone()) + } + } + + impl<'a> std::ops::[<$rust_op:camel>] for &'a HpuRadixCiphertext { + type Output = HpuRadixCiphertext; + + fn [<$rust_op:lower>](self, rhs: Self) -> Self::Output { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + let res = HpuCmd::exec(proto, opcode, &[self.0.clone(), rhs.0.clone()], &[]); + Self::Output::new(res[0].clone()) + } + } + + + impl std::ops::[<$rust_op:camel Assign>] for HpuRadixCiphertext { + fn [<$rust_op:lower _assign>](&mut self, rhs: Self) { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + HpuCmd::exec_assign(proto, opcode, &[self.0.clone(), rhs.0], &[]) + } + } + + impl<'a> std::ops::[<$rust_op:camel Assign>]<&'a Self> for HpuRadixCiphertext { + fn [<$rust_op:lower _assign>](&mut self, rhs: &'a Self) { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + HpuCmd::exec_assign(proto, opcode, &[self.0.clone(), rhs.0.clone()], &[]) + } + } + } + }; +} +macro_rules! map_ct_scalar { + ($hpu_op: ident -> $rust_op: literal) => { + ::paste::paste! 
{ + impl std::ops::[<$rust_op:camel>] for HpuRadixCiphertext { + type Output = Self; + + fn [<$rust_op:lower>](self, rhs: u128) -> Self::Output { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + let res = HpuCmd::exec(proto, opcode, &[self.0], &[rhs]); + Self::Output::new(res[0].clone()) + } + } + + impl<'a> std::ops::[<$rust_op:camel>] for &'a HpuRadixCiphertext { + type Output = HpuRadixCiphertext; + + fn [<$rust_op:lower>](self, rhs: u128) -> Self::Output { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + let res = HpuCmd::exec(proto, opcode, &[self.0.clone()], &[rhs]); + Self::Output::new(res[0].clone()) + } + } + + impl std::ops::[<$rust_op:camel Assign>] for HpuRadixCiphertext { + fn [<$rust_op:lower _assign>](&mut self, rhs: u128) { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + HpuCmd::exec_assign(proto, opcode, &[self.0.clone()], &[rhs]) + } + } + } + }; +} + +macro_rules! map_scalar_ct { + ($hpu_op: ident -> $rust_op: literal) => { + ::paste::paste! 
{ + impl std::ops::[<$rust_op:camel>] for u128 { + type Output = HpuRadixCiphertext; + + fn [<$rust_op:lower>](self, rhs: HpuRadixCiphertext) -> Self::Output { + let opcode = $hpu_op.opcode(); + let proto = &$hpu_op.format().expect("Bind to std::ops a unspecified IOP").proto; + + let res = HpuCmd::exec(proto, opcode, &[rhs.0], &[self]); + Self::Output::new(res[0].clone()) + } + } + } + }; +} + +map_ct_ct!(IOP_ADD -> "Add"); +map_ct_ct!(IOP_SUB -> "Sub"); +map_ct_ct!(IOP_MUL -> "Mul"); +map_ct_ct!(IOP_BW_AND -> "BitAnd"); +map_ct_ct!(IOP_BW_OR -> "BitOr"); +map_ct_ct!(IOP_BW_XOR -> "BitXor"); + +map_ct_scalar!(IOP_ADDS -> "Add"); +map_scalar_ct!(IOP_ADDS -> "Add"); +map_ct_scalar!(IOP_SUBS -> "Sub"); +map_scalar_ct!(IOP_SSUB -> "Sub"); +map_ct_scalar!(IOP_MULS -> "Mul"); +map_scalar_ct!(IOP_MULS -> "Mul"); diff --git a/tfhe/src/integer/hpu/mod.rs b/tfhe/src/integer/hpu/mod.rs new file mode 100644 index 000000000..eb02fcdf0 --- /dev/null +++ b/tfhe/src/integer/hpu/mod.rs @@ -0,0 +1,78 @@ +use crate::core_crypto::prelude::CreateFrom; +use crate::shortint::parameters::KeySwitch32PBSParameters; +use tfhe_hpu_backend::prelude::*; + +use super::CompressedServerKey; +pub mod ciphertext; + +/// Utility function for HpuDevice initialisation +/// Init from Compressed material +pub fn init_device(device: &HpuDevice, server_key: CompressedServerKey) -> crate::Result<()> { + let params = device.params().clone(); + let tfhe_params = KeySwitch32PBSParameters::from(¶ms); + + let ap_key = match server_key.key.compressed_ap_server_key { + crate::shortint::atomic_pattern::compressed::CompressedAtomicPatternServerKey::Standard(_) => { + Err("Hpu not support Standard keys. Required a KeySwitch32 keys") + } + crate::shortint::atomic_pattern::compressed::CompressedAtomicPatternServerKey::KeySwitch32(keys) => Ok(keys), + }?; + + // Extract and convert bsk + let bsk = match ap_key.bootstrapping_key() { + crate::shortint::server_key::ShortintCompressedBootstrappingKey::Classic { + bsk, .. 
+ } => { + let bsk = bsk + .clone() // TODO fix API this shouldn't be needed + .decompress_into_lwe_bootstrap_key(); + + // Check that given key is compliant with current device configuration + if tfhe_params.lwe_dimension != bsk.input_lwe_dimension() { + return Err("BootstrappingKey has incompatible input_lwe_dimension".into()); + } + if tfhe_params.glwe_dimension.to_glwe_size() != bsk.glwe_size() { + return Err("BootstrappingKey has incompatible glwe_size".into()); + } + if tfhe_params.polynomial_size != bsk.polynomial_size() { + return Err("BootstrappingKey has incompatible polynomial size".into()); + } + if tfhe_params.pbs_base_log != bsk.decomposition_base_log() { + return Err("BootstrappingKey has incompatible decomposition_base_log".into()); + } + if tfhe_params.pbs_level != bsk.decomposition_level_count() { + return Err("BootstrappingKey has incompatible decomposition_level_count".into()); + } + if tfhe_params.ciphertext_modulus != bsk.ciphertext_modulus() { + return Err("BootstrappingKey has incompatible ciphertext_modulus".into()); + } + Ok(bsk) + } + crate::shortint::server_key::ShortintCompressedBootstrappingKey::MultiBit { .. } => { + Err("Hpu currently not support multibit. 
Required a Classic BSK") + } + }?; + let hpu_bsk = HpuLweBootstrapKeyOwned::create_from(bsk.as_view(), params.clone()); + // Extract and convert ksk + let ksk = ap_key + .key_switching_key() + .clone() // TODO fix API this shouldn't be needed + .decompress_into_lwe_keyswitch_key(); + // Check that given key is compliant with current device configuration + if tfhe_params.ks_base_log != ksk.decomposition_base_log() { + return Err("KeyswitchingKey has incompatible decomposition_base_log".into()); + } + if tfhe_params.ks_level != ksk.decomposition_level_count() { + return Err("KeyswitchingKey has incompatible decomposition_level_count".into()); + } + let hpu_ksk = HpuLweKeyswitchKeyOwned::create_from(ksk.as_view(), params); + + // Upload them on Hpu and configure internal Fw/Lut + device.init( + hpu_bsk, + hpu_ksk, + crate::core_crypto::hpu::glwe_lookuptable::create_hpu_lookuptable, + ); + + Ok(()) +} diff --git a/tfhe/src/integer/keycache.rs b/tfhe/src/integer/keycache.rs index f29d7fdf6..53ebb315f 100644 --- a/tfhe/src/integer/keycache.rs +++ b/tfhe/src/integer/keycache.rs @@ -5,10 +5,25 @@ use crate::shortint::atomic_pattern::AtomicPatternParameters; #[cfg(feature = "experimental")] use crate::shortint::{PBSParameters, WopbsParameters}; +#[cfg(feature = "hpu")] +use std::sync::{Mutex, OnceLock}; +#[cfg(feature = "hpu")] +use tfhe_hpu_backend::prelude::*; + #[derive(Default)] -pub struct IntegerKeyCache; +pub struct IntegerKeyCache { + #[cfg(feature = "hpu")] + hpu_device: OnceLock>, +} impl IntegerKeyCache { + pub const fn new() -> Self { + Self { + #[cfg(feature = "hpu")] + hpu_device: OnceLock::new(), + } + } + pub fn get_from_params

(&self, params: P, key_kind: IntegerKeyKind) -> (ClientKey, ServerKey) where P: Into, @@ -38,6 +53,45 @@ impl IntegerKeyCache { (client_key, server_key) } + + #[cfg(feature = "hpu")] + pub fn get_hpu_device

(&self, param: P) -> &Mutex + where + P: Into + crate::keycache::NamedParam + Clone, + { + let hpu_device = self.hpu_device.get_or_init(|| { + // Instantiate HpuDevice -------------------------------------------------- + let hpu_device = { + let config_file = ShellString::new( + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(), + ); + HpuDevice::from_config(&config_file.expand()) + }; + // Check compatibility with key + let hpu_pbs_params = + crate::shortint::parameters::KeySwitch32PBSParameters::from(hpu_device.params()); + assert_eq!( + param.clone().into(), + crate::shortint::AtomicPatternParameters::from(hpu_pbs_params), + "Error: Current Hpu device isn't compatible with {}", + param.name() + ); + + // Get current client key + let (cks, _) = self.get_from_params(param, IntegerKeyKind::Radix); + // Generate associated compressed ServerKey + let sks_compressed = super::CompressedServerKey::new_radix_compressed_server_key(&cks); + + // Init Hpu device with server key and firmware + crate::integer::hpu::init_device(&hpu_device, sks_compressed).expect("Invalid key"); + Mutex::new(hpu_device) + }); + + // Sanitize memory to prevent side-effect between tests + hpu_device.lock().unwrap().mem_sanitizer(); + + hpu_device + } } #[derive(Default)] @@ -67,6 +121,6 @@ impl WopbsKeyCache { } } -pub static KEY_CACHE: IntegerKeyCache = IntegerKeyCache; +pub static KEY_CACHE: IntegerKeyCache = IntegerKeyCache::new(); #[cfg(feature = "experimental")] pub static KEY_CACHE_WOPBS: WopbsKeyCache = WopbsKeyCache; diff --git a/tfhe/src/integer/mod.rs b/tfhe/src/integer/mod.rs index 708246512..9c1fa54c1 100755 --- a/tfhe/src/integer/mod.rs +++ b/tfhe/src/integer/mod.rs @@ -73,6 +73,9 @@ pub mod wopbs; #[cfg(feature = "gpu")] pub mod gpu; +#[cfg(feature = "hpu")] +pub mod hpu; + #[cfg(feature = "zk-pok")] pub use ciphertext::ProvenCompactCiphertextList; diff --git a/tfhe/src/lib.rs b/tfhe/src/lib.rs index 30876dc55..0319b0fe0 100644 --- a/tfhe/src/lib.rs +++ 
b/tfhe/src/lib.rs @@ -158,3 +158,7 @@ pub use error::{Error, ErrorKind}; pub type Result = std::result::Result; pub use tfhe_versionable::{Unversionize, Versionize}; + +/// Export tfhe-hpu-backend for external use +#[cfg(feature = "hpu")] +pub use tfhe_hpu_backend; diff --git a/tfhe/src/shortint/engine/mod.rs b/tfhe/src/shortint/engine/mod.rs index 3b5db3e29..27dfe2597 100644 --- a/tfhe/src/shortint/engine/mod.rs +++ b/tfhe/src/shortint/engine/mod.rs @@ -94,8 +94,11 @@ where assert_eq!(accumulator.polynomial_size(), polynomial_size); assert_eq!(accumulator.glwe_size(), glwe_size); + // NB: Following path will not go `power_of_two_scaling_to_native_torus` + // Thus keep value MSB aligned without considering real delta + // i.e force modulus to be native let output_encoding = ShortintEncoding { - ciphertext_modulus: accumulator.ciphertext_modulus(), + ciphertext_modulus: CiphertextModulus::new_native(), message_modulus: output_message_modulus, carry_modulus: output_carry_modulus, padding_bit: PaddingBit::Yes, diff --git a/tfhe/src/shortint/keycache.rs b/tfhe/src/shortint/keycache.rs index ce184a0c0..9d7d25b90 100644 --- a/tfhe/src/shortint/keycache.rs +++ b/tfhe/src/shortint/keycache.rs @@ -393,6 +393,7 @@ named_params_impl!( ShortintParameterSet => LEGACY_WOPBS_ONLY_2_BLOCKS_PARAM_MESSAGE_7_CARRY_0_KS_PBS, LEGACY_WOPBS_ONLY_2_BLOCKS_PARAM_MESSAGE_7_CARRY_1_KS_PBS, LEGACY_WOPBS_ONLY_2_BLOCKS_PARAM_MESSAGE_8_CARRY_0_KS_PBS, + // Coverage #[cfg(tarpaulin)] COVERAGE_PARAM_MESSAGE_2_CARRY_2_KS_PBS, @@ -406,6 +407,11 @@ named_params_impl!( ShortintParameterSet => COVERAGE_PARAM_MESSAGE_2_CARRY_2_COMPACT_PK_PBS_KS_GAUSSIAN_2M64, #[cfg(tarpaulin)] COVERAGE_PARAM_MESSAGE_2_CARRY_2_COMPACT_PK_KS_PBS_GAUSSIAN_2M64, + + #[cfg(feature ="hpu")] + V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64, + #[cfg(feature ="hpu")] + V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64, ); impl NamedParam for ClassicPBSParameters { diff --git a/tfhe/src/shortint/parameters/hpu.rs 
b/tfhe/src/shortint/parameters/hpu.rs new file mode 100644 index 000000000..f7ebd8608 --- /dev/null +++ b/tfhe/src/shortint/parameters/hpu.rs @@ -0,0 +1,60 @@ +//! Implement bridge between native tfhe Parameters and Hpu one +use tfhe_hpu_backend::prelude::*; + +use crate::shortint::parameters::{ + CiphertextModulus32, DynamicDistribution, KeySwitch32PBSParameters, +}; +use crate::shortint::prelude::*; + +#[allow(clippy::fallible_impl_from)] +impl From<&HpuParameters> for KeySwitch32PBSParameters { + fn from(value: &HpuParameters) -> Self { + let lwe_noise_distribution = match value.pbs_params.lwe_noise_distribution { + HpuNoiseDistributionInput::GaussianStdDev(std_dev) => { + DynamicDistribution::new_gaussian_from_std_dev(StandardDev(std_dev)) + } + HpuNoiseDistributionInput::TUniformBound(log2_bound) => { + DynamicDistribution::new_t_uniform(log2_bound) + } + }; + let glwe_noise_distribution = match value.pbs_params.glwe_noise_distribution { + HpuNoiseDistributionInput::GaussianStdDev(std_dev) => { + DynamicDistribution::new_gaussian_from_std_dev(StandardDev(std_dev)) + } + HpuNoiseDistributionInput::TUniformBound(log2_bound) => { + DynamicDistribution::new_t_uniform(log2_bound) + } + }; + + Self { + lwe_dimension: LweDimension(value.pbs_params.lwe_dimension), + glwe_dimension: GlweDimension(value.pbs_params.glwe_dimension), + polynomial_size: PolynomialSize(value.pbs_params.polynomial_size), + lwe_noise_distribution, + glwe_noise_distribution, + pbs_base_log: DecompositionBaseLog(value.pbs_params.pbs_base_log), + pbs_level: DecompositionLevelCount(value.pbs_params.pbs_level), + ks_base_log: DecompositionBaseLog(value.pbs_params.ks_base_log), + ks_level: DecompositionLevelCount(value.pbs_params.ks_level), + message_modulus: MessageModulus(1 << value.pbs_params.message_width), + carry_modulus: CarryModulus(1 << value.pbs_params.carry_width), + max_noise_level: MaxNoiseLevel::new(5), + log2_p_fail: -64.0, // TODO fixme + post_keyswitch_ciphertext_modulus: 
CiphertextModulus32::try_new_power_of_2( + value.ks_params.width, + ) + .unwrap(), + ciphertext_modulus: CiphertextModulus::try_new_power_of_2( + value.pbs_params.ciphertext_width, + ) + .unwrap(), + modulus_switch_noise_reduction_params: None, + } + } +} + +impl From for KeySwitch32PBSParameters { + fn from(value: HpuParameters) -> Self { + Self::from(&value) + } +} diff --git a/tfhe/src/shortint/parameters/mod.rs b/tfhe/src/shortint/parameters/mod.rs index e53b92de2..737884645 100644 --- a/tfhe/src/shortint/parameters/mod.rs +++ b/tfhe/src/shortint/parameters/mod.rs @@ -30,6 +30,8 @@ pub mod classic; pub mod compact_public_key_only; #[cfg(tarpaulin)] pub mod coverage_parameters; +#[cfg(feature = "hpu")] +pub mod hpu; pub mod key_switching; pub mod ks32; pub mod list_compression; diff --git a/tfhe/src/shortint/parameters/v1_2/hpu.rs b/tfhe/src/shortint/parameters/v1_2/hpu.rs new file mode 100644 index 000000000..005a3a6bd --- /dev/null +++ b/tfhe/src/shortint/parameters/v1_2/hpu.rs @@ -0,0 +1,52 @@ +use crate::core_crypto::prelude::DynamicDistribution; +use crate::shortint::parameters::{CiphertextModulus32, KeySwitch32PBSParameters, StandardDev}; +use crate::shortint::prelude::{ + DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweDimension, PolynomialSize, +}; +use crate::shortint::{CarryModulus, CiphertextModulus, MaxNoiseLevel, MessageModulus}; + +// Gaussian parameters set +pub const V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_GAUSSIAN_2M64: KeySwitch32PBSParameters = + KeySwitch32PBSParameters { + lwe_dimension: LweDimension(804), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 5.963599673924788e-6, + )), + glwe_noise_distribution: DynamicDistribution::new_gaussian_from_std_dev(StandardDev( + 2.8452674713391114e-15, + )), + pbs_base_log: DecompositionBaseLog(23), + pbs_level: DecompositionLevelCount(1), + ks_base_log: 
DecompositionBaseLog(2), + ks_level: DecompositionLevelCount(8), + message_modulus: MessageModulus(4), + carry_modulus: CarryModulus(4), + max_noise_level: MaxNoiseLevel::new(5), + log2_p_fail: -64.0, + post_keyswitch_ciphertext_modulus: CiphertextModulus32::new(1 << 21), + ciphertext_modulus: CiphertextModulus::new_native(), + modulus_switch_noise_reduction_params: None, + }; + +// TUniform parameters set +pub const V1_2_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M64: KeySwitch32PBSParameters = + KeySwitch32PBSParameters { + lwe_dimension: LweDimension(839), + glwe_dimension: GlweDimension(1), + polynomial_size: PolynomialSize(2048), + lwe_noise_distribution: DynamicDistribution::new_t_uniform(4), + glwe_noise_distribution: DynamicDistribution::new_t_uniform(17), + pbs_base_log: DecompositionBaseLog(23), + pbs_level: DecompositionLevelCount(1), + ks_base_log: DecompositionBaseLog(2), + ks_level: DecompositionLevelCount(7), + message_modulus: MessageModulus(4), + carry_modulus: CarryModulus(4), + max_noise_level: MaxNoiseLevel::new(5), + log2_p_fail: -64.0, + post_keyswitch_ciphertext_modulus: CiphertextModulus32::new(1 << 21), + ciphertext_modulus: CiphertextModulus::new_native(), + modulus_switch_noise_reduction_params: None, + }; diff --git a/tfhe/src/shortint/parameters/v1_2/mod.rs b/tfhe/src/shortint/parameters/v1_2/mod.rs index f3c93ac1b..1f460dc4e 100644 --- a/tfhe/src/shortint/parameters/v1_2/mod.rs +++ b/tfhe/src/shortint/parameters/v1_2/mod.rs @@ -40,6 +40,9 @@ pub use multi_bit::tuniform::p_fail_2_minus_64::ks_pbs::*; pub use multi_bit::tuniform::p_fail_2_minus_64::ks_pbs_gpu::*; pub use noise_squashing::p_fail_2_minus_128::*; +#[cfg(feature = "hpu")] +pub use hpu::*; + use crate::shortint::parameters::{ ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters, MultiBitPBSParameters, NoiseSquashingParameters, ShortintKeySwitchingParameters, @@ -1689,3 +1692,6 @@ pub const VEC_ALL_NOISE_SQUASHING_PARAMETERS: 
[(&NoiseSquashingParameters, &str) &V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128, "V1_2_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128", )]; + +#[cfg(feature = "hpu")] +pub mod hpu; diff --git a/tfhe/src/test_user_docs.rs b/tfhe/src/test_user_docs.rs index dc7495172..d53444f32 100644 --- a/tfhe/src/test_user_docs.rs +++ b/tfhe/src/test_user_docs.rs @@ -240,3 +240,17 @@ mod test_gpu_doc { configuration_gpu_acceleration_multi_gpu_device_selection ); } + +#[cfg(feature = "hpu")] +mod test_hpu_doc { + use doc_comment::doctest; + + doctest!( + "../docs/configuration/hpu_acceleration/run_on_hpu.md", + configuration_hpu_acceleration_run_on_hpu + ); + doctest!( + "../docs/configuration/hpu_acceleration/benchmark.md", + configuration_hpu_acceleration_benchmark + ); +} diff --git a/tfhe/tests/hpu.rs b/tfhe/tests/hpu.rs new file mode 100644 index 000000000..dac1f0854 --- /dev/null +++ b/tfhe/tests/hpu.rs @@ -0,0 +1,632 @@ +//! Define a test-harness that handle setup and configuration of Hpu Backend +//! The test harness take a list of testcase and run them +//! A testcase simply bind a IOp to a closure describing it's behavior +//! 
WARN: Only one Hpu could be use at a time, thus all test must be run sequentially + +#[cfg(feature = "hpu")] +mod hpu_test { + use std::str::FromStr; + + use rand::rngs::StdRng; + use rand::{Rng, RngCore, SeedableRng}; + use tfhe::core_crypto::commons::generators::DeterministicSeeder; + use tfhe::core_crypto::prelude::DefaultRandomGenerator; + + use tfhe::Seed; + pub use tfhe_hpu_backend::prelude::*; + + /// Variable to store initialized HpuDevice and associated client key for fast iteration + static HPU_DEVICE_RNG_CKS: std::sync::OnceLock<( + std::sync::Mutex, + tfhe::integer::ClientKey, + u128, + )> = std::sync::OnceLock::new(); + + // // Instantiate a shared rng for cleartext input generation + // let rng: StdRng = SeedableRng::seed_from_u64((seed & u64::MAX as u128) as u64); + + /// Simple function used to retrieved or generate a seed from environment + fn get_or_init_seed(name: &str) -> u128 { + match std::env::var(name) { + Ok(var) => if let Some(hex) = var.strip_prefix("0x").or_else(|| var.strip_prefix("0X")) + { + u128::from_str_radix(hex, 16) + } else if let Some(bin) = var.strip_prefix("0b").or_else(|| var.strip_prefix("0B")) { + u128::from_str_radix(bin, 2) + } else if let Some(oct) = var.strip_prefix("0o").or_else(|| var.strip_prefix("0O")) { + u128::from_str_radix(oct, 8) + } else { + var.parse::() // default: base 10 + } + .unwrap_or_else(|_| panic!("{name} env variable {var} couldn't be casted in u128")), + _ => { + // Use tread_rng to generate the seed + let lsb = rand::thread_rng().next_u64() as u128; + let msb = rand::thread_rng().next_u64() as u128; + (msb << u64::BITS) | lsb + } + } + } + + fn init_hpu_and_associated_material( + ) -> (std::sync::Mutex, tfhe::integer::ClientKey, u128) { + // Hpu io dump for debug ------------------------------------------------- + #[cfg(feature = "hpu-debug")] + if let Some(dump_path) = std::env::var("HPU_IO_DUMP").ok() { + set_hpu_io_dump(&dump_path); + } + + // Instantiate HpuDevice 
-------------------------------------------------- + let hpu_device = { + let config_file = ShellString::new( + "${HPU_BACKEND_DIR}/config_store/${HPU_CONFIG}/hpu_config.toml".to_string(), + ); + HpuDevice::from_config(&config_file.expand()) + }; + + // Check if user force a seed for the key generation + let key_seed = get_or_init_seed("HPU_KEY_SEED"); + + // Force key seeder for easily reproduce failure + let mut key_seeder = DeterministicSeeder::::new(Seed(key_seed)); + let shortint_engine = + tfhe::shortint::engine::ShortintEngine::new_from_seeder(&mut key_seeder); + tfhe::shortint::engine::ShortintEngine::with_thread_local_mut(|engine| { + std::mem::replace(engine, shortint_engine) + }); + + // Extract pbs_configuration from Hpu and create Client/Server Key + let cks = tfhe::integer::ClientKey::new( + tfhe::shortint::parameters::KeySwitch32PBSParameters::from(hpu_device.params()), + ); + let sks_compressed = + tfhe::integer::CompressedServerKey::new_radix_compressed_server_key(&cks); + + // Init Hpu device with server key and firmware + tfhe::integer::hpu::init_device(&hpu_device, sks_compressed).expect("Invalid key"); + (std::sync::Mutex::new(hpu_device), cks, key_seed) + } + + // NB: Currently u55c didn't check for workq overflow. + // -> Use default value < queue depth to circumvent this limitation + // NB': This is only for u55c, on V80 user could set HPU_TEST_ITER to whatever value he want + const DEFAULT_TEST_ITER: usize = 32; + + macro_rules! hpu_testbundle { + ($base_name: literal::$integer_width:tt => [$($testcase: literal),+]) => { + ::paste::paste! 
{ + #[test] + pub fn []() { + // Register tracing subscriber that use env-filter + // Discard error ( mainly due to already registered subscriber) + let _ = tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .compact() + .with_file(false) + .with_line_number(false) + .without_time() + .try_init(); + // Retrieved test iteration from environment ---------------------------- + let hpu_test_iter = match(std::env::var("HPU_TEST_ITER")){ + Ok(var) => usize::from_str(&var).unwrap_or_else(|_| { + panic!("HPU_TEST_ITER env variable {var} couldn't be casted in usize") + }), + _ => DEFAULT_TEST_ITER + }; + + // Retrieved HpuDevice or init --------------------------------------------- + let (hpu_mutex, cks, key_seed)= HPU_DEVICE_RNG_CKS.get_or_init(init_hpu_and_associated_material); + let mut hpu_device = hpu_mutex.lock().expect("Error with HpuDevice Mutex"); + assert!(hpu_device.config().firmware.integer_w.contains(&($integer_width as usize)), "Current Hpu configuration doesn't support {}b integer [has {:?}]", $integer_width, hpu_device.config().firmware.integer_w); + + // Instantiate a Rng for cleartest input generation + // Create a fresh one for each testbundle to be reproducible even if execution order + // of testbundle are not stable + let test_seed = get_or_init_seed("HPU_TEST_SEED"); + // Display used seed value in a reusable manner (i.e. valid bash syntax) + println!("HPU_KEY_SEED={key_seed} #[i.e. 0x{key_seed:x}]"); + println!("HPU_TEST_SEED={test_seed} #[i.e. 0x{test_seed:x}]"); + + let mut rng: StdRng = SeedableRng::seed_from_u64((test_seed & u64::MAX as u128) as u64); + + // Reseed shortint engine for reproducible noise generation. 
+ let mut noise_seeder = DeterministicSeeder::::new(Seed(test_seed)); + let shortint_engine = + tfhe::shortint::engine::ShortintEngine::new_from_seeder(&mut noise_seeder); + tfhe::shortint::engine::ShortintEngine::with_thread_local_mut(|engine| { + std::mem::replace(engine, shortint_engine) + }); + + // Run test-case --------------------------------------------------------- + let mut acc_status = true; + $( + { + let status = [](hpu_test_iter, &mut hpu_device, &mut rng, &cks); + if !status { + println!("Error: in testcase {}", stringify!([])); + } + acc_status &= status + } + )* + + drop(hpu_device); + assert!(acc_status, "At least one testcase failed in the testbundle"); + } + } + }; +} + + macro_rules! hpu_testcase { + ($iop: literal => [$($user_type: ty),+] |$ct:ident, $imm: ident| $behav: expr) => { + ::paste::paste! { + $( + #[cfg(feature = "hpu")] + #[allow(unused)] + pub fn [](iter: usize, device: &mut HpuDevice, rng: &mut StdRng, cks: &tfhe::integer::ClientKey) -> bool { + use tfhe::integer::hpu::ciphertext::HpuRadixCiphertext; + + let iop = hpu_asm::AsmIOpcode::from_str($iop).expect("Invalid AsmIOpcode "); + let proto = if let Some(format) = iop.format() { + format.proto.clone() + } else { + eprintln!("Hpu testcase only work on specified operations. 
Check test definition"); + return false; + }; + + let width = $user_type::BITS as usize; + let num_block = width / device.params().pbs_params.message_width; + (0..iter).map(|_| { + // Generate inputs ciphertext + let (srcs_clear, srcs_enc): (Vec<_>, Vec<_>) = proto + .src + .iter() + .enumerate() + .map(|(pos, mode)| { + let (bw, block) = match mode { + hpu_asm::iop::VarMode::Native => (width, num_block), + hpu_asm::iop::VarMode::Half => (width / 2, num_block / 2), + hpu_asm::iop::VarMode::Bool => (1, 1), + }; + + let clear = rng.gen_range(0..=$user_type::MAX >> ($user_type::BITS - (bw as u32))); + let fhe = cks.encrypt_radix(clear, block); + let hpu_fhe = HpuRadixCiphertext::from_radix_ciphertext(&fhe, device); + (clear, hpu_fhe) + }) + .unzip(); + + let imms = (0..proto.imm) + .map(|pos| rng.gen_range(0..$user_type::MAX) as u128) + .collect::>(); + + // execute on Hpu + let res_hpu = HpuRadixCiphertext::exec(&proto, iop.opcode(), &srcs_enc, &imms); + let res_fhe = res_hpu + .iter() + .map(|x| x.to_radix_ciphertext()).collect::>(); + let res = res_fhe + .iter() + .map(|x| cks.decrypt_radix(x)) + .collect::>(); + + let exp_res = { + let $ct = &srcs_clear; + let $imm = imms.iter().map(|x| *x as $user_type).collect::>(); + ($behav.iter().map(|x| *x as $user_type).collect::>()) + }; + println!("{:>8} <{:>8x?}> <{:>8x?}> => {:<8x?} [exp {:<8x?}] {{Delta: {:x?} }}", iop, srcs_clear, imms, res, exp_res, std::iter::zip(res.iter(), exp_res.iter()).map(|(x,y)| x ^y).collect::>()); + std::iter::zip(res.iter(), exp_res.iter()).map(|(x,y)| x== y).fold(true, |acc, val| acc & val) + }).fold(true, |acc, val| acc & val) + } + )* + } + }; +} + + // Define testcase implementation for all supported IOp + // Alu IOp with Ct x Imm + hpu_testcase!("ADDS" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_add(imm[0])]); + hpu_testcase!("SUBS" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_sub(imm[0])]); + hpu_testcase!("SSUB" => [u8, u16, u32, u64, u128] + |ct, imm| 
[imm[0].wrapping_sub(ct[0])]); + hpu_testcase!("MULS" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_mul(imm[0])]); + + // Alu IOp with Ct x Ct + hpu_testcase!("ADD" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_add(ct[1])]); + hpu_testcase!("SUB" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_sub(ct[1])]); + hpu_testcase!("MUL" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0].wrapping_mul(ct[1])]); + + // Bitwise IOp + hpu_testcase!("BW_AND" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] & ct[1]]); + hpu_testcase!("BW_OR" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] | ct[1]]); + hpu_testcase!("BW_XOR" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] ^ ct[1]]); + + // Comparison IOp + hpu_testcase!("CMP_GT" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] > ct[1]]); + hpu_testcase!("CMP_GTE" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] >= ct[1]]); + hpu_testcase!("CMP_LT" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] < ct[1]]); + hpu_testcase!("CMP_LTE" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] <= ct[1]]); + hpu_testcase!("CMP_EQ" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] == ct[1]]); + hpu_testcase!("CMP_NEQ" => [u8, u16, u32, u64, u128] + |ct, imm| [ct[0] != ct[1]]); + + // Ternary IOp + hpu_testcase!("IF_THEN_ZERO" => [u8, u16, u32, u64, u128] + |ct, imm| [if ct[1] != 0 {ct[0]} else { 0}]); + hpu_testcase!("IF_THEN_ELSE" => [u8, u16, u32, u64, u128] + |ct, imm| [if ct[2] != 0 {ct[0]} else { ct[1]}]); + + // ERC 20 found xfer + hpu_testcase!("ERC_20" => [u8, u16, u32, u64, u128] + |ct, imm| { + let from = ct[0]; + let to = ct[1]; + let amount = ct[2]; + // TODO enhance this to prevent overflow + if from >= amount { + vec![from - amount, to.wrapping_add(amount)] + } else { + vec![from, to] + } + }); + + // Define a set of test bundle for various size + // 8bit ciphertext ----------------------------------------- + #[cfg(feature = "hpu")] + hpu_testbundle!("alus"::8 => [ + "adds", + "subs", + "ssub", + "muls" + ]); + 
+ #[cfg(feature = "hpu")] + hpu_testbundle!("alu"::8 => [ + "add", + "sub", + "mul" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("bitwise"::8 => [ + "bw_and", + "bw_or", + "bw_xor" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("cmp"::8 => [ + "cmp_gt", + "cmp_gte", + "cmp_lt", + "cmp_lte", + "cmp_eq", + "cmp_neq" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("ternary"::8 => [ + "if_then_zero", + "if_then_else" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("algo"::8 => [ + "erc_20" + ]); + + // 16bit ciphertext ----------------------------------------- + #[cfg(feature = "hpu")] + hpu_testbundle!("alus"::16 => [ + "adds", + "subs", + "ssub", + "muls" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("alu"::16 => [ + "add", + "sub", + "mul" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("bitwise"::16 => [ + "bw_and", + "bw_or", + "bw_xor" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("cmp"::16 => [ + "cmp_gt", + "cmp_gte", + "cmp_lt", + "cmp_lte", + "cmp_eq", + "cmp_neq" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("ternary"::16 => [ + "if_then_zero", + "if_then_else" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("algo"::16 => [ + "erc_20" + ]); + + // 32bit ciphertext ----------------------------------------- + #[cfg(feature = "hpu")] + hpu_testbundle!("alus"::32 => [ + "adds", + "subs", + "ssub", + "muls" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("alu"::32 => [ + "add", + "sub", + "mul" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("bitwise"::32 => [ + "bw_and", + "bw_or", + "bw_xor" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("cmp"::32 => [ + "cmp_gt", + "cmp_gte", + "cmp_lt", + "cmp_lte", + "cmp_eq", + "cmp_neq" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("ternary"::32 => [ + "if_then_zero", + "if_then_else" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("algo"::32 => [ + "erc_20" + ]); + + // 64bit ciphertext ----------------------------------------- + #[cfg(feature = "hpu")] 
+ hpu_testbundle!("alus"::64 => [ + "adds", + "subs", + "ssub", + "muls" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("alu"::64 => [ + "add", + "sub", + "mul" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("bitwise"::64 => [ + "bw_and", + "bw_or", + "bw_xor" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("cmp"::64 => [ + "cmp_gt", + "cmp_gte", + "cmp_lt", + "cmp_lte", + "cmp_eq", + "cmp_neq" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("ternary"::64 => [ + "if_then_zero", + "if_then_else" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("algo"::64 => [ + "erc_20" + ]); + + // 128bit ciphertext ----------------------------------------- + #[cfg(feature = "hpu")] + hpu_testbundle!("alus"::128 => [ + "adds", + "subs", + "ssub", + "muls" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("alu"::128 => [ + "add", + "sub", + "mul" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("bitwise"::128 => [ + "bw_and", + "bw_or", + "bw_xor" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("cmp"::128 => [ + "cmp_gt", + "cmp_gte", + "cmp_lt", + "cmp_lte", + "cmp_eq", + "cmp_neq" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("ternary"::128 => [ + "if_then_zero", + "if_then_else" + ]); + + #[cfg(feature = "hpu")] + hpu_testbundle!("algo"::128 => [ + "erc_20" + ]); + + /// Simple test dedicated to check entities conversion from/to Cpu + #[cfg(feature = "hpu")] + #[test] + fn hpu_key_loopback() { + use tfhe::core_crypto::prelude::*; + use tfhe::*; + use tfhe_hpu_backend::prelude::*; + + // Retrieved HpuDevice or init --------------------------------------------- + // Used hpu_device backed in static variable to automatically serialize tests + let (hpu_params, cks, key_seed) = { + let (hpu_mutex, cks, key_seed) = + HPU_DEVICE_RNG_CKS.get_or_init(init_hpu_and_associated_material); + let hpu_device = hpu_mutex.lock().expect("Error with HpuDevice Mutex"); + (hpu_device.params().clone(), cks, key_seed) + }; + println!("HPU_KEY_SEED={key_seed} #[i.e. 
0x{key_seed:x}]"); + + // Generate Keys --------------------------------------------------------- + let sks_compressed = + tfhe::integer::CompressedServerKey::new_radix_compressed_server_key(cks) + .into_raw_parts(); + + // Unwrap compressed key --------------------------------------------------- + let ap_key = match sks_compressed.compressed_ap_server_key { + tfhe::shortint::atomic_pattern::compressed::CompressedAtomicPatternServerKey::Standard(_) => { + panic!("Hpu not support Standard keys. Required a KeySwitch32 keys") + } + tfhe::shortint::atomic_pattern::compressed::CompressedAtomicPatternServerKey::KeySwitch32(keys) => keys, + }; + + // KSK Loopback conversion and check ------------------------------------- + // Extract and convert ksk + let cpu_ksk_orig = ap_key + .key_switching_key() + .clone() + .decompress_into_lwe_keyswitch_key(); + let hpu_ksk = + HpuLweKeyswitchKeyOwned::create_from(cpu_ksk_orig.as_view(), hpu_params.clone()); + let cpu_ksk_lb = LweKeyswitchKeyOwned::::from(hpu_ksk.as_view()); + + // NB: Some hw modifications such as bit shrinki couldn't be reversed + // cpu_ksk_orig.as_mut().iter_mut().for_each(|coef| { + // let ks_p = hpu_params.ks_params; + // // Apply Hw rounding + // // Extract info bits and rounding if required + // let coef_info = *coef >> (u32::BITS - ks_p.width as u32); + // let coef_rounding = if (ks_p.width as u32) < u32::BITS { + // (*coef >> (u32::BITS - (ks_p.width + 1) as u32)) & 0x1 + // } else { + // 0 + // }; + // *coef = (coef_info + coef_rounding) << (u32::BITS - ks_p.width as u32); + // }); + + let ksk_mismatch: usize = + std::iter::zip(cpu_ksk_orig.as_ref().iter(), cpu_ksk_lb.as_ref().iter()) + .enumerate() + .map(|(i, (x, y))| { + if x != y { + println!("Ksk mismatch @{i}:: {x:x} != {y:x}"); + 1 + } else { + 0 + } + }) + .sum(); + + // BSK Loopback conversion and check ------------------------------------- + // Extract and convert ksk + let cpu_bsk_orig = match ap_key.bootstrapping_key() { + 
tfhe::shortint::server_key::ShortintCompressedBootstrappingKey::Classic { + bsk: seeded_bsk, + .. + } => seeded_bsk.clone().decompress_into_lwe_bootstrap_key(), + tfhe::shortint::server_key::ShortintCompressedBootstrappingKey::MultiBit { .. } => { + panic!("Hpu currently not support multibit. Required a Classic BSK") + } + }; + let cpu_bsk_ntt = { + // Convert the LweBootstrapKey in Ntt domain + let mut ntt_bsk = NttLweBootstrapKeyOwned::::new( + 0_u64, + cpu_bsk_orig.input_lwe_dimension(), + cpu_bsk_orig.glwe_size(), + cpu_bsk_orig.polynomial_size(), + cpu_bsk_orig.decomposition_base_log(), + cpu_bsk_orig.decomposition_level_count(), + CiphertextModulus::new(u64::from(&hpu_params.ntt_params.prime_modulus) as u128), + ); + + // Conversion to ntt domain + par_convert_standard_lwe_bootstrap_key_to_ntt64( + &cpu_bsk_orig, + &mut ntt_bsk, + NttLweBootstrapKeyOption::Raw, + ); + ntt_bsk + }; + let hpu_bsk = + HpuLweBootstrapKeyOwned::create_from(cpu_bsk_orig.as_view(), hpu_params.clone()); + + let cpu_bsk_lb = NttLweBootstrapKeyOwned::from(hpu_bsk.as_view()); + + let bsk_mismatch: usize = std::iter::zip( + cpu_bsk_ntt.as_view().into_container().iter(), + cpu_bsk_lb.as_view().into_container().iter(), + ) + .enumerate() + .map(|(i, (x, y))| { + if x != y { + println!("@{i}:: {x:x} != {y:x}"); + 1 + } else { + 0 + } + }) + .sum(); + + println!("Ksk loopback with {ksk_mismatch} errors"); + println!("Bsk loopback with {bsk_mismatch} errors"); + + assert_eq!(ksk_mismatch + bsk_mismatch, 0); + } +}