chore(gpu): write swap bench

Author: Agnes Leroy
Date: 2025-04-28 16:00:41 +02:00
Committed by: Agnès Leroy
Parent: 7e3a5fd55b
Commit: 97690ab3bd

9 changed files with 1015 additions and 30 deletions

.github/workflows/benchmark_dex.yml (new file)

@@ -0,0 +1,143 @@
# Run all DEX benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: DEX benchmarks
on:
workflow_dispatch:
schedule:
# Weekly benchmarks will be triggered each Saturday at 5 a.m.
- cron: '0 5 * * 6'
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (dex-benchmarks)
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: bench
dex-benchmarks:
name: Execute DEX benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
concurrency:
group: ${{ github.workflow_ref }}
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
continue-on-error: true
timeout-minutes: 720 # 12 hours
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
with:
toolchain: nightly
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Run benchmarks
run: |
make bench_hlapi_dex
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "hpc7a.96xlarge" \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512
- name: Parse PBS counts
run: |
python3 ./ci/benchmark_parser.py tfhe/dex_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
--object-sizes \
--append-results
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_dex
path: ${{ env.RESULTS_FILENAME }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
- name: Slack Notification
if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "DEX benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (dex-benchmarks)
if: ${{ always() && needs.setup-instance.result == 'success' }}
needs: [ setup-instance, dex-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (dex-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

.github/workflows/benchmark_gpu_dex.yml (new file)

@@ -0,0 +1,44 @@
# Run CUDA DEX benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks
on:
workflow_dispatch:
inputs:
profile:
description: "Instance type"
required: true
type: choice
options:
- "l40 (n3-L40x1)"
- "single-h100 (n3-H100x1)"
- "2-h100 (n3-H100x2)"
- "4-h100 (n3-H100x4)"
- "multi-h100 (n3-H100x8)"
- "multi-h100-nvlink (n3-H100x8-NVLink)"
- "multi-h100-sxm5 (n3-H100x8-SXM5)"
jobs:
parse-inputs:
runs-on: ubuntu-latest
outputs:
profile: ${{ steps.parse_profile.outputs.profile }}
hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
steps:
- name: Parse profile
id: parse_profile
run: |
echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"
- name: Parse hardware name
id: parse_hardware_name
run: |
echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"
run-benchmarks:
name: Run benchmarks
needs: parse-inputs
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
profile: ${{ needs.parse-inputs.outputs.profile }}
hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
secrets: inherit

.github/workflows/benchmark_gpu_dex_common.yml (new file)

@@ -0,0 +1,201 @@
# Run DEX benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks - common
on:
workflow_call:
inputs:
backend:
type: string
default: hyperstack
profile:
type: string
required: true
hardware_name:
type: string
required: true
secrets:
REPO_CHECKOUT_TOKEN:
required: true
SLAB_ACTION_TOKEN:
required: true
SLAB_BASE_URL:
required: true
SLAB_URL:
required: true
JOB_SECRET:
required: true
SLACK_CHANNEL:
required: true
BOT_USERNAME:
required: true
SLACK_WEBHOOK:
required: true
env:
CARGO_TERM_COLOR: always
RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
RUST_BACKTRACE: "full"
RUST_MIN_STACK: "8388608"
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
jobs:
setup-instance:
name: Setup instance (cuda-dex-benchmarks)
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' ||
(github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
outputs:
# Use the permanent remote instance label first, because the on-demand remote instance label output
# is set before the end of the start-remote-instance step. If that step fails due to a failed GitHub
# Actions runner setup, we have to fall back to the permanent instance. Since the on-demand label is
# already set at that point, the logical OR must be evaluated in this order; otherwise we would try
# to run the next job on a non-existent on-demand instance.
runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
steps:
- name: Start remote instance
id: start-remote-instance
continue-on-error: true
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: ${{ inputs.backend }}
profile: ${{ inputs.profile }}
- name: Acknowledge remote instance failure
if: steps.start-remote-instance.outcome == 'failure' &&
inputs.profile != 'single-h100'
run: |
echo "Remote instance failed to start (profile provided: '${{ inputs.profile }}')"
echo "The permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
exit 1
# This allows falling back to permanent instances running on Hyperstack.
- name: Use permanent remote instance
id: use-permanent-instance
if: env.SECRETS_AVAILABLE == 'true' &&
steps.start-remote-instance.outcome == 'failure' &&
inputs.profile == 'single-h100'
run: |
echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"
cuda-dex-benchmarks:
name: Cuda DEX benchmarks (${{ inputs.profile }})
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
strategy:
fail-fast: false
# Explicit include-based build matrix of known valid options
matrix:
include:
- os: ubuntu-22.04
cuda: "12.2"
gcc: 11
steps:
- name: Checkout tfhe-rs repo with tags
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
fetch-depth: 0
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Setup Hyperstack dependencies
if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
uses: ./.github/actions/gpu_setup
with:
cuda-version: ${{ matrix.cuda }}
gcc-version: ${{ matrix.gcc }}
- name: Get benchmark details
run: |
{
echo "BENCH_DATE=$(date --iso-8601=seconds)";
echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
echo "COMMIT_HASH=$(git describe --tags --dirty)";
} >> "${GITHUB_ENV}"
- name: Install rust
uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
with:
toolchain: nightly
- name: Run benchmarks
run: |
make bench_hlapi_dex_gpu
- name: Parse results
run: |
python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
--database tfhe_rs \
--hardware "${{ inputs.hardware_name }}" \
--backend gpu \
--project-version "${{ env.COMMIT_HASH }}" \
--branch ${{ github.ref_name }} \
--commit-date "${{ env.COMMIT_DATE }}" \
--bench-date "${{ env.BENCH_DATE }}" \
--walk-subdirs \
--name-suffix avx512
- name: Upload parsed results artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
with:
name: ${{ github.sha }}_dex_${{ inputs.profile }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
with:
repository: zama-ai/slab
path: slab
persist-credentials: 'false'
token: ${{ secrets.REPO_CHECKOUT_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
--slab-url "${{ secrets.SLAB_URL }}"
slack-notify:
name: Slack Notification
needs: [ setup-instance, cuda-dex-benchmarks ]
runs-on: ubuntu-latest
if: ${{ always() && needs.cuda-dex-benchmarks.result != 'skipped' && failure() }}
continue-on-error: true
steps:
- name: Send message
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ needs.cuda-dex-benchmarks.result }}
SLACK_MESSAGE: "Cuda DEX benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-dex-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"
teardown-instance:
name: Teardown instance (cuda-dex-${{ inputs.profile }}-benchmarks)
if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
needs: [ setup-instance, cuda-dex-benchmarks, slack-notify ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
- name: Slack Notification
if: ${{ failure() }}
continue-on-error: true
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
env:
SLACK_COLOR: ${{ job.status }}
SLACK_MESSAGE: "Instance teardown (cuda-dex-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"


@@ -0,0 +1,35 @@
# Run CUDA DEX benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
name: Cuda DEX weekly benchmarks
on:
schedule:
# Weekly benchmarks will be triggered each Saturday at 9 a.m.
- cron: '0 9 * * 6'
jobs:
run-benchmarks-1-h100:
name: Run benchmarks (1xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
profile: single-h100
hardware_name: n3-H100x1
secrets: inherit
run-benchmarks-2-h100:
name: Run benchmarks (2xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
profile: 2-h100
hardware_name: n3-H100x2
secrets: inherit
run-benchmarks-8-h100:
name: Run benchmarks (8xH100)
if: github.repository == 'zama-ai/tfhe-rs'
uses: ./.github/workflows/benchmark_gpu_dex_common.yml
with:
profile: multi-h100
hardware_name: n3-H100x8
secrets: inherit

Makefile

@@ -1293,6 +1293,18 @@ bench_hlapi_erc20_gpu: install_rs_check_toolchain
 		--bench hlapi-erc20 \
 		--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
 
+.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
+bench_hlapi_dex: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+		--bench hlapi-dex \
+		--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+
+.PHONY: bench_hlapi_dex_gpu # Run benchmarks for DEX operations on GPU
+bench_hlapi_dex_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+		--bench hlapi-dex \
+		--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+
 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \

tfhe/Cargo.toml

@@ -254,6 +254,12 @@ path = "benches/high_level_api/erc20.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-dex"
+path = "benches/high_level_api/dex.rs"
+harness = false
+required-features = ["integer", "internal-keycache"]
+
 [[bench]]
 name = "keygen"
 path = "benches/keygen/bench.rs"

tfhe/benches/high_level_api/dex.rs (new file)

@@ -0,0 +1,540 @@
#[path = "../utilities.rs"]
mod utilities;
#[cfg(feature = "gpu")]
use crate::utilities::configure_gpu;
use crate::utilities::{write_to_json, OperatorType};
use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion};
use rand::prelude::*;
use rand::thread_rng;
use std::ops::{Add, Div, Mul, Sub};
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
use tfhe::shortint::parameters::*;
#[cfg(not(feature = "gpu"))]
use tfhe::{set_server_key, CompressedServerKey};
use tfhe::{ClientKey, ConfigBuilder, FheBool, FheUint128, FheUint64};
pub(crate) fn transfer_whitepaper<FheType>(
from_amount: &FheType,
to_amount: &FheType,
amount: &FheType,
) -> (FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let has_enough_funds = (from_amount).ge(amount);
let mut new_to_amount = to_amount + amount;
new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);
let mut new_from_amount = from_amount - amount;
new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);
(new_from_amount, new_to_amount)
}
#[allow(clippy::too_many_arguments)]
fn swap_request<FheType>(
from_balance_0: &FheType,
from_balance_1: &FheType,
current_dex_balance_0: &FheType,
current_dex_balance_1: &FheType,
to_balance_0: &FheType,
to_balance_1: &FheType,
total_dex_token_0_in: &FheType,
total_dex_token_1_in: &FheType,
amount0: &FheType,
amount1: &FheType,
) -> (FheType, FheType, FheType, FheType)
where
FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Clone,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let (_, new_current_balance_0) =
transfer_whitepaper(from_balance_0, current_dex_balance_0, amount0);
let (_, new_current_balance_1) =
transfer_whitepaper(from_balance_1, current_dex_balance_1, amount1);
let sent0 = &new_current_balance_0 - current_dex_balance_0;
let sent1 = &new_current_balance_1 - current_dex_balance_1;
let pending_0_in = to_balance_0 + &sent0;
let pending_total_token_0_in = total_dex_token_0_in + &sent0;
let pending_1_in = to_balance_1 + &sent1;
let pending_total_token_1_in = total_dex_token_1_in + &sent1;
(
pending_0_in,
pending_total_token_0_in,
pending_1_in,
pending_total_token_1_in,
)
}
#[allow(clippy::too_many_arguments)]
fn swap_claim<FheType, BigFheType>(
pending_0_in: &FheType,
pending_1_in: &FheType,
total_dex_token_0_in: u64,
total_dex_token_1_in: u64,
total_dex_token_0_out: u64,
total_dex_token_1_out: u64,
old_balance_0: &FheType,
old_balance_1: &FheType,
current_dex_balance_0: &FheType,
current_dex_balance_1: &FheType,
) -> (FheType, FheType)
where
FheType: CastFrom<FheBool>
+ for<'a> FheOrd<&'a FheType>
+ CastFrom<BigFheType>
+ Clone
+ Add<Output = FheType>,
BigFheType: CastFrom<FheType> + Mul<u128, Output = BigFheType> + Div<u128, Output = BigFheType>,
FheBool: IfThenElse<FheType>,
for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
let mut new_balance_0 = old_balance_0.clone();
let mut new_balance_1 = old_balance_1.clone();
if total_dex_token_1_in != 0 {
let big_pending_1_in = BigFheType::cast_from(pending_1_in.clone());
let big_amount_0_out =
(big_pending_1_in * total_dex_token_0_out as u128) / total_dex_token_1_in as u128;
let amount_0_out = FheType::cast_from(big_amount_0_out);
let (_, new_balance_0_tmp) =
transfer_whitepaper(current_dex_balance_0, old_balance_0, &amount_0_out);
new_balance_0 = new_balance_0_tmp;
}
if total_dex_token_0_in != 0 {
let big_pending_0_in = BigFheType::cast_from(pending_0_in.clone());
let big_amount_1_out =
(big_pending_0_in * total_dex_token_1_out as u128) / total_dex_token_0_in as u128;
let amount_1_out = FheType::cast_from(big_amount_1_out);
let (_, new_balance_1_tmp) =
transfer_whitepaper(current_dex_balance_1, old_balance_1, &amount_1_out);
new_balance_1 = new_balance_1_tmp;
}
(new_balance_0, new_balance_1)
}
#[cfg(feature = "pbs-stats")]
mod pbs_stats {
use super::*;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;
fn write_result(file: &mut File, name: &str, value: usize) {
let line = format!("{name},{value}\n");
let error_message = format!("cannot write {name} result into file");
file.write_all(line.as_bytes()).expect(&error_message);
}
pub fn print_swap_request_pbs_counts<FheType, F>(
client_key: &ClientKey,
type_name: &str,
swap_request_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType, FheType, FheType),
{
let mut rng = thread_rng();
let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
#[cfg(feature = "gpu")]
configure_gpu(client_key);
tfhe::reset_pbs_count();
let (_, _, _, _) = swap_request_func(
&from_balance_0,
&from_balance_1,
&current_dex_balance_0,
&current_dex_balance_1,
&to_balance_0,
&to_balance_1,
&total_dex_token_0,
&total_dex_token_1,
&amount_0,
&amount_1,
);
let count = tfhe::get_pbs_count();
println!("DEX swap request::{type_name}: {count} PBS");
let params = client_key.computation_parameters();
let test_name = if cfg!(feature = "gpu") {
format!("hlapi::cuda::dex::swap_request::pbs_count::{type_name}")
} else {
format!("hlapi::dex::swap_request::pbs_count::{type_name}")
};
let results_file = Path::new("dex_swap_request_pbs_count.csv");
if !results_file.exists() {
File::create(results_file).expect("create results file failed");
}
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
write_result(&mut file, &test_name, count as usize);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"pbs-count",
&OperatorType::Atomic,
0,
vec![],
);
}
pub fn print_swap_claim_pbs_counts<FheType, F>(
client_key: &ClientKey,
type_name: &str,
swap_claim_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
u64,
u64,
u64,
u64,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType),
{
let mut rng = thread_rng();
let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_dex_token_0_in = rng.gen::<u64>();
let total_dex_token_1_in = rng.gen::<u64>();
let total_dex_token_0_out = rng.gen::<u64>();
let total_dex_token_1_out = rng.gen::<u64>();
let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
#[cfg(feature = "gpu")]
configure_gpu(client_key);
tfhe::reset_pbs_count();
let (_, _) = swap_claim_func(
&pending_0_in,
&pending_1_in,
total_dex_token_0_in,
total_dex_token_1_in,
total_dex_token_0_out,
total_dex_token_1_out,
&old_balance_0,
&old_balance_1,
&current_dex_balance_0,
&current_dex_balance_1,
);
let count = tfhe::get_pbs_count();
println!("DEX swap claim::{type_name}: {count} PBS");
let params = client_key.computation_parameters();
let test_name = if cfg!(feature = "gpu") {
format!("hlapi::cuda::dex::swap_claim::pbs_count::{type_name}")
} else {
format!("hlapi::dex::swap_claim::pbs_count::{type_name}")
};
let results_file = Path::new("dex_swap_claim_pbs_count.csv");
if !results_file.exists() {
File::create(results_file).expect("create results file failed");
}
let mut file = OpenOptions::new()
.append(true)
.open(results_file)
.expect("cannot open results file");
write_result(&mut file, &test_name, count as usize);
write_to_json::<u64, _>(
&test_name,
params,
params.name(),
"pbs-count",
&OperatorType::Atomic,
0,
vec![],
);
}
}
fn bench_swap_request_latency<FheType, F>(
c: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
swap_request_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType, FheType, FheType),
{
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
c.bench_function(&bench_id, |b| {
let mut rng = thread_rng();
let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
b.iter(|| {
let (_, _, _, _) = swap_request_func(
&from_balance_0,
&from_balance_1,
&current_balance_0,
&current_balance_1,
&to_balance_0,
&to_balance_1,
&total_token_0,
&total_token_1,
&amount_0,
&amount_1,
);
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"dex-swap-request",
&OperatorType::Atomic,
64,
vec![],
);
}
fn bench_swap_claim_latency<FheType, F>(
c: &mut BenchmarkGroup<'_, WallTime>,
client_key: &ClientKey,
bench_name: &str,
type_name: &str,
fn_name: &str,
swap_claim_func: F,
) where
FheType: FheEncrypt<u64, ClientKey>,
F: for<'a> Fn(
&'a FheType,
&'a FheType,
u64,
u64,
u64,
u64,
&'a FheType,
&'a FheType,
&'a FheType,
&'a FheType,
) -> (FheType, FheType),
{
#[cfg(feature = "gpu")]
configure_gpu(client_key);
let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
c.bench_function(&bench_id, |b| {
let mut rng = thread_rng();
let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
let total_token_0_in = rng.gen::<u64>();
let total_token_1_in = rng.gen::<u64>();
let total_token_0_out = rng.gen::<u64>();
let total_token_1_out = rng.gen::<u64>();
let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
b.iter(|| {
let (_, _) = swap_claim_func(
&pending_0_in,
&pending_1_in,
total_token_0_in,
total_token_1_in,
total_token_0_out,
total_token_1_out,
&old_balance_0,
&old_balance_1,
&current_balance_0,
&current_balance_1,
);
})
});
let params = client_key.computation_parameters();
write_to_json::<u64, _>(
&bench_id,
params,
params.name(),
"dex-swap-claim",
&OperatorType::Atomic,
64,
vec![],
);
}
#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_claim_pbs_counts;
#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_request_pbs_counts;
#[cfg(not(feature = "gpu"))]
fn main() {
let params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::dex";
// FheUint64 PBS counts
// We don't run multiple times since every input is encrypted
// PBS count is always the same
#[cfg(feature = "pbs-stats")]
{
print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_swap_request_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_request",
swap_request::<FheUint64>,
);
bench_swap_claim_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_claim",
swap_claim::<FheUint64, FheUint128>,
);
group.finish();
}
c.final_summary();
}
#[cfg(feature = "gpu")]
fn main() {
let params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS;
let config = ConfigBuilder::with_custom_parameters(params).build();
let cks = ClientKey::generate(config);
let mut c = Criterion::default().sample_size(10).configure_from_args();
let bench_name = "hlapi::cuda::dex";
// FheUint64 PBS counts
// We don't run multiple times since every input is encrypted
// PBS count is always the same
#[cfg(feature = "pbs-stats")]
{
print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
}
// FheUint64 latency
{
let mut group = c.benchmark_group(bench_name);
bench_swap_request_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_request",
swap_request::<FheUint64>,
);
bench_swap_claim_latency(
&mut group,
&cks,
bench_name,
"FheUint64",
"swap_claim",
swap_claim::<FheUint64, FheUint128>,
);
group.finish();
}
c.final_summary();
}
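
The arithmetic this bench drives homomorphically is easier to check against a plain-integer model. The sketch below is illustrative only and not part of the commit: names are shortened, balances are u64, and the claim payout widens to u128 before the multiply/divide, mirroring the FheUint64 -> FheUint128 cast in swap_claim above.

// Clear-value model of the swap logic benchmarked above (illustrative only).
fn transfer_whitepaper(from: u64, to: u64, amount: u64) -> (u64, u64) {
    // Move `amount` only if the sender has enough funds; the FHE version
    // computes both branches and selects the result with a cmux.
    if from >= amount {
        (from - amount, to + amount)
    } else {
        (from, to)
    }
}

#[allow(clippy::too_many_arguments)]
fn swap_request(
    from_0: u64, from_1: u64,
    dex_0: u64, dex_1: u64,
    to_0: u64, to_1: u64,
    total_0_in: u64, total_1_in: u64,
    amount_0: u64, amount_1: u64,
) -> (u64, u64, u64, u64) {
    // Deposit both tokens into the DEX, then record how much actually moved.
    let (_, new_dex_0) = transfer_whitepaper(from_0, dex_0, amount_0);
    let (_, new_dex_1) = transfer_whitepaper(from_1, dex_1, amount_1);
    let sent_0 = new_dex_0 - dex_0;
    let sent_1 = new_dex_1 - dex_1;
    (to_0 + sent_0, total_0_in + sent_0, to_1 + sent_1, total_1_in + sent_1)
}

#[allow(clippy::too_many_arguments)]
fn swap_claim(
    pending_0_in: u64, pending_1_in: u64,
    total_0_in: u64, total_1_in: u64,
    total_0_out: u64, total_1_out: u64,
    old_0: u64, old_1: u64,
    dex_0: u64, dex_1: u64,
) -> (u64, u64) {
    // Pro-rata payout: amount_0_out = pending_1_in * total_0_out / total_1_in,
    // computed in u128 to avoid overflow, like the FheUint128 path above.
    let mut new_0 = old_0;
    let mut new_1 = old_1;
    if total_1_in != 0 {
        let amount_0_out =
            (pending_1_in as u128 * total_0_out as u128 / total_1_in as u128) as u64;
        new_0 = transfer_whitepaper(dex_0, old_0, amount_0_out).1;
    }
    if total_0_in != 0 {
        let amount_1_out =
            (pending_0_in as u128 * total_1_out as u128 / total_0_in as u128) as u64;
        new_1 = transfer_whitepaper(dex_1, old_1, amount_1_out).1;
    }
    (new_0, new_1)
}

fn main() {
    // One request followed by one claim, on small clear values.
    let (to_0, tot_0, to_1, tot_1) = swap_request(100, 200, 1_000, 1_000, 0, 0, 0, 0, 40, 50);
    let (bal_0, bal_1) = swap_claim(to_0, to_1, tot_0, tot_1, 500, 500, 0, 0, 1_000, 1_000);
    println!("pending: ({to_0}, {tot_0}, {to_1}, {tot_1}), claimed: ({bal_0}, {bal_1})");
}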

tfhe/benches/high_level_api/erc20.rs

@@ -1,6 +1,8 @@
 #[path = "../utilities.rs"]
 mod utilities;
+#[cfg(feature = "gpu")]
+use crate::utilities::configure_gpu;
 use crate::utilities::{write_to_json, OperatorType};
 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, Throughput};
@@ -18,7 +20,7 @@ use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBoo
 /// Transfer as written in the original FHEvm white-paper,
 /// it uses a comparison to check if the sender has enough,
 /// and cmuxes based on the comparison result
-fn transfer_whitepaper<FheType>(
+pub fn transfer_whitepaper<FheType>(
     from_amount: &FheType,
     to_amount: &FheType,
     amount: &FheType,
@@ -177,13 +179,6 @@
     }
 }
 
-#[cfg(feature = "gpu")]
-fn configure_gpu(client_key: &ClientKey) {
-    let compressed_sks = CompressedServerKey::new(client_key);
-    let sks = compressed_sks.decompress_to_gpu();
-    rayon::broadcast(|_| set_server_key(sks.clone()));
-    set_server_key(sks);
-}
 fn bench_transfer_latency<FheType, F>(
     c: &mut BenchmarkGroup<'_, WallTime>,
     client_key: &ClientKey,
@@ -383,7 +378,7 @@ fn main() {
     let mut c = Criterion::default().sample_size(10).configure_from_args();
 
-    let bench_name = "hlapi::erc20::transfer";
+    let bench_name = "hlapi::erc20";
 
     // FheUint64 PBS counts
     // We don't run multiple times since every input is encrypted
@@ -393,14 +388,14 @@ fn main() {
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
@@ -414,7 +409,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_latency(
@@ -422,7 +417,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_latency(
@@ -430,7 +425,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_latency(
@@ -438,7 +433,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
@@ -453,7 +448,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -461,7 +456,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -469,7 +464,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -477,9 +472,10 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
         group.finish();
     }
@@ -495,7 +491,7 @@ fn main() {
     let mut c = Criterion::default().sample_size(10).configure_from_args();
 
-    let bench_name = "hlapi::cuda::erc20::transfer";
+    let bench_name = "hlapi::cuda::erc20";
 
     // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
@@ -505,14 +501,14 @@ fn main() {
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
@@ -526,7 +522,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_latency(
@@ -534,7 +530,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_latency(
@@ -542,7 +538,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_latency(
@@ -550,7 +546,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
@@ -565,7 +561,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -573,7 +569,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -581,7 +577,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -589,7 +585,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
         group.finish();
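
The renames above move the operation name out of bench_name and into fn_name. Assuming erc20.rs composes its Criterion IDs with the same {bench_name}::{fn_name}::{type_name} pattern used by dex.rs above, the full identifiers stay unchanged while the new DEX entries get their own hlapi::dex prefix. A small illustrative check (not part of the commit):

fn main() {
    // Values after this commit, mirroring the renamed erc20 identifiers above.
    let bench_name = "hlapi::erc20"; // was "hlapi::erc20::transfer"
    let fn_name = "transfer::whitepaper"; // was "whitepaper"
    let type_name = "FheUint64";
    let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
    // The composed ID is the same string the old naming produced.
    assert_eq!(bench_id, "hlapi::erc20::transfer::whitepaper::FheUint64");
    println!("{bench_id}");
}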

tfhe/benches/utilities.rs

@@ -655,6 +655,7 @@ mod cuda_utils {
     use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
     use tfhe::core_crypto::prelude::{Numeric, UnsignedInteger};
     use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
+    use tfhe::{set_server_key, ClientKey, CompressedServerKey};
 
     #[allow(dead_code)]
     pub const GPU_MAX_SUPPORTED_POLYNOMIAL_SIZE: usize = 16384;
@@ -879,6 +880,13 @@
         }
     }
 
+    #[allow(dead_code)]
+    pub fn configure_gpu(client_key: &ClientKey) {
+        let compressed_sks = CompressedServerKey::new(client_key);
+        let sks = compressed_sks.decompress_to_gpu();
+        rayon::broadcast(|_| set_server_key(sks.clone()));
+        set_server_key(sks);
+    }
     #[allow(unused_imports)]
     #[cfg(feature = "integer")]
     pub use cuda_integer_utils::*;
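
The configure_gpu helper moved here from erc20.rs is now shared by the HLAPI benches. A minimal, standalone sketch of the setup it performs and of a caller (illustrative only, not part of the commit; assumes the tfhe crate with the gpu feature, a rayon dependency, and that the default parameters are usable on the GPU backend):

// Illustrative sketch: install a GPU server key on every rayon worker, then
// run one HLAPI operation against it, as the benches in this commit do.
use tfhe::prelude::*;
use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheUint64};

fn configure_gpu(client_key: &ClientKey) {
    // Same body as the helper added to benches/utilities.rs above.
    let compressed_sks = CompressedServerKey::new(client_key);
    let sks = compressed_sks.decompress_to_gpu();
    rayon::broadcast(|_| set_server_key(sks.clone()));
    set_server_key(sks);
}

fn main() {
    let config = ConfigBuilder::default().build();
    let client_key = ClientKey::generate(config);
    configure_gpu(&client_key);

    // Any HLAPI operation now runs on the GPU server key.
    let a = FheUint64::encrypt(123u64, &client_key);
    let b = FheUint64::encrypt(456u64, &client_key);
    let sum = &a + &b;
    let clear: u64 = sum.decrypt(&client_key);
    assert_eq!(clear, 579);
}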