mirror of https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 22:57:59 -05:00

chore(gpu): write swap bench

.github/workflows/benchmark_dex.yml (vendored, new file, +143)
@@ -0,0 +1,143 @@
# Run all DEX benchmarks on an AWS instance and return parsed results to Slab CI bot.
name: DEX benchmarks

on:
  workflow_dispatch:
  schedule:
    # Weekly benchmarks will be triggered each Saturday at 5a.m.
    - cron: '0 5 * * 6'

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

jobs:
  setup-instance:
    name: Setup instance (dex-benchmarks)
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      runner-name: ${{ steps.start-instance.outputs.label }}
    steps:
      - name: Start instance
        id: start-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: aws
          profile: bench

  dex-benchmarks:
    name: Execute DEX benchmarks
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    concurrency:
      group: ${{ github.workflow_ref }}
      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
    continue-on-error: true
    timeout-minutes: 720 # 12 hours
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Get benchmark details
        run: |
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

      - name: Install rust
        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
        with:
          toolchain: nightly

      - name: Checkout Slab repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          repository: zama-ai/slab
          path: slab
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Run benchmarks
        run: |
          make bench_hlapi_dex

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
          --hardware "hpc7a.96xlarge" \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512

      - name: Parse PBS counts
        run: |
          python3 ./ci/benchmark_parser.py tfhe/dex_pbs_count.csv ${{ env.RESULTS_FILENAME }} \
          --object-sizes \
          --append-results

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
          name: ${{ github.sha }}_dex
          path: ${{ env.RESULTS_FILENAME }}

      - name: Send data to Slab
        shell: bash
        run: |
          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

      - name: Slack Notification
        if: ${{ failure() || (cancelled() && github.event_name != 'pull_request') }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "DEX benchmarks finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (dex-benchmarks)
    if: ${{ always() && needs.setup-instance.result == 'success' }}
    needs: [ setup-instance, dex-benchmarks ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (dex-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
.github/workflows/benchmark_gpu_dex.yml (vendored, new file, +44)
@@ -0,0 +1,44 @@
# Run CUDA DEX benchmarks on a Hyperstack VM and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks

on:
  workflow_dispatch:
    inputs:
      profile:
        description: "Instance type"
        required: true
        type: choice
        options:
          - "l40 (n3-L40x1)"
          - "single-h100 (n3-H100x1)"
          - "2-h100 (n3-H100x2)"
          - "4-h100 (n3-H100x4)"
          - "multi-h100 (n3-H100x8)"
          - "multi-h100-nvlink (n3-H100x8-NVLink)"
          - "multi-h100-sxm5 (n3-H100x8-SXM5)"

jobs:
  parse-inputs:
    runs-on: ubuntu-latest
    outputs:
      profile: ${{ steps.parse_profile.outputs.profile }}
      hardware_name: ${{ steps.parse_hardware_name.outputs.name }}
    steps:
      - name: Parse profile
        id: parse_profile
        run: |
          echo "profile=$(echo '${{ inputs.profile }}' | sed 's|\(.*\)[[:space:]](.*)|\1|')" >> "${GITHUB_OUTPUT}"

      - name: Parse hardware name
        id: parse_hardware_name
        run: |
          echo "name=$(echo '${{ inputs.profile }}' | sed 's|.*[[:space:]](\(.*\))|\1|')" >> "${GITHUB_OUTPUT}"

  run-benchmarks:
    name: Run benchmarks
    needs: parse-inputs
    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
    with:
      profile: ${{ needs.parse-inputs.outputs.profile }}
      hardware_name: ${{ needs.parse-inputs.outputs.hardware_name }}
    secrets: inherit
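For example, the dispatch input "single-h100 (n3-H100x1)" is split by the two sed expressions into profile=single-h100 and hardware_name=n3-H100x1, which are then forwarded to the common workflow below.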
.github/workflows/benchmark_gpu_dex_common.yml (vendored, new file, +201)
@@ -0,0 +1,201 @@
# Run DEX benchmarks on an instance with CUDA and return parsed results to Slab CI bot.
name: Cuda DEX benchmarks - common

on:
  workflow_call:
    inputs:
      backend:
        type: string
        default: hyperstack
      profile:
        type: string
        required: true
      hardware_name:
        type: string
        required: true
    secrets:
      REPO_CHECKOUT_TOKEN:
        required: true
      SLAB_ACTION_TOKEN:
        required: true
      SLAB_BASE_URL:
        required: true
      SLAB_URL:
        required: true
      JOB_SECRET:
        required: true
      SLACK_CHANNEL:
        required: true
      BOT_USERNAME:
        required: true
      SLACK_WEBHOOK:
        required: true

env:
  CARGO_TERM_COLOR: always
  RESULTS_FILENAME: parsed_benchmark_results_${{ github.sha }}.json
  PARSE_INTEGER_BENCH_CSV_FILE: tfhe_rs_integer_benches_${{ github.sha }}.csv
  ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
  RUST_BACKTRACE: "full"
  RUST_MIN_STACK: "8388608"
  SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
  SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
  SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
  SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}

jobs:
  setup-instance:
    name: Setup instance (cuda-dex-benchmarks)
    runs-on: ubuntu-latest
    if: github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'schedule' && github.repository == 'zama-ai/tfhe-rs')
    outputs:
      # Use the permanent remote instance label first, as the on-demand remote instance label output is set before the end of the start-remote-instance step.
      # If the latter fails due to a failed GitHub Actions runner setup, we have to fall back on the permanent instance.
      # Since the on-demand remote label is set before failure, we have to do the logical OR in this order,
      # otherwise we'll try to run the next job on a non-existing on-demand instance.
      runner-name: ${{ steps.use-permanent-instance.outputs.runner_group || steps.start-remote-instance.outputs.label }}
      remote-instance-outcome: ${{ steps.start-remote-instance.outcome }}
    steps:
      - name: Start remote instance
        id: start-remote-instance
        continue-on-error: true
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: start
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          backend: ${{ inputs.backend }}
          profile: ${{ inputs.profile }}

      - name: Acknowledge remote instance failure
        if: steps.start-remote-instance.outcome == 'failure' &&
          inputs.profile != 'single-h100'
        run: |
          echo "Remote instance has failed to start (profile provided: '${{ inputs.profile }}')"
          echo "Permanent instance cannot be used as a substitute (profile needed: 'single-h100')"
          exit 1

      # This allows falling back on permanent instances running on Hyperstack.
      - name: Use permanent remote instance
        id: use-permanent-instance
        if: env.SECRETS_AVAILABLE == 'true' &&
          steps.start-remote-instance.outcome == 'failure' &&
          inputs.profile == 'single-h100'
        run: |
          echo "runner_group=h100x1" >> "$GITHUB_OUTPUT"

  cuda-dex-benchmarks:
    name: Cuda DEX benchmarks (${{ inputs.profile }})
    needs: setup-instance
    runs-on: ${{ needs.setup-instance.outputs.runner-name }}
    strategy:
      fail-fast: false
      # explicit include-based build matrix, of known valid options
      matrix:
        include:
          - os: ubuntu-22.04
            cuda: "12.2"
            gcc: 11
    steps:
      - name: Checkout tfhe-rs repo with tags
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Setup Hyperstack dependencies
        if: needs.setup-instance.outputs.remote-instance-outcome == 'success'
        uses: ./.github/actions/gpu_setup
        with:
          cuda-version: ${{ matrix.cuda }}
          gcc-version: ${{ matrix.gcc }}

      - name: Get benchmark details
        run: |
          {
            echo "BENCH_DATE=$(date --iso-8601=seconds)";
            echo "COMMIT_DATE=$(git --no-pager show -s --format=%cd --date=iso8601-strict ${{ github.sha }})";
            echo "COMMIT_HASH=$(git describe --tags --dirty)";
          } >> "${GITHUB_ENV}"

      - name: Install rust
        uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
        with:
          toolchain: nightly

      - name: Run benchmarks
        run: |
          make bench_hlapi_dex_gpu

      - name: Parse results
        run: |
          python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \
          --database tfhe_rs \
          --hardware "${{ inputs.hardware_name }}" \
          --backend gpu \
          --project-version "${{ env.COMMIT_HASH }}" \
          --branch ${{ github.ref_name }} \
          --commit-date "${{ env.COMMIT_DATE }}" \
          --bench-date "${{ env.BENCH_DATE }}" \
          --walk-subdirs \
          --name-suffix avx512

      - name: Upload parsed results artifact
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
        with:
          name: ${{ github.sha }}_dex_${{ inputs.profile }}
          path: ${{ env.RESULTS_FILENAME }}

      - name: Checkout Slab repo
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          repository: zama-ai/slab
          path: slab
          persist-credentials: 'false'
          token: ${{ secrets.REPO_CHECKOUT_TOKEN }}

      - name: Send data to Slab
        shell: bash
        run: |
          python3 slab/scripts/data_sender.py ${{ env.RESULTS_FILENAME }} "${{ secrets.JOB_SECRET }}" \
          --slab-url "${{ secrets.SLAB_URL }}"

  slack-notify:
    name: Slack Notification
    needs: [ setup-instance, cuda-dex-benchmarks ]
    runs-on: ubuntu-latest
    if: ${{ always() && needs.cuda-dex-benchmarks.result != 'skipped' && failure() }}
    continue-on-error: true
    steps:
      - name: Send message
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ needs.cuda-dex-benchmarks.result }}
          SLACK_MESSAGE: "Cuda DEX benchmarks (${{ inputs.profile }}) finished with status: ${{ needs.cuda-dex-benchmarks.result }}. (${{ env.ACTION_RUN_URL }})"

  teardown-instance:
    name: Teardown instance (cuda-dex-${{ inputs.profile }}-benchmarks)
    if: ${{ always() && needs.setup-instance.outputs.remote-instance-outcome == 'success' }}
    needs: [ setup-instance, cuda-dex-benchmarks, slack-notify ]
    runs-on: ubuntu-latest
    steps:
      - name: Stop instance
        id: stop-instance
        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
        with:
          mode: stop
          github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
          slab-url: ${{ secrets.SLAB_BASE_URL }}
          job-secret: ${{ secrets.JOB_SECRET }}
          label: ${{ needs.setup-instance.outputs.runner-name }}

      - name: Slack Notification
        if: ${{ failure() }}
        continue-on-error: true
        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
        env:
          SLACK_COLOR: ${{ job.status }}
          SLACK_MESSAGE: "Instance teardown (cuda-dex-${{ inputs.profile }}-benchmarks) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"
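For example, if the on-demand start fails while the requested profile is single-h100, the use-permanent-instance step sets runner_group=h100x1 and the || expression in runner-name resolves to that permanent runner group; for any other profile the acknowledge step fails the job instead.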
.github/workflows/benchmark_gpu_dex_weekly.yml (vendored, new file, +35)
@@ -0,0 +1,35 @@
# Run CUDA DEX benchmarks on multiple Hyperstack VMs and return parsed results to Slab CI bot.
name: Cuda DEX weekly benchmarks

on:
  schedule:
    # Weekly benchmarks will be triggered each Saturday at 9a.m.
    - cron: '0 9 * * 6'

jobs:
  run-benchmarks-1-h100:
    name: Run benchmarks (1xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
    with:
      profile: single-h100
      hardware_name: n3-H100x1
    secrets: inherit

  run-benchmarks-2-h100:
    name: Run benchmarks (2xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
    with:
      profile: 2-h100
      hardware_name: n3-H100x2
    secrets: inherit

  run-benchmarks-8-h100:
    name: Run benchmarks (8xH100)
    if: github.repository == 'zama-ai/tfhe-rs'
    uses: ./.github/workflows/benchmark_gpu_dex_common.yml
    with:
      profile: multi-h100
      hardware_name: n3-H100x8
    secrets: inherit
Makefile (+12)
@@ -1293,6 +1293,18 @@ bench_hlapi_erc20_gpu: install_rs_check_toolchain
 	--bench hlapi-erc20 \
 	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
 
+.PHONY: bench_hlapi_dex # Run benchmarks for DEX operations
+bench_hlapi_dex: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+
+.PHONY: bench_hlapi_dex_gpu # Run benchmarks for DEX operations on GPU
+bench_hlapi_dex_gpu: install_rs_check_toolchain
+	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_CHECK_TOOLCHAIN) bench \
+	--bench hlapi-dex \
+	--features=integer,gpu,internal-keycache,pbs-stats,nightly-avx512 -p $(TFHE_SPEC) --
+
 .PHONY: bench_tfhe_zk_pok # Run benchmarks for the tfhe_zk_pok crate
 bench_tfhe_zk_pok: install_rs_check_toolchain
 	RUSTFLAGS="$(RUSTFLAGS)" \
tfhe/Cargo.toml
@@ -254,6 +254,12 @@ path = "benches/high_level_api/erc20.rs"
 harness = false
 required-features = ["integer", "internal-keycache"]
 
+[[bench]]
+name = "hlapi-dex"
+path = "benches/high_level_api/dex.rs"
+harness = false
+required-features = ["integer", "internal-keycache"]
+
 [[bench]]
 name = "keygen"
 path = "benches/keygen/bench.rs"
tfhe/benches/high_level_api/dex.rs (new file, +540)
@@ -0,0 +1,540 @@
#[path = "../utilities.rs"]
mod utilities;

#[cfg(feature = "gpu")]
use crate::utilities::configure_gpu;
use crate::utilities::{write_to_json, OperatorType};

use criterion::measurement::WallTime;
use criterion::{BenchmarkGroup, Criterion};
use rand::prelude::*;
use rand::thread_rng;
use std::ops::{Add, Div, Mul, Sub};
use tfhe::keycache::NamedParam;
use tfhe::prelude::*;
use tfhe::shortint::parameters::*;
#[cfg(not(feature = "gpu"))]
use tfhe::{set_server_key, CompressedServerKey};
use tfhe::{ClientKey, ConfigBuilder, FheBool, FheUint128, FheUint64};

pub(crate) fn transfer_whitepaper<FheType>(
    from_amount: &FheType,
    to_amount: &FheType,
    amount: &FheType,
) -> (FheType, FheType)
where
    FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType>,
    FheBool: IfThenElse<FheType>,
    for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
    let has_enough_funds = (from_amount).ge(amount);

    let mut new_to_amount = to_amount + amount;
    new_to_amount = has_enough_funds.if_then_else(&new_to_amount, to_amount);

    let mut new_from_amount = from_amount - amount;
    new_from_amount = has_enough_funds.if_then_else(&new_from_amount, from_amount);

    (new_from_amount, new_to_amount)
}

#[allow(clippy::too_many_arguments)]
fn swap_request<FheType>(
    from_balance_0: &FheType,
    from_balance_1: &FheType,
    current_dex_balance_0: &FheType,
    current_dex_balance_1: &FheType,
    to_balance_0: &FheType,
    to_balance_1: &FheType,
    total_dex_token_0_in: &FheType,
    total_dex_token_1_in: &FheType,
    amount0: &FheType,
    amount1: &FheType,
) -> (FheType, FheType, FheType, FheType)
where
    FheType: Add<Output = FheType> + for<'a> FheOrd<&'a FheType> + Clone,
    FheBool: IfThenElse<FheType>,
    for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
    let (_, new_current_balance_0) =
        transfer_whitepaper(from_balance_0, current_dex_balance_0, amount0);
    let (_, new_current_balance_1) =
        transfer_whitepaper(from_balance_1, current_dex_balance_1, amount1);
    let sent0 = &new_current_balance_0 - current_dex_balance_0;
    let sent1 = &new_current_balance_1 - current_dex_balance_1;
    let pending_0_in = to_balance_0 + &sent0;
    let pending_total_token_0_in = total_dex_token_0_in + &sent0;
    let pending_1_in = to_balance_1 + &sent1;
    let pending_total_token_1_in = total_dex_token_1_in + &sent1;
    (
        pending_0_in,
        pending_total_token_0_in,
        pending_1_in,
        pending_total_token_1_in,
    )
}

#[allow(clippy::too_many_arguments)]
fn swap_claim<FheType, BigFheType>(
    pending_0_in: &FheType,
    pending_1_in: &FheType,
    total_dex_token_0_in: u64,
    total_dex_token_1_in: u64,
    total_dex_token_0_out: u64,
    total_dex_token_1_out: u64,
    old_balance_0: &FheType,
    old_balance_1: &FheType,
    current_dex_balance_0: &FheType,
    current_dex_balance_1: &FheType,
) -> (FheType, FheType)
where
    FheType: CastFrom<FheBool>
        + for<'a> FheOrd<&'a FheType>
        + CastFrom<BigFheType>
        + Clone
        + Add<Output = FheType>,
    BigFheType: CastFrom<FheType> + Mul<u128, Output = BigFheType> + Div<u128, Output = BigFheType>,
    FheBool: IfThenElse<FheType>,
    for<'a> &'a FheType: Add<Output = FheType> + Sub<Output = FheType>,
{
    let mut new_balance_0 = old_balance_0.clone();
    let mut new_balance_1 = old_balance_1.clone();
    if total_dex_token_1_in != 0 {
        let big_pending_1_in = BigFheType::cast_from(pending_1_in.clone());
        let big_amount_0_out =
            (big_pending_1_in * total_dex_token_0_out as u128) / total_dex_token_1_in as u128;
        let amount_0_out = FheType::cast_from(big_amount_0_out);
        let (_, new_balance_0_tmp) =
            transfer_whitepaper(current_dex_balance_0, old_balance_0, &amount_0_out);
        new_balance_0 = new_balance_0_tmp;
    }
    if total_dex_token_0_in != 0 {
        let big_pending_0_in = BigFheType::cast_from(pending_0_in.clone());
        let big_amount_1_out =
            (big_pending_0_in * total_dex_token_1_out as u128) / total_dex_token_0_in as u128;
        let amount_1_out = FheType::cast_from(big_amount_1_out);
        let (_, new_balance_1_tmp) =
            transfer_whitepaper(current_dex_balance_1, old_balance_1, &amount_1_out);
        new_balance_1 = new_balance_1_tmp;
    }

    (new_balance_0, new_balance_1)
}

#[cfg(feature = "pbs-stats")]
mod pbs_stats {
    use super::*;
    use std::fs::{File, OpenOptions};
    use std::io::Write;
    use std::path::Path;

    fn write_result(file: &mut File, name: &str, value: usize) {
        let line = format!("{name},{value}\n");
        let error_message = format!("cannot write {name} result into file");
        file.write_all(line.as_bytes()).expect(&error_message);
    }

    pub fn print_swap_request_pbs_counts<FheType, F>(
        client_key: &ClientKey,
        type_name: &str,
        swap_request_func: F,
    ) where
        FheType: FheEncrypt<u64, ClientKey>,
        F: for<'a> Fn(
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
        ) -> (FheType, FheType, FheType, FheType),
    {
        let mut rng = thread_rng();

        let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_dex_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_dex_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);

        #[cfg(feature = "gpu")]
        configure_gpu(client_key);

        tfhe::reset_pbs_count();
        let (_, _, _, _) = swap_request_func(
            &from_balance_0,
            &from_balance_1,
            &current_dex_balance_0,
            &current_dex_balance_1,
            &to_balance_0,
            &to_balance_1,
            &total_dex_token_0,
            &total_dex_token_1,
            &amount_0,
            &amount_1,
        );
        let count = tfhe::get_pbs_count();

        println!("DEX swap request::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();

        let test_name = if cfg!(feature = "gpu") {
            format!("hlapi::cuda::dex::swap_request::pbs_count::{type_name}")
        } else {
            format!("hlapi::dex::swap_request::pbs_count::{type_name}")
        };

        let results_file = Path::new("dex_swap_request_pbs_count.csv");
        if !results_file.exists() {
            File::create(results_file).expect("create results file failed");
        }
        let mut file = OpenOptions::new()
            .append(true)
            .open(results_file)
            .expect("cannot open results file");

        write_result(&mut file, &test_name, count as usize);

        write_to_json::<u64, _>(
            &test_name,
            params,
            params.name(),
            "pbs-count",
            &OperatorType::Atomic,
            0,
            vec![],
        );
    }

    pub fn print_swap_claim_pbs_counts<FheType, F>(
        client_key: &ClientKey,
        type_name: &str,
        swap_claim_func: F,
    ) where
        FheType: FheEncrypt<u64, ClientKey>,
        F: for<'a> Fn(
            &'a FheType,
            &'a FheType,
            u64,
            u64,
            u64,
            u64,
            &'a FheType,
            &'a FheType,
            &'a FheType,
            &'a FheType,
        ) -> (FheType, FheType),
    {
        let mut rng = thread_rng();

        let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
        let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_dex_token_0_in = rng.gen::<u64>();
        let total_dex_token_1_in = rng.gen::<u64>();
        let total_dex_token_0_out = rng.gen::<u64>();
        let total_dex_token_1_out = rng.gen::<u64>();
        let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_dex_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_dex_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);

        #[cfg(feature = "gpu")]
        configure_gpu(client_key);

        tfhe::reset_pbs_count();
        let (_, _) = swap_claim_func(
            &pending_0_in,
            &pending_1_in,
            total_dex_token_0_in,
            total_dex_token_1_in,
            total_dex_token_0_out,
            total_dex_token_1_out,
            &old_balance_0,
            &old_balance_1,
            &current_dex_balance_0,
            &current_dex_balance_1,
        );
        let count = tfhe::get_pbs_count();

        println!("DEX swap claim::{type_name}: {count} PBS");

        let params = client_key.computation_parameters();

        let test_name = if cfg!(feature = "gpu") {
            format!("hlapi::cuda::dex::swap_claim::pbs_count::{type_name}")
        } else {
            format!("hlapi::dex::swap_claim::pbs_count::{type_name}")
        };

        let results_file = Path::new("dex_swap_claim_pbs_count.csv");
        if !results_file.exists() {
            File::create(results_file).expect("create results file failed");
        }
        let mut file = OpenOptions::new()
            .append(true)
            .open(results_file)
            .expect("cannot open results file");

        write_result(&mut file, &test_name, count as usize);

        write_to_json::<u64, _>(
            &test_name,
            params,
            params.name(),
            "pbs-count",
            &OperatorType::Atomic,
            0,
            vec![],
        );
    }
}

fn bench_swap_request_latency<FheType, F>(
    c: &mut BenchmarkGroup<'_, WallTime>,
    client_key: &ClientKey,
    bench_name: &str,
    type_name: &str,
    fn_name: &str,
    swap_request_func: F,
) where
    FheType: FheEncrypt<u64, ClientKey>,
    F: for<'a> Fn(
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
    ) -> (FheType, FheType, FheType, FheType),
{
    #[cfg(feature = "gpu")]
    configure_gpu(client_key);

    let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

        let from_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let from_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let to_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let to_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_token_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_token_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let amount_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let amount_1 = FheType::encrypt(rng.gen::<u64>(), client_key);

        b.iter(|| {
            let (_, _, _, _) = swap_request_func(
                &from_balance_0,
                &from_balance_1,
                &current_balance_0,
                &current_balance_1,
                &to_balance_0,
                &to_balance_1,
                &total_token_0,
                &total_token_1,
                &amount_0,
                &amount_1,
            );
        })
    });

    let params = client_key.computation_parameters();

    write_to_json::<u64, _>(
        &bench_id,
        params,
        params.name(),
        "dex-swap-request",
        &OperatorType::Atomic,
        64,
        vec![],
    );
}

fn bench_swap_claim_latency<FheType, F>(
    c: &mut BenchmarkGroup<'_, WallTime>,
    client_key: &ClientKey,
    bench_name: &str,
    type_name: &str,
    fn_name: &str,
    swap_claim_func: F,
) where
    FheType: FheEncrypt<u64, ClientKey>,
    F: for<'a> Fn(
        &'a FheType,
        &'a FheType,
        u64,
        u64,
        u64,
        u64,
        &'a FheType,
        &'a FheType,
        &'a FheType,
        &'a FheType,
    ) -> (FheType, FheType),
{
    #[cfg(feature = "gpu")]
    configure_gpu(client_key);

    let bench_id = format!("{bench_name}::{fn_name}::{type_name}");
    c.bench_function(&bench_id, |b| {
        let mut rng = thread_rng();

        let pending_0_in = FheType::encrypt(rng.gen::<u64>(), client_key);
        let pending_1_in = FheType::encrypt(rng.gen::<u64>(), client_key);
        let total_token_0_in = rng.gen::<u64>();
        let total_token_1_in = rng.gen::<u64>();
        let total_token_0_out = rng.gen::<u64>();
        let total_token_1_out = rng.gen::<u64>();
        let old_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let old_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_balance_0 = FheType::encrypt(rng.gen::<u64>(), client_key);
        let current_balance_1 = FheType::encrypt(rng.gen::<u64>(), client_key);

        b.iter(|| {
            let (_, _) = swap_claim_func(
                &pending_0_in,
                &pending_1_in,
                total_token_0_in,
                total_token_1_in,
                total_token_0_out,
                total_token_1_out,
                &old_balance_0,
                &old_balance_1,
                &current_balance_0,
                &current_balance_1,
            );
        })
    });

    let params = client_key.computation_parameters();

    write_to_json::<u64, _>(
        &bench_id,
        params,
        params.name(),
        "dex-swap-claim",
        &OperatorType::Atomic,
        64,
        vec![],
    );
}

#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_claim_pbs_counts;
#[cfg(feature = "pbs-stats")]
use crate::pbs_stats::print_swap_request_pbs_counts;

#[cfg(not(feature = "gpu"))]
fn main() {
    let params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

    let config = ConfigBuilder::with_custom_parameters(params).build();
    let cks = ClientKey::generate(config);
    let compressed_sks = CompressedServerKey::new(&cks);

    let sks = compressed_sks.decompress();

    rayon::broadcast(|_| set_server_key(sks.clone()));
    set_server_key(sks);

    let mut c = Criterion::default().sample_size(10).configure_from_args();

    let bench_name = "hlapi::dex";

    // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
    // PBS count is always the same
    #[cfg(feature = "pbs-stats")]
    {
        print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
        print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
    }

    // FheUint64 latency
    {
        let mut group = c.benchmark_group(bench_name);
        bench_swap_request_latency(
            &mut group,
            &cks,
            bench_name,
            "FheUint64",
            "swap_request",
            swap_request::<FheUint64>,
        );
        bench_swap_claim_latency(
            &mut group,
            &cks,
            bench_name,
            "FheUint64",
            "swap_claim",
            swap_claim::<FheUint64, FheUint128>,
        );

        group.finish();
    }

    c.final_summary();
}

#[cfg(feature = "gpu")]
fn main() {
    let params = PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS;

    let config = ConfigBuilder::with_custom_parameters(params).build();
    let cks = ClientKey::generate(config);

    let mut c = Criterion::default().sample_size(10).configure_from_args();

    let bench_name = "hlapi::cuda::dex";

    // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
    // PBS count is always the same
    #[cfg(feature = "pbs-stats")]
    {
        print_swap_request_pbs_counts(&cks, "FheUint64", swap_request::<FheUint64>);
        print_swap_claim_pbs_counts(&cks, "FheUint64", swap_claim::<FheUint64, FheUint128>);
    }

    // FheUint64 latency
    {
        let mut group = c.benchmark_group(bench_name);
        bench_swap_request_latency(
            &mut group,
            &cks,
            bench_name,
            "FheUint64",
            "swap_request",
            swap_request::<FheUint64>,
        );
        bench_swap_claim_latency(
            &mut group,
            &cks,
            bench_name,
            "FheUint64",
            "swap_claim",
            swap_claim::<FheUint64, FheUint128>,
        );

        group.finish();
    }

    c.final_summary();
}
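For reference only, and not part of the commit above, the clear-integer arithmetic that swap_request and swap_claim evaluate homomorphically can be sketched as follows; the function names and the example values are illustrative, not taken from the repository.

// Illustrative clear-integer model of the homomorphic DEX circuits benchmarked above.
fn transfer_whitepaper_clear(from: u64, to: u64, amount: u64) -> (u64, u64) {
    // Same semantics as the FHE version: the transfer is a no-op when the
    // sender cannot cover `amount`.
    if from >= amount {
        (from - amount, to + amount)
    } else {
        (from, to)
    }
}

fn swap_request_clear(
    from_0: u64, from_1: u64,
    dex_0: u64, dex_1: u64,
    to_0: u64, to_1: u64,
    total_0_in: u64, total_1_in: u64,
    amount_0: u64, amount_1: u64,
) -> (u64, u64, u64, u64) {
    // Move the user's funds into the DEX balances, then credit whatever was
    // actually sent to the pending and total "in" accumulators.
    let (_, new_dex_0) = transfer_whitepaper_clear(from_0, dex_0, amount_0);
    let (_, new_dex_1) = transfer_whitepaper_clear(from_1, dex_1, amount_1);
    let sent_0 = new_dex_0 - dex_0; // amount_0 if funded, else 0
    let sent_1 = new_dex_1 - dex_1;
    (to_0 + sent_0, total_0_in + sent_0, to_1 + sent_1, total_1_in + sent_1)
}

fn swap_claim_clear(
    pending_0_in: u64, pending_1_in: u64,
    total_0_in: u64, total_1_in: u64,
    total_0_out: u64, total_1_out: u64,
    old_0: u64, old_1: u64,
    dex_0: u64, dex_1: u64,
) -> (u64, u64) {
    let mut new_0 = old_0;
    let mut new_1 = old_1;
    if total_1_in != 0 {
        // Pro-rata share, widened to u128 like the FheUint64 -> FheUint128 cast.
        let amount_0_out =
            (pending_1_in as u128 * total_0_out as u128 / total_1_in as u128) as u64;
        new_0 = transfer_whitepaper_clear(dex_0, old_0, amount_0_out).1;
    }
    if total_0_in != 0 {
        let amount_1_out =
            (pending_0_in as u128 * total_1_out as u128 / total_0_in as u128) as u64;
        new_1 = transfer_whitepaper_clear(dex_1, old_1, amount_1_out).1;
    }
    (new_0, new_1)
}

fn main() {
    // Example: the user can cover 10 of token 0 but not 5 of token 1,
    // so only the token-0 leg is credited.
    let (p0, t0, p1, t1) = swap_request_clear(100, 0, 1_000, 1_000, 0, 0, 0, 0, 10, 5);
    assert_eq!((p0, t0), (10, 10));
    assert_eq!((p1, t1), (0, 0));
}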
tfhe/benches/high_level_api/erc20.rs
@@ -1,6 +1,8 @@
 #[path = "../utilities.rs"]
 mod utilities;
 
+#[cfg(feature = "gpu")]
+use crate::utilities::configure_gpu;
 use crate::utilities::{write_to_json, OperatorType};
 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, Throughput};
@@ -18,7 +20,7 @@ use tfhe::{set_server_key, ClientKey, CompressedServerKey, ConfigBuilder, FheBoo
 /// Transfer as written in the original FHEvm white-paper,
 /// it uses a comparison to check if the sender has enough,
 /// and cmuxes based on the comparison result
-fn transfer_whitepaper<FheType>(
+pub fn transfer_whitepaper<FheType>(
     from_amount: &FheType,
     to_amount: &FheType,
     amount: &FheType,
@@ -177,13 +179,6 @@ mod pbs_stats {
     }
 }
 
-#[cfg(feature = "gpu")]
-fn configure_gpu(client_key: &ClientKey) {
-    let compressed_sks = CompressedServerKey::new(client_key);
-    let sks = compressed_sks.decompress_to_gpu();
-    rayon::broadcast(|_| set_server_key(sks.clone()));
-    set_server_key(sks);
-}
 fn bench_transfer_latency<FheType, F>(
     c: &mut BenchmarkGroup<'_, WallTime>,
     client_key: &ClientKey,
@@ -383,7 +378,7 @@ fn main() {
 
     let mut c = Criterion::default().sample_size(10).configure_from_args();
 
-    let bench_name = "hlapi::erc20::transfer";
+    let bench_name = "hlapi::erc20";
 
     // FheUint64 PBS counts
     // We don't run multiple times since every input is encrypted
@@ -393,14 +388,14 @@ fn main() {
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
@@ -414,7 +409,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_latency(
@@ -422,7 +417,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_latency(
@@ -430,7 +425,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_latency(
@@ -438,7 +433,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
 
@@ -453,7 +448,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -461,7 +456,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -469,7 +464,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_throughput(
@@ -477,9 +472,10 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
 
         group.finish();
     }
+
@@ -495,7 +491,7 @@ fn main() {
 
     let mut c = Criterion::default().sample_size(10).configure_from_args();
 
-    let bench_name = "hlapi::cuda::erc20::transfer";
+    let bench_name = "hlapi::cuda::erc20";
 
     // FheUint64 PBS counts
    // We don't run multiple times since every input is encrypted
@@ -505,14 +501,14 @@ fn main() {
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "no_cmux", transfer_no_cmux::<FheUint64>);
         print_transfer_pbs_counts(
             &cks,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         print_transfer_pbs_counts(&cks, "FheUint64", "safe", transfer_safe::<FheUint64>);
@@ -526,7 +522,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         bench_transfer_latency(
@@ -534,7 +530,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         bench_transfer_latency(
@@ -542,7 +538,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         bench_transfer_latency(
@@ -550,7 +546,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
 
@@ -565,7 +561,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "whitepaper",
+            "transfer::whitepaper",
             transfer_whitepaper::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -573,7 +569,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "no_cmux",
+            "transfer::no_cmux",
             transfer_no_cmux::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -581,7 +577,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "overflow",
+            "transfer::overflow",
             transfer_overflow::<FheUint64>,
         );
         cuda_bench_transfer_throughput(
@@ -589,7 +585,7 @@ fn main() {
             &cks,
             bench_name,
             "FheUint64",
-            "safe",
+            "transfer::safe",
             transfer_safe::<FheUint64>,
         );
         group.finish();
tfhe/benches/utilities.rs
@@ -655,6 +655,7 @@ mod cuda_utils {
     use tfhe::core_crypto::gpu::{get_number_of_gpus, CudaStreams};
     use tfhe::core_crypto::prelude::{Numeric, UnsignedInteger};
     use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;
+    use tfhe::{set_server_key, ClientKey, CompressedServerKey};
 
     #[allow(dead_code)]
     pub const GPU_MAX_SUPPORTED_POLYNOMIAL_SIZE: usize = 16384;
@@ -879,6 +880,13 @@
     }
 }
 
+#[allow(dead_code)]
+pub fn configure_gpu(client_key: &ClientKey) {
+    let compressed_sks = CompressedServerKey::new(client_key);
+    let sks = compressed_sks.decompress_to_gpu();
+    rayon::broadcast(|_| set_server_key(sks.clone()));
+    set_server_key(sks);
+}
 #[allow(unused_imports)]
 #[cfg(feature = "integer")]
 pub use cuda_integer_utils::*;