chore(common): Merge branch 'main' into release/2.7.x

Author: Bourgerie Quentin
Date: 2024-08-13 14:12:19 +02:00
252 changed files with 9166 additions and 6430 deletions

View File

@@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Check first line
uses: gsactions/commit-message-checker@v1
uses: gsactions/commit-message-checker@16fa2d5de096ae0d35626443bcd24f1e756cafee # v2.0.0
with:
pattern: '^(feat|fix|test|bench|docs|chore|refactor|perf)\((compiler|backend|frontend|optimizer|tools|ci|common).*\): '
flags: 'gs'
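For illustration, a minimal sketch of what this first-line check accepts and rejects, assuming the Rust `regex` crate (the action itself applies the same pattern with the `gs` flags shown above):

```rust
use regex::Regex;

fn main() {
    // Same pattern as in the workflow above.
    let pattern = Regex::new(
        r"^(feat|fix|test|bench|docs|chore|refactor|perf)\((compiler|backend|frontend|optimizer|tools|ci|common).*\): ",
    )
    .unwrap();

    // First line of this very commit: accepted.
    assert!(pattern.is_match("chore(common): Merge branch 'main' into release/2.7.x"));
    // Missing scope: rejected.
    assert!(!pattern.is_match("fix: a typo"));
}
```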

View File

@@ -47,13 +47,8 @@ jobs:
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
# SSH private key is required as some dependencies are from private repos
- uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Fetch submodules
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
submodules: recursive
@@ -100,7 +95,7 @@ jobs:
make ${{ env.BENCHMARK_TARGET }}
- name: Upload raw results artifact
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: compiler_${{ github.sha }}_raw
path: compilers/concrete-compiler/compiler/benchmarks_results.json
@@ -120,13 +115,13 @@ jobs:
--throughput
- name: Upload parsed results artifact
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: compiler_${{ github.sha }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: zama-ai/slab
path: slab

View File

@@ -45,12 +45,6 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
echo "Matrix item: ${{ inputs.matrix_item }}"
# A SSH private key is required as some dependencies are from private repos
- name: Set up SSH agent
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Set up env
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
@@ -58,7 +52,7 @@ jobs:
echo "SSH_AUTH_SOCK_DIR=$(dirname $SSH_AUTH_SOCK)" >> "${GITHUB_ENV}"
- name: Fetch repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -70,7 +64,7 @@ jobs:
run: mkdir build
- name: Build compiler
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler
with:
registry: ghcr.io
@@ -86,16 +80,16 @@ jobs:
${{ env.DOCKER_GPU_OPTION }}
shell: bash
run: |
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
set -e
cd /concrete/compilers/concrete-compiler/compiler
rm -rf /build/*
make DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build all build-end-to-end-dataflow-tests
make DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build all
echo "Debug: ccache statistics (after the build):"
ccache -s
- name: Build compiler Dialects docs and check diff
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler-docs
with:
registry: ghcr.io
@@ -127,7 +121,7 @@ jobs:
run: echo "MINIMAL_TESTS=ON" >> $GITHUB_ENV
- name: Test compiler
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
with:
registry: ghcr.io
image: ${{ env.DOCKER_IMAGE_TEST }}
@@ -140,13 +134,13 @@ jobs:
shell: bash
run: |
set -e
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
cd /concrete/compilers/concrete-compiler/compiler
pip install pytest
dnf install -y libzstd libzstd-devel
sed "s/pytest/python -m pytest/g" -i Makefile
mkdir -p /tmp/concrete_compiler/gpu_tests/
make MINIMAL_TESTS=${{ env.MINIMAL_TESTS }} DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build run-tests run-end-to-end-dataflow-tests
make MINIMAL_TESTS=${{ env.MINIMAL_TESTS }} DATAFLOW_EXECUTION_ENABLED=ON CCACHE=ON Python3_EXECUTABLE=$PYTHON_EXEC BUILD_DIR=/build run-tests
chmod -R ugo+rwx /tmp/KeySetCache
- name: Analyze logs
@@ -155,7 +149,7 @@ jobs:
ls -1 | xargs grep -H "WARNING RETRY" | sed -e "s/.log.*//g" | uniq -c | sed -re "s/ *([0-9]*) (.*)/::warning ::Test \2 retried \1 times/g" | cat
# - name: Archive python package
# uses: actions/upload-artifact@v3
# uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
# with:
# name: concrete-compiler.whl
# path: build/wheels/concrete_compiler-*-manylinux_{{ env.GLIB_VER }}_x86_64.whl

View File

@@ -47,7 +47,7 @@ jobs:
docker system prune -af
- name: Fetch repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
submodules: recursive
@@ -74,13 +74,14 @@ jobs:
cd compilers/concrete-compiler/compiler
rm -rf /shared/build
make HPX_DIR=/shared/hpx install-hpx-from-source
make HPX_DIR=/shared/hpx BUILD_DIR=/shared/build CCACHE=ON DATAFLOW_EXECUTION_ENABLED=ON BINDINGS_PYTHON_ENABLED=OFF CUDA_SUPPORT=${{ env.CUDA_SUPPORT }} build-end-to-end-dataflow-tests
make HPX_DIR=/shared/hpx BUILD_DIR=/shared/build CCACHE=ON DATAFLOW_EXECUTION_ENABLED=ON BINDINGS_PYTHON_ENABLED=OFF CUDA_SUPPORT=${{ env.CUDA_SUPPORT }} build-end-to-end-tests
- name: Run end-to-end benchmarks
run: |
set -e
cd compilers/concrete-compiler/compiler
make BUILD_DIR=/shared/build run-end-to-end-distributed-tests
rm -rf /shared/KeyCache
make BUILD_DIR=/shared/build KEY_CACHE_DIRECTORY=/shared/KeyCache run-end-to-end-distributed-tests
- name: Instance cleanup
run: |

View File

@@ -45,12 +45,6 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
echo "Matrix item: ${{ inputs.matrix_item }}"
# A SSH private key is required as some dependencies are from private repos
- name: Set up SSH agent
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Set up env
# "Install rust" step require root user to have a HOME directory which is not set.
run: |
@@ -58,7 +52,7 @@ jobs:
echo "SSH_AUTH_SOCK_DIR=$(dirname $SSH_AUTH_SOCK)" >> "${GITHUB_ENV}"
- name: Fetch repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -70,7 +64,7 @@ jobs:
run: mkdir build
- name: Build and test compiler
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler
with:
registry: ghcr.io
@@ -86,7 +80,7 @@ jobs:
--gpus all
shell: bash
run: |
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
set -e
cd /concrete/compilers/concrete-compiler/compiler
rm -rf /build/*

View File

@@ -8,7 +8,7 @@ jobs:
FormattingAndLinting:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Format with clang-format (Cpp)
run: |
sudo apt install moreutils
@@ -34,6 +34,6 @@ jobs:
CheckLicense:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Check if sources include the license header
run: .github/workflows/scripts/check_for_license.sh

View File

@@ -24,12 +24,7 @@ jobs:
runson: ["aws-mac1-metal", "aws-mac2-metal"]
runs-on: ${{ matrix.runson }}
steps:
# A SSH private key is required as some dependencies are from private repos
- uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -45,7 +40,7 @@ jobs:
- name: Cache compilation (push)
if: github.event_name == 'push'
uses: actions/cache@v3
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: /Users/runner/Library/Caches/ccache
key: ${{ runner.os }}-${{ runner.arch }}-compilation-cache-${{ github.sha }}
@@ -54,7 +49,7 @@ jobs:
- name: Cache compilation (pull_request)
if: github.event_name == 'pull_request'
uses: actions/cache@v3
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: /Users/runner/Library/Caches/ccache
key: ${{ runner.os }}-${{ runner.arch }}-compilation-cache-${{ github.event.pull_request.base.sha }}

View File

@@ -49,12 +49,8 @@ jobs:
echo "AMI: ${{ inputs.instance_image_id }}"
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
# SSH private key is required as some dependencies are from private repos
- uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -66,7 +62,6 @@ jobs:
- name: Build Image
run: |
DOCKER_BUILDKIT=1 docker build --no-cache \
--ssh default=${{ env.SSH_AUTH_SOCK }} \
--label "commit-sha=${{ github.sha }}" -t ${{ matrix.image }} -f ${{ matrix.dockerfile }} .
- name: Tag and Publish Image
@@ -88,7 +83,7 @@ jobs:
IMAGE: ghcr.io/zama-ai/hpx
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
@@ -98,7 +93,7 @@ jobs:
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v44
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c # v44.5.24
- name: Login
id: login
@@ -127,7 +122,7 @@ jobs:
dockerfile: docker/Dockerfile.cuda-118-env
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
@@ -137,7 +132,7 @@ jobs:
- name: Get changed files
id: changed-files
uses: tj-actions/changed-files@v44
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c # v44.5.24
- name: Login
id: login

View File

@@ -16,17 +16,17 @@ jobs:
env:
RUSTFLAGS: -D warnings
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Rust install
uses: actions-rs/toolchain@v1
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
with:
toolchain: nightly-2024-01-31
toolchain: nightly-2024-07-01
override: true
components: rustfmt, clippy
- name: Download cargo cache
uses: Swatinem/rust-cache@v2
uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # v2.7.3
- name: Formatting
run: |
@@ -63,13 +63,13 @@ jobs:
env:
RUSTFLAGS: -D warnings
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Setup rust toolchain for concrete-cpu
uses: ./.github/workflows/setup_rust_toolchain_for_concrete_cpu
- name: Download cargo cache
uses: Swatinem/rust-cache@v2
uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # v2.7.3
- name: Tests
run: |
@@ -81,13 +81,13 @@ jobs:
env:
RUSTFLAGS: -D warnings
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Setup rust toolchain for concrete-cpu
uses: ./.github/workflows/setup_rust_toolchain_for_concrete_cpu
- name: Download cargo cache
uses: Swatinem/rust-cache@v2
uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # v2.7.3
- name: Tests
run: |

View File

@@ -38,11 +38,6 @@ jobs:
echo "Request ID: ${{ inputs.request_id }}"
echo "User Inputs: ${{ inputs.user_inputs }}"
- name: Set up SSH agent
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Set up GitHub environment
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
@@ -50,7 +45,7 @@ jobs:
echo "SSH_AUTH_SOCK_DIR=$(dirname $SSH_AUTH_SOCK)" >> "${GITHUB_ENV}"
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -65,7 +60,7 @@ jobs:
run: mkdir build
- name: Build wheel
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler-bindings
with:
registry: ghcr.io
@@ -104,7 +99,7 @@ jobs:
deactivate
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
with:
python-version: ${{ matrix.python-version }}

View File

@@ -0,0 +1,125 @@
name: Concrete Python Benchmark
on:
workflow_dispatch:
schedule:
- cron: "0 1 * * SAT"
env:
DOCKER_IMAGE: ghcr.io/zama-ai/concrete-compiler
GLIB_VER: 2_28
jobs:
setup-instance:
name: Setup Instance
runs-on: ubuntu-latest
outputs:
runner-name: ${{ steps.start-instance.outputs.label }}
steps:
- name: Start instance
id: start-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: start
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
backend: aws
profile: m7i-cpu-bench
concrete-python-benchmarks:
name: Run Concrete Python Benchmarks
needs: setup-instance
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
fetch-depth: 0
- name: Benchmark
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler-bindings
with:
registry: ghcr.io
image: ${{ env.DOCKER_IMAGE }}
username: ${{ secrets.GHCR_LOGIN }}
password: ${{ secrets.GHCR_PASSWORD }}
options: >-
-v ${{ github.workspace }}:/concrete
-v ${{ github.workspace }}/build:/build
-v ${{ env.SSH_AUTH_SOCK }}:/ssh.socket
-e SSH_AUTH_SOCK=/ssh.socket
${{ env.DOCKER_GPU_OPTION }}
shell: bash
run: |
set -e
rustup toolchain install nightly-2024-07-01
rm -rf /build/*
export PYTHON=${{ format('python{0}', matrix.python-version) }}
echo "Using $PYTHON"
dnf -y install graphviz graphviz-devel
cd /concrete/frontends/concrete-python
make PYTHON=$PYTHON venv
source .venv/bin/activate
cd /concrete/compilers/concrete-compiler/compiler
make BUILD_DIR=/build CCACHE=ON DATAFLOW_EXECUTION_ENABLED=ON Python3_EXECUTABLE=$(which python) python-bindings
echo "Debug: ccache statistics (after the build):"
ccache -s
cd /concrete/frontends/concrete-python
export COMPILER_BUILD_DIRECTORY="/build"
export PROGRESS_MACHINE_NAME="m7i.48xlarge"
make benchmark
make process-benchmark-results-for-grafana
deactivate
- name: Checkout Slab repo
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
with:
repository: zama-ai/slab
path: slab
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Send data to Slab
shell: bash
run: |
echo "Computing HMac on results file"
SIGNATURE="$(slab/scripts/hmac_calculator.sh frontends/concrete-python/progress.processed.json '${{ secrets.JOB_SECRET }}')"
cd frontends/concrete-python
echo "Sending results to Slab..."
curl -v -k \
-H "Content-Type: application/json" \
-H "X-Slab-Repository: ${{ github.repository }}" \
-H "X-Slab-Command: store_data_v2" \
-H "X-Hub-Signature-256: sha256=${SIGNATURE}" \
-d @progress.processed.json \
${{ secrets.SLAB_URL }}
teardown-instance:
name: Teardown Instance
if: ${{ always() && needs.setup-instance.result != 'skipped' }}
needs: [ setup-instance, concrete-python-benchmarks ]
runs-on: ubuntu-latest
steps:
- name: Stop instance
id: stop-instance
uses: zama-ai/slab-github-runner@447a2d0fd2d1a9d647aa0d0723a6e9255372f261
with:
mode: stop
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
slab-url: ${{ secrets.SLAB_BASE_URL }}
job-secret: ${{ secrets.JOB_SECRET }}
label: ${{ needs.setup-instance.outputs.runner-name }}
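For reference, a minimal sketch of the signing used by the "Send data to Slab" step above, assuming `hmac_calculator.sh` simply HMAC-SHA256s the raw results file with the job secret (the `hmac`, `sha2` and `hex` crates and the placeholder secret are assumptions for illustration):

```rust
use hmac::{Hmac, Mac};
use sha2::Sha256;

// HMAC-SHA256 over the raw file bytes, hex-encoded as the workflow expects.
fn sign(results_json: &[u8], job_secret: &[u8]) -> String {
    let mut mac = Hmac::<Sha256>::new_from_slice(job_secret)
        .expect("HMAC accepts keys of any length");
    mac.update(results_json);
    hex::encode(mac.finalize().into_bytes())
}

fn main() {
    let body = std::fs::read("progress.processed.json").expect("results file");
    let signature = sign(&body, b"job-secret"); // placeholder secret for illustration
    // Sent to Slab as: X-Hub-Signature-256: sha256=<signature>
    println!("sha256={signature}");
}
```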

View File

@@ -7,7 +7,7 @@ jobs:
Checks:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Pre-Commit Checks
run: |
sudo apt install -y graphviz libgraphviz-dev

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Get version from tag
run: |
@@ -34,7 +34,7 @@ jobs:
docker image build -t ${{ env.DOCKER_IMAGE_NAME }} --build-arg version=${{ env.VERSION }} -f ${{ env.DOCKER_FILE }} empty_context
- name: Login to Docker Hub
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

View File

@@ -50,7 +50,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
fetch-depth: 0
@@ -73,7 +73,7 @@ jobs:
run: cp frontends/concrete-python/version.txt frontends/concrete-python/concrete/fhe/version.py
- name: Build wheel
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler-bindings
with:
registry: ghcr.io
@@ -90,7 +90,7 @@ jobs:
run: |
set -e
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
rm -rf /build/*
export PYTHON=${{ format('python{0}', matrix.python-version) }}
@@ -116,7 +116,7 @@ jobs:
deactivate
- name: Upload wheel
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: ${{ format('wheel-{0}-linux-x86', matrix.python-version) }}
path: frontends/concrete-python/dist/*manylinux*.whl
@@ -131,7 +131,7 @@ jobs:
runs-on: ${{ matrix.runs-on }}
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
fetch-depth: 0
@@ -193,7 +193,7 @@ jobs:
deactivate
- name: Upload wheel
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: ${{ format('wheel-{0}-{1}', matrix.python-version, matrix.runs-on) }}
path: frontends/concrete-python/dist/*macos*.whl
@@ -206,7 +206,7 @@ jobs:
outputs:
hash: ${{ steps.hash.outputs.hash }}
steps:
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
path: frontends/concrete-python/dist
pattern: wheel-*
@@ -230,20 +230,20 @@ jobs:
needs: [build-linux-x86, build-macos, provenance]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
- uses: actions/download-artifact@v4
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
path: wheels
pattern: 'wheel-*'
merge-multiple: true
- uses: actions/download-artifact@v4
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
pattern: '*.intoto.jsonl'
# When building a new tag, create a new draft release.
# When building a new public tag, create a new draft release.
- name: create draft release
if: ${{ env.RELEASE_TYPE == 'public' || env.RELEASE_TYPE == 'nightly' }}
if: ${{ env.RELEASE_TYPE == 'public'}}
run: |
export TAG=$(git describe --tags --abbrev=0)
echo $TAG
@@ -278,7 +278,7 @@ jobs:
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/zama-ai/concrete/actions/workflows/concrete_python_push_docker_image.yml/dispatches \
-d "{\"ref\": \"$TAG\", \"inputs\": {\"tag\":\"$TAG\"}}"
-d "{\"ref\": \"$TAG\", \"inputs\": {\"tag\":\"v$TAG\"}}"
test-linux-x86:
needs: [build-linux-x86]
@@ -289,16 +289,16 @@ jobs:
runs-on: ${{ github.event.inputs.runner_name }}
steps:
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
with:
python-version: ${{ matrix.python-version }}
- name: Download wheels
uses: actions/download-artifact@v4
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ${{ format('wheel-{0}-linux-x86', matrix.python-version) }}
path: ${{ format('wheel-{0}-linux-x86', matrix.python-version) }}
- name: Checkout the repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
path: repo
@@ -339,12 +339,12 @@ jobs:
runs-on: ${{ matrix.runs-on }}
steps:
- name: Download wheels
uses: actions/download-artifact@v4
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: ${{ format('wheel-{0}-{1}', matrix.python-version, matrix.runs-on) }}
path: ${{ format('wheel-{0}-{1}', matrix.python-version, matrix.runs-on) }}
- name: Checkout the repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
path: repo

View File

@@ -51,7 +51,7 @@ jobs:
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
fetch-depth: 0
@@ -74,7 +74,7 @@ jobs:
run: cp frontends/concrete-python/version.txt frontends/concrete-python/concrete/fhe/version.py
- name: Build wheel
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
id: build-compiler-bindings
with:
registry: ghcr.io
@@ -90,7 +90,7 @@ jobs:
run: |
set -e
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
rm -rf /build/*
export PYTHON=${{ format('python{0}', matrix.python-version) }}
@@ -104,7 +104,7 @@ jobs:
cd /concrete/compilers/concrete-compiler/compiler
make BUILD_DIR=/build CCACHE=ON DATAFLOW_EXECUTION_ENABLED=OFF Python3_EXECUTABLE=$(which python) \
CUDA_SUPPORT=ON CUDA_PATH=${{ env.CUDA_PATH }} python-bindings
CUDA_SUPPORT=ON TIMING_ENABLED=ON CUDA_PATH=${{ env.CUDA_PATH }} python-bindings
echo "Debug: ccache statistics (after the build):"
ccache -s
@@ -117,7 +117,7 @@ jobs:
deactivate
- name: Upload wheel
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: ${{ format('wheel-{0}-linux-x86', matrix.python-version) }}
path: frontends/concrete-python/dist/*manylinux*.whl
@@ -126,10 +126,12 @@ jobs:
push:
needs: [build-linux-x86]
runs-on: ubuntu-latest
outputs:
wheel_version: ${{ steps.version.outputs.wheel_version }}
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- uses: actions/download-artifact@v4
- uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
path: wheels
merge-multiple: true
@@ -155,10 +157,17 @@ jobs:
# update indexes and invalidate cloudfront cache
python .github/workflows/scripts/s3_update_html_indexes.py
- name: Output Wheel Version
id: version
run: |
export VERSION=`ls ./wheels/*manylinux* | head -n1 | cut -d "-" -f2`
echo "VERSION=$VERSION"
echo "wheel_version=$VERSION" >> "$GITHUB_OUTPUT"
test-gpu-wheel:
needs: [push]
uses: ./.github/workflows/start_slab.yml
secrets: inherit
with:
command: concrete-python-test-gpu-wheel
user_inputs: "TODO"
user_inputs: "${{ needs.push.outputs.wheel_version }}"

View File

@@ -35,16 +35,15 @@ jobs:
runs-on: ${{ github.event.inputs.runner_name }}
steps:
- name: Setup Python
uses: actions/setup-python@v5
uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
with:
python-version: ${{ matrix.python-version }}
# TODO: specify CP version
- name: Install CP
run: pip install --pre --extra-index-url https://pypi.zama.ai/gpu/ concrete-python
run: pip install --pre --extra-index-url https://pypi.zama.ai/gpu/ "concrete-python==${{ env.CP_VERSION }}"
- name: Checkout the repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
path: repo

View File

@@ -21,13 +21,8 @@ jobs:
runs-on: ${{ matrix.machine }}
steps:
- name: Set up SSH agent
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -41,7 +36,7 @@ jobs:
- name: Cache Compilation (push)
if: github.event_name == 'push'
uses: actions/cache@v3
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: /Users/runner/Library/Caches/ccache
key: ${{ runner.os }}-${{ runner.arch }}-compilation-cache-${{ github.sha }}
@@ -50,7 +45,7 @@ jobs:
- name: Cache Compilation (pull_request)
if: github.event_name == 'pull_request'
uses: actions/cache@v3
uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2
with:
path: /Users/runner/Library/Caches/ccache
key: ${{ runner.os }}-${{ runner.arch }}-compilation-cache-${{ github.event.pull_request.base.sha }}

View File

@@ -42,11 +42,6 @@ jobs:
echo "Type: ${{ inputs.instance_type }}"
echo "Request ID: ${{ inputs.request_id }}"
- name: Set up SSH agent
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Set up GitHub environment
run: |
echo "HOME=/home/ubuntu" >> "${GITHUB_ENV}"
@@ -54,7 +49,7 @@ jobs:
echo "SSH_AUTH_SOCK_DIR=$(dirname $SSH_AUTH_SOCK)" >> "${GITHUB_ENV}"
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
submodules: recursive
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
@@ -66,7 +61,7 @@ jobs:
uses: ./.github/workflows/setup_rust_toolchain_for_concrete_cpu
- name: Build bindings
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
if: ${{ !contains(inputs.instance_type, 'p3') }}
id: build-compiler-bindings
with:
@@ -81,7 +76,7 @@ jobs:
-e SSH_AUTH_SOCK=/ssh.socket
shell: bash
run: |
rustup toolchain install nightly-2024-01-31
rustup toolchain install nightly-2024-07-01
set -e
rm -rf /build/*
@@ -98,7 +93,7 @@ jobs:
ccache -s
- name: Test
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
if: ${{ !contains(inputs.instance_type, 'p3') }}
with:
registry: ghcr.io
@@ -124,7 +119,7 @@ jobs:
KEY_CACHE_DIRECTORY=./KeySetCache make pytest
- name: Build bindings gpu
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
if: ${{ contains(inputs.instance_type, 'p3') }}
id: build-compiler-bindings-gpu
with:
@@ -155,7 +150,7 @@ jobs:
ccache -s
- name: Test gpu
uses: addnab/docker-run-action@v3
uses: addnab/docker-run-action@4f65fabd2431ebc8d299f8e5a018d79a769ae185 # v3
if: ${{ contains(inputs.instance_type, 'p3') }}
with:
registry: ghcr.io

View File

@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Linelint
uses: fernandrone/linelint@0.0.4
id: linelint

View File

@@ -39,20 +39,20 @@ jobs:
push-main: ${{ steps.github.outputs.push-main }}
steps:
- name: Checkout the repository
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
token: ${{ secrets.CONCRETE_ACTIONS_TOKEN }}
- name: Get changed files in the concrete-compiler directory
id: compiler
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./compilers/concrete-compiler/**
- name: Get changed files for concrete-optimizer
id: optimizer
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: |
./compilers/concrete-optimizer/**
@@ -60,43 +60,43 @@ jobs:
- name: Get changed files in the concrete-cpu directory
id: concrete-cpu
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./backends/concrete-cpu/implementation/**
- name: Get changed files in the concrete-python directory
id: concrete-python
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./frontends/concrete-python/**
- name: Check if compiler_build_and_test_cpu workflow has changed
id: concrete-compiler-cpu-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/compiler_build_and_test_cpu.yml
- name: Check if compiler_build_and_test_gpu workflow has changed
id: concrete-compiler-gpu-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/compiler_build_and_test_gpu.yml
- name: Check if compiler_format_and_linting.yml workflow has changed
id: concrete-compiler-format-and-linting-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/compiler_format_and_linting.yml
- name: Check if compiler_macos_build_and_test workflow has changed
id: concrete-compiler-macos-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/compiler_macos_build_and_test.yml
- name: Check if compiler_publish_docker_images workflow has changed
id: concrete-compiler-docker-images-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: |
./.github/workflows/compiler_publish_docker_images.yml
@@ -104,31 +104,31 @@ jobs:
- name: Check if concrete_cpu_test workflow has changed
id: concrete-cpu-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/concrete_cpu_test.yml
- name: Check if concrete_python_checks workflow has changed
id: concrete-python-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/concrete_python_checks.yml
- name: Check if optimizer workflow has changed
id: concrete-optimizer-workflow
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./.github/workflows/optimizer.yml
- name: Get changed files in the concrete-cpu directory
id: concrete-cpu-api
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./backends/concrete-cpu/implementation/include/**
- name: Get changed files in the concrete-cuda directory
id: concrete-cuda-api
uses: tj-actions/changed-files@e5ce44a30190a3af4c81b960587845596e2300ca
uses: tj-actions/changed-files@c65cd883420fd2eb864698a825fc4162dd94482c
with:
files: ./backends/concrete-cuda/implementation/include/**
@@ -252,7 +252,7 @@ jobs:
# Concrete-ML tests #############################
concrete-ml-tests-linux:
needs: file-change
if: needs.file-change.outputs.concrete-python == 'true' || needs.file-change.outputs.push-main
if: needs.file-change.outputs.concrete-python == 'true' || needs.file-change.outputs.compiler == 'true' || needs.file-change.outputs.push-main
uses: ./.github/workflows/start_slab.yml
secrets: inherit
with:

View File

@@ -4,6 +4,7 @@ on:
pull_request:
paths:
- '**.md'
- .github/workflows/markdown_link_check.yml
push:
branches:
- main
@@ -13,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
- uses: gaurav-nelson/github-action-markdown-link-check@v1
- uses: gaurav-nelson/github-action-markdown-link-check@5c5dfc0ac2e225883c0e5f03a85311ec2830d368 # v1
with:
use-quiet-mode: 'yes'
use-verbose-mode: 'yes'

View File

@@ -46,13 +46,8 @@ jobs:
run: |
echo "BENCH_DATE=$(date --iso-8601=seconds)" >> "${GITHUB_ENV}"
# SSH private key is required as some dependencies are from private repos
- uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.CONCRETE_CI_SSH_PRIVATE }}
- name: Fetch submodules
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
submodules: recursive
@@ -86,7 +81,7 @@ jobs:
make BINDINGS_PYTHON_ENABLED=OFF ML_BENCH_SUBSET_ID=${{ inputs.matrix_item }} run-mlbench-subset
- name: Upload raw results artifact
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: ${{ github.sha }}_raw
path: compiler/benchmarks_results.json
@@ -105,13 +100,13 @@ jobs:
--bench-date "${{ env.BENCH_DATE }}"
- name: Upload parsed results artifact
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
name: ${{ github.sha }}
path: ${{ env.RESULTS_FILENAME }}
- name: Checkout Slab repo
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: zama-ai/slab
path: slab

View File

@@ -24,7 +24,7 @@ jobs:
env:
RUSTFLAGS: -D warnings
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Setup"
uses: ./.github/workflows/optimizer_setup
@@ -56,7 +56,7 @@ jobs:
benchmarks:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Setup"
uses: ./.github/workflows/optimizer_setup
@@ -73,7 +73,7 @@ jobs:
# for artifacts restrictions see https://github.com/actions/download-artifact/issues/3
# for cache restrictions see https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#restrictions-for-accessing-a-cache
# and https://github.com/actions/cache/issues/692
uses: dawidd6/action-download-artifact@v5
uses: dawidd6/action-download-artifact@bf251b5aa9c2f7eeb574a96ee720e24f801b7c11
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
workflow_conclusion: ""
@@ -83,7 +83,7 @@ jobs:
path: ./benchmark
- name: Save benchmark result to file
uses: benchmark-action/github-action-benchmark@v1
uses: benchmark-action/github-action-benchmark@4de1bed97a47495fc4c5404952da0499e31f5c29 # v1.20.3
with:
tool: 'cargo'
output-file-path: ./compilers/concrete-optimizer/bench_result.txt
@@ -96,7 +96,7 @@ jobs:
summary-always: true
- name: Upload benchmark data
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@834a144ee995460fba8ed112a2fc961b36a5ec5a # v4.3.6
with:
path: ./benchmark
name: ${{ runner.os }}-benchmark

View File

@@ -7,15 +7,9 @@ runs:
using: "composite"
steps:
- name: Rust install
uses: actions-rs/toolchain@v1
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
with:
toolchain: stable
- name: Download cargo cache
uses: Swatinem/rust-cache@v2
# A SSH private key is required as some dependencies are from private repos
- name: Set ssh keys
uses: webfactory/ssh-agent@v0.6.0
with:
ssh-private-key: ${{ inputs.ssh_private_key }}
uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # v2.7.3

View File

@@ -7,7 +7,7 @@ jobs:
refresh:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v3
- name: Update index and invalidate cache
env:

View File

@@ -2,7 +2,7 @@ runs:
using: "composite"
steps:
- name: Install rust
uses: actions-rs/toolchain@v1
uses: actions-rs/toolchain@16499b5e05bf2e26879000db0c1d13f7e13fa3af # v1.0.7
with:
toolchain: nightly-2024-01-31
toolchain: nightly-2024-07-01
override: true

View File

@@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout concrete
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
fetch-depth: 0
@@ -39,7 +39,7 @@ jobs:
echo "GIT_REF=${{ github.ref_name }}" >> $GITHUB_ENV
- name: Checkout Slab repo
uses: actions/checkout@v3
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
with:
repository: zama-ai/slab
path: slab

View File

@@ -129,7 +129,7 @@ assert result == add(2, 6)
- [[Video tutorial] How To Get Started With Concrete - Zama's Fully Homomorphic Encryption Compiler](https://www.zama.ai/post/how-to-started-with-concrete-zama-fully-homomorphic-encryption-compiler)
- [The Encrypted Game of Life in Python Using Concrete](https://www.zama.ai/post/the-encrypted-game-of-life-using-concrete-python)
- [Encrypted Key-value Database Using Homomorphic Encryption](https://www.zama.ai/post/encrypted-key-value-database-using-homomorphic-encryption)
- [SHA-256 Implementation Using Concrete](https://github.com/zama-ai/concrete/blob/main/docs/application-tutorial/sha256.ipynb)
- [SHA-256 Implementation Using Concrete](https://github.com/zama-ai/concrete/blob/main/frontends/concrete-python/examples/sha256/sha256.ipynb)
*Explore more useful resources in [Concrete tutorials](https://docs.zama.ai/concrete/v/main-1/tutorials/see-all-tutorials) and [Awesome Zama repo](https://github.com/zama-ai/awesome-zama?tab=readme-ov-file#concrete). If you have built awesome projects using Concrete, please let us know and we will be happy to showcase them here!*

View File

@@ -45,6 +45,15 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -59,9 +68,9 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.13.1"
version = "1.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17febce684fd15d89027105661fec94afb475cb995fbc59d2865198446ba2eea"
checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
[[package]]
name = "cast"
@@ -170,14 +179,14 @@ dependencies = [
"cbindgen",
"concrete-cpu-noise-model",
"concrete-csprng",
"concrete-fft 0.2.1",
"concrete-fft",
"concrete-security-curves",
"criterion",
"dyn-stack",
"libc",
"num-complex",
"once_cell",
"pulp 0.10.4",
"pulp",
"rayon",
"readonly",
"tfhe",
@@ -193,9 +202,9 @@ dependencies = [
[[package]]
name = "concrete-csprng"
version = "0.4.0"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c7080e711c39c3dda834604f7c31f4634a38b8dddeb9f24db5d7801751415ad"
checksum = "90518357249582c16a6b64d7410243dfb3109d5bf0ad1665c058c9a59f2fc4cc"
dependencies = [
"aes",
"libc",
@@ -204,29 +213,26 @@ dependencies = [
[[package]]
name = "concrete-fft"
version = "0.2.1"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ab720f85aa80be9d78fba5a941458ec50d97c7856aced76707f0bf227a0c6ca"
checksum = "3144f883422ee22c65d4f408c11b3406513eadc0d50b3d65bfadcb97852817e0"
dependencies = [
"aligned-vec",
"bytemuck",
"dyn-stack",
"num-complex",
"pulp 0.11.11",
"pulp",
"serde",
]
[[package]]
name = "concrete-fft"
version = "0.3.0"
name = "concrete-ntt"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb823cf0c7a83ac8f7e38bcda73d754fb8f57797f342cdd7c20a61a1db8b058"
checksum = "b4f4643dbd5de069e099122ae6c2bbd3db70d69ffec348dfc228448d635f949e"
dependencies = [
"aligned-vec",
"bytemuck",
"dyn-stack",
"num-complex",
"pulp 0.11.11",
"serde",
"pulp",
]
[[package]]
@@ -387,6 +393,17 @@ dependencies = [
"version_check",
]
[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "half"
version = "1.8.2"
@@ -500,9 +517,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.147"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linux-raw-sys"
@@ -527,9 +550,9 @@ dependencies = [
[[package]]
name = "num-complex"
version = "0.4.3"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02e0d21255c828d6f128a1e41534206671e8c3ea0c62f32291e808dc82cff17d"
checksum = "23c6602fda94a57c990fe0df199a035d83576b496aa29f4e634a8ac6004e68a6"
dependencies = [
"bytemuck",
"num-traits",
@@ -573,6 +596,12 @@ version = "6.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d5d9eb14b174ee9aa2ef96dc2b94637a2d4b6e7cb873c7e171f0c20c6cf3eac"
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "plotters"
version = "0.3.5"
@@ -612,30 +641,14 @@ dependencies = [
[[package]]
name = "pulp"
version = "0.10.4"
version = "0.18.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284c392c810680912400c6f70879a8cde404344db6b68ff52cc3990c020324d1"
dependencies = [
"bytemuck",
]
[[package]]
name = "pulp"
version = "0.11.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "866e8018d6397b0717100dd4a7948fc8cbc8c4b8ce3e39e98a0e1e878d3ba925"
dependencies = [
"bytemuck",
]
[[package]]
name = "pulp"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50ff10f8f3224a1cab64371fbab0fff9b9cb5892076eb9032dc825c2e50894e4"
checksum = "0ec8d02258294f59e4e223b41ad7e81c874aa6b15bc4ced9ba3965826da0eed5"
dependencies = [
"bytemuck",
"libm",
"num-complex",
"reborrow",
]
[[package]]
@@ -647,6 +660,15 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rayon"
version = "1.7.0"
@@ -835,19 +857,47 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"
[[package]]
name = "tfhe"
version = "0.4.1"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df9c5cc5c4a1bd0bcfc2e7a3e83a8851dab02ff59d63ca8bc17da0e069498f71"
checksum = "273f968d322dd02cd76a51fe8e2312b5807631d33e99ea06ccfd8034d558c562"
dependencies = [
"aligned-vec",
"bincode",
"bytemuck",
"concrete-csprng",
"concrete-fft 0.3.0",
"concrete-fft",
"concrete-ntt",
"dyn-stack",
"itertools 0.11.0",
"pulp 0.13.2",
"paste",
"pulp",
"rand_core",
"rayon",
"serde",
"tfhe-versionable",
]
[[package]]
name = "tfhe-versionable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7977829d8b2d59a16d9780ffbf8e4084a8d67f32c0e557b647136d094d391b2"
dependencies = [
"aligned-vec",
"num-complex",
"serde",
"tfhe-versionable-derive",
]
[[package]]
name = "tfhe-versionable-derive"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754aede9c522d81d852b46714d42137d307f418b8b6eed4db03d1466080f4f9d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.22",
]
[[package]]
@@ -897,6 +947,12 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.87"

View File

@@ -10,18 +10,18 @@ crate-type = ["lib", "staticlib"]
[dependencies]
concrete-csprng = { version = "0.4", optional = true, features = [
concrete-csprng = { version = "0.4.1", optional = true, features = [
"generator_fallback",
] }
concrete-cpu-noise-model = { path = "../noise-model/" }
concrete-security-curves = { path = "../../../tools/parameter-curves/concrete-security-curves-rust" }
libc = { version = "0.2", default-features = false }
pulp = { version = "0.10", default-features = false }
pulp = { version = "0.18.8", default-features = false }
dyn-stack = { version = "0.9", default-features = false }
readonly = "0.2"
aligned-vec = { version = "0.5", default-features = false }
concrete-fft = { version = "0.2.1", default-features = false }
bytemuck = "1.12"
concrete-fft = { version = "0.4.1", default-features = false }
bytemuck = "1.14.3"
num-complex = { version = "0.4", default-features = false, features = [
"bytemuck",
] }
@@ -29,16 +29,16 @@ num-complex = { version = "0.4", default-features = false, features = [
rayon = { version = "1.6", optional = true }
once_cell = { version = "1.16", optional = true }
tfhe = { version = "0.4", features = [] }
tfhe = { version = "0.7", features = [] }
[target.x86_64-unknown-unix-gnu.dependencies]
tfhe = { version = "0.4", features = ["x86_64-unix"] }
tfhe = { version = "0.7", features = ["x86_64-unix"] }
[target.aarch64-unknown-unix-gnu.dependencies]
tfhe = { version = "0.4", features = ["aarch64-unix"] }
tfhe = { version = "0.7", features = ["aarch64-unix"] }
[target.x86_64-pc-windows-gnu.dependencies]
tfhe = { version = "0.4", features = ["x86_64"] }
tfhe = { version = "0.7", features = ["x86_64"] }
[features]
default = ["parallel", "std", "csprng"]

View File

@@ -73,14 +73,14 @@ pub unsafe extern "C" fn concrete_cpu_init_lwe_bootstrap_key_u64(
&lwe_sk,
&glwe_sk,
&mut bsk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
),
Parallelism::Rayon => par_generate_lwe_bootstrap_key(
&lwe_sk,
&glwe_sk,
&mut bsk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
),
}
@@ -151,14 +151,14 @@ pub unsafe extern "C" fn concrete_cpu_init_seeded_lwe_bootstrap_key_u64(
&lwe_sk,
&glwe_sk,
&mut bsk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
seeder,
),
Parallelism::Rayon => par_generate_seeded_lwe_bootstrap_key(
&lwe_sk,
&glwe_sk,
&mut bsk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
seeder,
),
}
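The hunks above (and the analogous ones in the keyswitch, encryption and circuit-bootstrap files below) adapt to the TFHE-rs 0.7 API, where key generation takes a noise distribution rather than a bare variance. A minimal sketch of the adapter expression used throughout this diff; the import paths are an assumption based on the usual `tfhe::core_crypto` layout, while the calls themselves mirror the replacement lines:

```rust
use tfhe::core_crypto::commons::dispersion::Variance;
use tfhe::core_crypto::commons::math::random::Gaussian;

// Zero-mean Gaussian with the requested dispersion, matching
// `Gaussian::from_dispersion_parameter(Variance::from_variance(v), 0.0)`.
fn noise_distribution(variance: f64) -> Gaussian<f64> {
    Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0)
}
```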

View File

@@ -53,7 +53,7 @@ pub unsafe extern "C" fn concrete_cpu_init_lwe_keyswitch_key_u64(
&input_key,
&output_key,
&mut ksk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
)
});
@@ -110,7 +110,7 @@ pub unsafe extern "C" fn concrete_cpu_init_seeded_lwe_keyswitch_key_u64(
&input_key,
&output_key,
&mut seeded_ksk,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
seeder,
)
});

View File

@@ -53,7 +53,7 @@ pub unsafe extern "C" fn concrete_cpu_encrypt_lwe_ciphertext_u64(
&lwe_sk,
&mut lwe_out,
Plaintext(input),
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
);
});
@@ -95,7 +95,7 @@ pub unsafe extern "C" fn concrete_cpu_encrypt_seeded_lwe_ciphertext_u64(
&lwe_sk,
&mut seeded_lwe_ciphertext,
Plaintext(input),
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
seeder,
);
*seeded_lwe_out = seeded_lwe_ciphertext.into_scalar();
@@ -145,7 +145,7 @@ pub unsafe extern "C" fn concrete_cpu_encrypt_ggsw_ciphertext_u64(
&glwe_sk,
&mut ggsw_out,
Plaintext(input),
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
);
});

View File

@@ -71,14 +71,14 @@ pub unsafe extern "C" fn concrete_cpu_init_lwe_circuit_bootstrap_private_functio
&mut fpksk_list,
&input_key,
&output_key,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
),
Parallelism::Rayon => par_generate_circuit_bootstrap_lwe_pfpksk_list(
&mut fpksk_list,
&input_key,
&output_key,
Variance::from_variance(variance),
Gaussian::from_dispersion_parameter(Variance::from_variance(variance), 0.0),
&mut *(csprng as *mut EncryptionRandomGenerator<SoftwareRandomGenerator>),
),
}

View File

@@ -1,5 +1,4 @@
#![allow(clippy::missing_safety_doc, dead_code)]
#![cfg_attr(feature = "nightly", feature(stdsimd))]
#![cfg_attr(feature = "nightly", feature(avx512_target_feature))]
extern crate alloc;

View File

@@ -1,3 +1,4 @@
#include <err.h>
#include "bootstrap_amortized.cuh"
/*
@@ -85,6 +86,8 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -141,6 +144,8 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -216,6 +221,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -353,6 +360,8 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
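The added `default:` branches make the CUDA entry points fail loudly instead of silently falling through when an unsupported polynomial size is passed in. The same guard, sketched in Rust purely for illustration (`errx(EXIT_FAILURE, ...)` reports the message and exits with a failure status):

```rust
// Reject polynomial sizes outside the supported set, as the CUDA code now does.
fn check_polynomial_size(polynomial_size: u32) {
    const SUPPORTED: [u32; 7] = [256, 512, 1024, 2048, 4096, 8192, 16384];
    if !SUPPORTED.contains(&polynomial_size) {
        eprintln!(
            "polynomial size {polynomial_size} is not supported. Supported values are: 256, 512, 1024, 2048, 4096, 8192, 16384."
        );
        std::process::exit(1);
    }
}
```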

View File

@@ -1,3 +1,4 @@
#include <err.h>
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"
/*
@@ -100,6 +101,8 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
input_lwe_ciphertext_count, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
return 0;
break;
}
@@ -244,6 +247,8 @@ void scratch_cuda_bootstrap_low_latency_32(
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -377,6 +382,8 @@ void scratch_cuda_bootstrap_low_latency_64(
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -527,6 +534,8 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -744,7 +753,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}

View File

@@ -1,3 +1,4 @@
#include <err.h>
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
@@ -172,6 +173,8 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
@@ -311,6 +314,8 @@ void scratch_cuda_multi_bit_pbs_64(
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}

View File

@@ -8,6 +8,7 @@
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>
#include <err.h>
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
@@ -235,6 +236,8 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
@@ -446,6 +449,8 @@ void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
cuda_drop_async(buffer, stream, gpu_index);

View File

@@ -31,20 +31,6 @@ pip install lazydocs
.venvtrash/bin/lazydocs --output-path="../docs/dev/api" --overview-file="README.md" --src-base-url="../../" --no-watermark concrete
cd -
# Add the files in the summary
FILES=$(cd docs && find dev/api -name "*.md")
TMP_FILE=$(mktemp /tmp/apidocs.XXXXXX)
rm -rf "$TMP_FILE"
touch "$TMP_FILE"
for f in $FILES
do
filename=$(echo "$f" | rev | cut -d '/' -f 1 | rev)
echo " * [$filename]($f)" >> "$TMP_FILE"
done
rm -rf "$FRESH_DIRECTORY"
# New files?

View File

@@ -206,6 +206,9 @@ then
git diff
fi
# Update the pandas files in CML
make update_encrypted_dataframe
# Launch CML tests with pytest (and ignore flaky ones)
# As compared to regular `make pytest`, known flaky errors from Concrete ML are simply ignored
# and coverage is disabled

View File

@@ -1,32 +1,42 @@
# This is the new version of Slab that handles multi backend providers.
[backend.aws.m7i-cpu-bench]
region = "eu-west-1"
image_id = "ami-002bdcd64b8472cf9" # Based on Ubuntu 22.4
instance_type = "m7i.48xlarge"
security_group = ["sg-0e55cc31dfda0d8a7", ]
[profile.m7i-cpu-bench]
region = "eu-west-1"
image_id = "ami-002bdcd64b8472cf9" # Based on Ubuntu 22.4
instance_type = "m7i.48xlarge"
security_group= ["sg-0e55cc31dfda0d8a7", ]
[profile.m7i-cpu-test]
region = "eu-west-1"
image_id = "ami-002bdcd64b8472cf9"
instance_type = "m7i.16xlarge"
security_group= ["sg-0e55cc31dfda0d8a7", ]
[profile.m7i-metal]
region = "eu-west-1"
image_id = "ami-002bdcd64b8472cf9"
instance_type = "m7i.metal-24xl"
security_group= ["sg-0e55cc31dfda0d8a7", ]
[profile.gpu-bench]
region = "us-east-1"
image_id = "ami-08e27480d79e82238"
instance_type = "p3.2xlarge"
subnet_id = "subnet-8123c9e7"
security_group= ["sg-0f8b52622a2669491", ]
security_group= ["sg-017afab1f328af917", ]
# Docker is well configured for testing inside Docker in this AMI
[profile.gpu-test]
region = "us-east-1"
image_id = "ami-0c4773f5626d919b6"
image_id = "ami-0257c6ad39f902b5e"
instance_type = "p3.2xlarge"
subnet_id = "subnet-8123c9e7"
security_group= ["sg-0f8b52622a2669491", ]
security_group= ["sg-017afab1f328af917", ]
# It has CUDA Driver (<=12.5) and Docker installed
[profile.gpu-test-ubuntu22]
@@ -34,13 +44,14 @@ region = "us-east-1"
image_id = "ami-05385e0c3c574621f"
instance_type = "p3.2xlarge"
subnet_id = "subnet-8123c9e7"
security_group= ["sg-0f8b52622a2669491", ]
security_group= ["sg-017afab1f328af917", ]
[profile.slurm-cluster]
region = "eu-west-3"
image_id = "ami-0bb5bb9cb747b5ddd"
instance_id = "i-0e5ae2a14134d6275"
instance_type = "m6i.8xlarge"
security_group= ["sg-02dd8470fa845f31b", ]
#################################################
# Compiler commands
@@ -121,5 +132,5 @@ check_run_name = "Concrete Python Release (GPU)"
[command.concrete-python-test-gpu-wheel]
workflow = "concrete_python_test_gpu_wheel.yml"
profile = "gpu-test-ubuntu22"
profile = "gpu-test"
check_run_name = "Concrete Python Test GPU Wheel"

View File

@@ -136,7 +136,7 @@ ExternalProject_Add(
DOWNLOAD_COMMAND ""
CONFIGURE_COMMAND "" OUTPUT "${CONCRETE_CPU_STATIC_LIB}"
BUILD_ALWAYS true
BUILD_COMMAND cargo +nightly-2024-01-31 build --release --features=nightly
BUILD_COMMAND cargo +nightly-2024-07-01 build --release --features=nightly
BINARY_DIR "${CONCRETE_CPU_DIR}"
INSTALL_COMMAND ""
LOG_BUILD ON

View File

@@ -329,19 +329,9 @@ run-end-to-end-tests-gpu: build-end-to-end-test generate-gpu-tests
--backend=gpu \
$(FIXTURE_GPU_DIR)/*.yaml
## end-to-end-dataflow-tests
build-end-to-end-dataflow-tests: build-initialized
cmake --build $(BUILD_DIR) --target end_to_end_jit_auto_parallelization
cmake --build $(BUILD_DIR) --target end_to_end_jit_distributed
cmake --build $(BUILD_DIR) --target end_to_end_jit_aes_short
run-end-to-end-dataflow-tests: build-end-to-end-dataflow-tests
$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_auto_parallelization
$(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
## end-to-end-distributed-tests
run-end-to-end-distributed-tests: $(GTEST_PARALLEL_PY) build-end-to-end-tests generate-cpu-tests
srun -n4 -c8 --kill-on-bad-exit=1 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_jit_distributed
srun -n4 -c8 --kill-on-bad-exit=1 $(BUILD_DIR)/tools/concretelang/tests/end_to_end_tests/end_to_end_test \
--optimizer-strategy=dag-mono --dataflow-parallelize=1 \
$(FIXTURE_CPU_DIR)/*round*.yaml $(FIXTURE_CPU_DIR)/*relu*.yaml $(FIXTURE_CPU_DIR)/*linalg*.yaml
@@ -562,8 +552,6 @@ FORCE:
run-unit-tests \
run-python-tests \
build-end-to-end-tests \
build-end-to-end-dataflow-tests \
run-end-to-end-dataflow-tests \
run-random-end-to-end-tests-for-each-options \
opt \
mlir-opt \

View File

@@ -148,6 +148,9 @@ You can create a tarball containing libs, bins, and include files for the tools
### Build the Python Package
> [!IMPORTANT]
> The wheel built in the following steps is for `concrete-compiler` (which doesn't have the frontend layer) and not `concrete-python`. If you are interested in the `concrete-python` package, then you should build it from [here](https://github.com/zama-ai/concrete/tree/main/frontends/concrete-python) instead.
Currently supported platforms:
- Linux x86_64 for python 3.8, 3.9, 3.10, and 3.11

View File

@@ -19,7 +19,7 @@ include "concretelang/Dialect/FHE/Interfaces/FHEInterfaces.td"
class FHE_Op<string mnemonic, list<Trait> traits = []> :
Op<FHE_Dialect, mnemonic, traits>;
def FHE_ZeroEintOp : FHE_Op<"zero", [Pure, ConstantNoise]> {
def FHE_ZeroEintOp : FHE_Op<"zero", [Pure, ZeroNoise]> {
let summary = "Returns a trivial encrypted integer of 0";
let description = [{
@@ -34,7 +34,7 @@ def FHE_ZeroEintOp : FHE_Op<"zero", [Pure, ConstantNoise]> {
let results = (outs FHE_AnyEncryptedInteger:$out);
}
def FHE_ZeroTensorOp : FHE_Op<"zero_tensor", [Pure, ConstantNoise]> {
def FHE_ZeroTensorOp : FHE_Op<"zero_tensor", [Pure, ZeroNoise]> {
let summary = "Creates a new tensor with all elements initialized to an encrypted zero.";
let description = [{
@@ -52,7 +52,7 @@ def FHE_ZeroTensorOp : FHE_Op<"zero_tensor", [Pure, ConstantNoise]> {
let results = (outs Type<And<[TensorOf<[FHE_AnyEncryptedInteger]>.predicate, HasStaticShapePred]>>:$tensor);
}
def FHE_AddEintIntOp : FHE_Op<"add_eint_int", [Pure, BinaryEintInt, DeclareOpInterfaceMethods<Binary>]> {
def FHE_AddEintIntOp : FHE_Op<"add_eint_int", [Pure, BinaryEintInt, AdditiveNoise, DeclareOpInterfaceMethods<Binary>]> {
let summary = "Adds an encrypted integer and a clear integer";
let description = [{
@@ -85,7 +85,7 @@ def FHE_AddEintIntOp : FHE_Op<"add_eint_int", [Pure, BinaryEintInt, DeclareOpInt
let hasFolder = 1;
}
def FHE_AddEintOp : FHE_Op<"add_eint", [Pure, BinaryEint, DeclareOpInterfaceMethods<BinaryEint>]> {
def FHE_AddEintOp : FHE_Op<"add_eint", [Pure, BinaryEint, AdditiveNoise, DeclareOpInterfaceMethods<BinaryEint>]> {
let summary = "Adds two encrypted integers";
let description = [{
@@ -117,7 +117,7 @@ def FHE_AddEintOp : FHE_Op<"add_eint", [Pure, BinaryEint, DeclareOpInterfaceMeth
let hasVerifier = 1;
}
def FHE_SubIntEintOp : FHE_Op<"sub_int_eint", [Pure, BinaryIntEint]> {
def FHE_SubIntEintOp : FHE_Op<"sub_int_eint", [Pure, BinaryIntEint, AdditiveNoise]> {
let summary = "Subtract an encrypted integer from a clear integer";
let description = [{
@@ -149,7 +149,7 @@ def FHE_SubIntEintOp : FHE_Op<"sub_int_eint", [Pure, BinaryIntEint]> {
let hasVerifier = 1;
}
def FHE_SubEintIntOp : FHE_Op<"sub_eint_int", [Pure, BinaryEintInt, DeclareOpInterfaceMethods<Binary>]> {
def FHE_SubEintIntOp : FHE_Op<"sub_eint_int", [Pure, BinaryEintInt, AdditiveNoise, DeclareOpInterfaceMethods<Binary>]> {
let summary = "Subtract a clear integer from an encrypted integer";
let description = [{
@@ -182,7 +182,7 @@ def FHE_SubEintIntOp : FHE_Op<"sub_eint_int", [Pure, BinaryEintInt, DeclareOpInt
let hasFolder = 1;
}
def FHE_SubEintOp : FHE_Op<"sub_eint", [Pure, BinaryEint, DeclareOpInterfaceMethods<BinaryEint>]> {
def FHE_SubEintOp : FHE_Op<"sub_eint", [Pure, BinaryEint, AdditiveNoise, DeclareOpInterfaceMethods<BinaryEint>]> {
let summary = "Subtract an encrypted integer from an encrypted integer";
let description = [{
@@ -214,7 +214,7 @@ def FHE_SubEintOp : FHE_Op<"sub_eint", [Pure, BinaryEint, DeclareOpInterfaceMeth
let hasVerifier = 1;
}
def FHE_NegEintOp : FHE_Op<"neg_eint", [Pure, UnaryEint, DeclareOpInterfaceMethods<UnaryEint>]> {
def FHE_NegEintOp : FHE_Op<"neg_eint", [Pure, UnaryEint, AdditiveNoise, DeclareOpInterfaceMethods<UnaryEint>]> {
let summary = "Negates an encrypted integer";

View File

@@ -37,6 +37,22 @@ def ConstantNoise : OpInterface<"ConstantNoise"> {
let cppNamespace = "mlir::concretelang::FHE";
}
def ZeroNoise : OpInterface<"ZeroNoise"> {
let description = [{
An operation that outputs a ciphertext with zero noise.
}];
let cppNamespace = "mlir::concretelang::FHE";
}
def AdditiveNoise : OpInterface<"AdditiveNoise"> {
let description = [{
An n-ary operation whose output noise is the unweighted sum of all input noises.
}];
let cppNamespace = "mlir::concretelang::FHE";
}
def UnaryEint : OpInterface<"UnaryEint"> {
let description = [{
A unary operation on scalars, with the operand encrypted.
@@ -63,7 +79,7 @@ def UnaryEint : OpInterface<"UnaryEint"> {
if (auto operandTy = dyn_cast<mlir::RankedTensorType>($_op->getOpOperand(0).get().getType())) {
return operandTy.getElementType();
} else return $_op->getOpOperand(0).get().getType();
}]>
}]>
];
}
@@ -124,8 +140,8 @@ def Binary : OpInterface<"Binary"> {
if (auto cstOp = llvm::dyn_cast_or_null<mlir::arith::ConstantOp>($_op->
getOpOperand(opNum).get().getDefiningOp()))
return cstOp->template getAttrOfType<mlir::DenseIntElementsAttr>("value").template getValues<llvm::APInt>();
else return {};
}]>,
else return {};
}]>,
];
}
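
The new `ZeroNoise` and `AdditiveNoise` interfaces let analysis passes treat whole families of ops uniformly instead of matching each op by name; the optimizer DAG construction and the MANP analysis further down in this diff dispatch on them with `llvm::isa`/`llvm::dyn_cast`. Below is a minimal sketch of that dispatch pattern; the helper `outputNoiseBound` and its fallback noise model are invented here purely for illustration and are not part of the change:

```cpp
#include "concretelang/Dialect/FHE/Interfaces/FHEInterfaces.h"
#include <llvm/ADT/ArrayRef.h>
#include <mlir/IR/Operation.h>
#include <numeric>

namespace FHE = mlir::concretelang::FHE;

// Hypothetical helper: derive an output noise bound from input bounds by
// dispatching on the noise interfaces an op implements.
static double outputNoiseBound(mlir::Operation *op,
                               llvm::ArrayRef<double> inputNoise) {
  if (llvm::isa<FHE::ZeroNoise>(op))
    return 0.0; // e.g. fhe.zero: trivial encryption carries no noise
  if (llvm::isa<FHE::AdditiveNoise>(op))
    // e.g. fhe.add_eint: unweighted sum of all input noises
    return std::accumulate(inputNoise.begin(), inputNoise.end(), 0.0);
  if (llvm::isa<FHE::ConstantNoise>(op))
    return 1.0; // constant noise, independent of the inputs
  // Other ops need their own model; fall back conservatively here.
  return inputNoise.empty() ? 1.0 : inputNoise.front();
}
```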

View File

@@ -1,2 +1,3 @@
add_subdirectory(Interfaces)
add_subdirectory(IR)
add_subdirectory(Transforms)

View File

@@ -0,0 +1,3 @@
set(LLVM_TARGET_DEFINITIONS Passes.td)
mlir_tablegen(Passes.h.inc -gen-pass-decls -name Transforms)
add_public_tablegen_target(ConcretelangSDFGTransformsPassIncGen)

View File

@@ -0,0 +1,26 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#ifndef CONCRETELANG_SDFG_TRANSFORMS_PASS_H
#define CONCRETELANG_SDFG_TRANSFORMS_PASS_H
#include <mlir/Dialect/Func/IR/FuncOps.h>
#include <mlir/Dialect/Linalg/IR/Linalg.h>
#include <mlir/Dialect/MemRef/IR/MemRef.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/Pass/Pass.h>
#define GEN_PASS_CLASSES
#include <concretelang/Dialect/SDFG/Transforms/Passes.h.inc>
namespace mlir {
namespace concretelang {
std::unique_ptr<mlir::Pass> createSDFGBufferOwnershipPass();
} // namespace concretelang
} // namespace mlir
#endif

View File

@@ -0,0 +1,17 @@
#ifndef CONCRETELANG_SDFG_PASSES
#define CONCRETELANG_SDFG_PASSES
include "mlir/Pass/PassBase.td"
def SDFGBufferOwnership : Pass<"SDFGBufferOwnership", "mlir::ModuleOp"> {
let summary =
"Take ownership of data passed to SDFG operators.";
let description = [{ Since data passed to SDFG operators, in particular
PUT operations, may be consumed asynchronously, deallocation must be
handled by the runtime once it takes ownership. This pass removes
explicit deallocation calls where the data has no other uses
and makes copies otherwise, letting the runtime handle
deallocation when appropriate.}]; }
#endif
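
For context, here is a minimal sketch of how the new pass could be scheduled from C++, assuming only what the headers above declare (`createSDFGBufferOwnershipPass` in `mlir::concretelang`); where the pass actually sits in the real compilation pipeline is not shown in this diff:

```cpp
#include <concretelang/Dialect/SDFG/Transforms/Passes.h>
#include <llvm/Support/raw_ostream.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/Pass/PassManager.h>
#include <mlir/Support/LogicalResult.h>

// Run the ownership pass on a module before lowering SDFG put operations,
// so deallocations of buffers handed to the runtime are removed or replaced
// by copies.
mlir::LogicalResult runSDFGOwnership(mlir::ModuleOp module) {
  mlir::PassManager pm(module.getContext());
  pm.addPass(mlir::concretelang::createSDFGBufferOwnershipPass());
  if (mlir::failed(pm.run(module))) {
    llvm::errs() << "SDFGBufferOwnership failed\n";
    return mlir::failure();
  }
  return mlir::success();
}
```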

View File

@@ -25,6 +25,8 @@ bool _dfr_is_jit();
bool _dfr_is_root_node();
bool _dfr_use_omp();
bool _dfr_is_distributed();
void _dfr_run_remote_scheduler();
void _dfr_register_lib(void *dlh);
typedef enum _dfr_task_arg_type {
_DFR_TASK_ARG_BASE = 0,

View File

@@ -0,0 +1,28 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#ifndef CONCRETELANG_GPUDFG_HPP
#define CONCRETELANG_GPUDFG_HPP
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
#endif
namespace mlir {
namespace concretelang {
namespace gpu_dfg {
bool check_cuda_device_available();
bool check_cuda_runtime_enabled();
} // namespace gpu_dfg
} // namespace concretelang
} // namespace mlir
#endif
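
A small sketch of how callers might combine the two checks exposed by this header; `canOffloadToGpu` is a hypothetical helper written for illustration, not part of the diff:

```cpp
#include "concretelang/Runtime/GPUDFG.hpp"
#include <iostream>

// GPU offloading requires both a CUDA-enabled build (non-GPU wheels return
// false from check_cuda_runtime_enabled) and a visible CUDA device.
bool canOffloadToGpu() {
  using namespace mlir::concretelang::gpu_dfg;
  if (!check_cuda_runtime_enabled()) {
    std::cerr << "CUDA runtime support not compiled in, using CPU\n";
    return false;
  }
  if (!check_cuda_device_available()) {
    std::cerr << "No CUDA device available or online, using CPU\n";
    return false;
  }
  return true;
}
```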

View File

@@ -109,10 +109,8 @@ struct RuntimeContextManager {
bool allocated = false;
bool lazy_key_transfer = false;
RuntimeContextManager(bool lazy = false) : lazy_key_transfer(lazy) {
context = nullptr;
_dfr_node_level_runtime_context_manager = this;
}
RuntimeContextManager(bool lazy = false)
: context(nullptr), lazy_key_transfer(lazy) {}
void setContext(void *ctx) {
assert(context == nullptr &&

View File

@@ -83,7 +83,8 @@ uint64_t stream_emulator_get_uint64(void *stream);
void *stream_emulator_make_memref_stream(const char *name, stream_type stype);
void stream_emulator_put_memref(void *stream, uint64_t *allocated,
uint64_t *aligned, uint64_t offset,
uint64_t size, uint64_t stride);
uint64_t size, uint64_t stride,
uint64_t data_ownership);
void stream_emulator_get_memref(void *stream, uint64_t *out_allocated,
uint64_t *out_aligned, uint64_t out_offset,
uint64_t out_size, uint64_t out_stride);
@@ -93,7 +94,8 @@ void *stream_emulator_make_memref_batch_stream(const char *name,
void stream_emulator_put_memref_batch(void *stream, uint64_t *allocated,
uint64_t *aligned, uint64_t offset,
uint64_t size0, uint64_t size1,
uint64_t stride0, uint64_t stride1);
uint64_t stride0, uint64_t stride1,
uint64_t data_ownership);
void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated,
uint64_t *out_aligned,
uint64_t out_offset, uint64_t out_size0,

View File

@@ -10,40 +10,156 @@
#include <assert.h>
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "concretelang/Runtime/DFRuntime.hpp"
#define TIME_UTIL_CLOCK CLOCK_MONOTONIC
namespace mlir {
namespace concretelang {
namespace time_util {
extern bool timing_enabled;
extern struct timespec timestamp;
} // namespace time_util
} // namespace concretelang
} // namespace mlir
static inline int timespec_diff(struct timespec *, const struct timespec *,
const struct timespec *);
#define CONCRETELANG_ENABLE_TIMING() \
do { \
assert(clock_gettime(TIME_UTIL_CLOCK, \
&mlir::concretelang::time_util::timestamp) == 0); \
char *env = getenv("CONCRETE_TIMING_ENABLED"); \
if (env != nullptr) \
if (!strncmp(env, "True", 4) || !strncmp(env, "true", 4) || \
!strncmp(env, "ON", 2) || !strncmp(env, "on", 2) || \
!strncmp(env, "1", 1)) \
mlir::concretelang::time_util::timing_enabled = true; \
} while (0)
#define BEGIN_TIME(p) \
do { \
assert(clock_gettime(TIME_UTIL_CLOCK, (p)) == 0); \
if (mlir::concretelang::time_util::timing_enabled) { \
assert(clock_gettime(TIME_UTIL_CLOCK, (p)) == 0); \
} \
} while (0)
#if CONCRETELANG_DATAFLOW_EXECUTION_ENABLED
#define END_TIME(p, m) \
do { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
std::cout << "[NODE \t" << _dfr_debug_get_node_id() << "] \t" << (m) \
<< " time : \t" << (p)->tv_sec << "." << (p)->tv_nsec \
<< " seconds.\n" \
<< std::flush; \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << "[NODE \t" << _dfr_debug_get_node_id() << "] \t" << (m) \
<< " time : \t" << time_in_seconds((p)) << " seconds.\n" \
<< std::flush; \
} \
} while (0)
#define END_TIME_C(p, m, c) \
do { \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << "[NODE \t" << _dfr_debug_get_node_id() << "] \t" << (m) \
<< " [" << (c) << "] time : \t" << time_in_seconds((p)) \
<< " seconds.\n" \
<< std::flush; \
} \
} while (0)
#define END_TIME_C_ACC(p, m, c, acc) \
do { \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
timespec_acc((acc), (p), (acc)); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << "[NODE \t" << _dfr_debug_get_node_id() << "] \t" << (m) \
<< " [" << (c) << "] time : \t" << time_in_seconds((p)) \
<< " (total : " << time_in_seconds((acc)) << " )" \
<< " seconds.\n" \
<< std::flush; \
} \
} while (0)
#else
#define END_TIME(p, m) \
do { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
std::cout << (m) << " time : \t" << (p)->tv_sec << "." << (p)->tv_nsec \
<< " seconds.\n" \
<< std::flush; \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << (m) << " time : \t" << time_in_seconds((p)) \
<< " seconds.\n" \
<< std::flush; \
} \
} while (0)
#define END_TIME_C(p, m, c) \
do { \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << (m) << " [" << (c) << "] time : \t" << time_in_seconds((p)) \
<< " seconds.\n" \
<< std::flush; \
} \
} while (0)
#define END_TIME_C_ACC(p, m, c, acc) \
do { \
if (mlir::concretelang::time_util::timing_enabled) { \
struct timespec _end_time_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_end_time_tv) == 0); \
assert(timespec_diff((p), &_end_time_tv, (p)) == 0); \
timespec_acc((acc), (p), (acc)); \
struct timespec _timestamp_tv; \
assert(clock_gettime(TIME_UTIL_CLOCK, &_timestamp_tv) == 0); \
assert(timespec_diff(&_timestamp_tv, &_timestamp_tv, \
&mlir::concretelang::time_util::timestamp) == 0); \
std::cout << "[Timing logs][" << time_in_seconds(&_timestamp_tv) \
<< "] -\t"; \
std::cout << (m) << " [" << (c) << "] time : \t" << time_in_seconds((p)) \
<< " (total : " << time_in_seconds((acc)) << " )" \
<< " seconds.\n" \
<< std::flush; \
} \
} while (0)
#endif
@@ -57,6 +173,13 @@ static inline double get_thread_cpu_time(void) {
return _t;
}
static inline double time_in_seconds(struct timespec *_tv) {
double _t;
_t = _tv->tv_sec;
_t += _tv->tv_nsec * 1e-9;
return _t;
}
static inline int timespec_diff(struct timespec *_result,
const struct timespec *_px,
const struct timespec *_py) {
@@ -86,10 +209,23 @@ static inline int timespec_diff(struct timespec *_result,
return _x.tv_sec < _y.tv_sec;
}
static inline void timespec_acc(struct timespec *_result,
const struct timespec *_px,
const struct timespec *_py) {
struct timespec _x, _y;
_x = *_px;
_y = *_py;
_result->tv_sec = _x.tv_sec + _y.tv_sec;
_result->tv_nsec = _x.tv_nsec + _y.tv_nsec;
}
#else // CONCRETELANG_TIMING_ENABLED
#define CONCRETELANG_ENABLE_TIMING()
#define BEGIN_TIME(p)
#define END_TIME(p, m)
#define END_TIME_C(p, m, c)
#define END_TIME_C_ACC(p, m, c, acc)
#endif // CONCRETELANG_TIMING_ENABLED
#endif // header guard
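
A sketch of a call site for these macros; the header path and the surrounding function are assumptions made here for illustration. Note the double gating: the macros expand to nothing unless the build defines `CONCRETELANG_TIMING_ENABLED`, and even then they only print when the `CONCRETE_TIMING_ENABLED` environment variable is set to a truthy value:

```cpp
#include <time.h>
// Assumed header name for the timing macros shown above.
#include "concretelang/Runtime/time_util.h"

void run_measured_section() {
  // Reads CONCRETE_TIMING_ENABLED once and records the reference timestamp
  // used for the "[Timing logs][...]" prefix.
  CONCRETELANG_ENABLE_TIMING();

  struct timespec t;
  BEGIN_TIME(&t);
  // ... work to be measured ...
  END_TIME(&t, "measured section"); // prints elapsed seconds when enabled
}
```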

View File

@@ -0,0 +1,21 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#ifndef CONCRETELANG_RUNTIME_UTILS_H
#define CONCRETELANG_RUNTIME_UTILS_H
#include "llvm/Support/TargetSelect.h"
namespace mlir {
namespace concretelang {
// Mainly a wrapper around some LLVM functions. The reason to have this wrapper is
// to avoid linking conflicts between the Python binary extension and LLVM.
void LLVMInitializeNativeTarget();
} // namespace concretelang
} // namespace mlir
#endif
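
A sketch of the intended call site, under the assumption (not shown in the diff) that JIT setup code calls this wrapper rather than LLVM's own initializer, so the Python extension and the compiler libraries resolve the same symbols; the header path is assumed:

```cpp
// Assumed header path for the declaration above.
#include "concretelang/Runtime/runtime_utils.h"

void prepareNativeTarget() {
  // Wrapper around LLVM's native-target initialization, kept behind this
  // function to avoid linking conflicts with the Python binary extension.
  mlir::concretelang::LLVMInitializeNativeTarget();
}
```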

View File

@@ -21,13 +21,11 @@ namespace concretelang {
namespace dfr {
struct WorkFunctionRegistry;
namespace {
static void *dl_handle;
static WorkFunctionRegistry *_dfr_node_level_work_function_registry;
} // namespace
extern WorkFunctionRegistry *_dfr_node_level_work_function_registry;
extern void *dl_handle;
struct WorkFunctionRegistry {
WorkFunctionRegistry() { _dfr_node_level_work_function_registry = this; }
WorkFunctionRegistry() = default;
wfnptr getWorkFunctionPointer(const std::string &name) {
std::lock_guard<std::mutex> guard(registry_guard);

View File

@@ -26,8 +26,6 @@ using concretelang::protocol::Message;
namespace mlir {
namespace concretelang {
bool getEmitGPUOption();
/// Compilation context that acts as the root owner of LLVM and MLIR
/// data structures directly and indirectly referenced by artefacts
/// produced by the `CompilerEngine`.

View File

@@ -23,6 +23,7 @@
#include <ostream>
#include <string>
#include <thread>
#include <unistd.h>
using concretelang::clientlib::ClientCircuit;
using concretelang::clientlib::ClientProgram;
@@ -220,37 +221,32 @@ private:
auto new_path = [=]() {
llvm::SmallString<0> outputPath;
llvm::sys::path::append(outputPath, rootFolder);
std::string uid = std::to_string(
std::hash<std::thread::id>()(std::this_thread::get_id()));
auto pid = getpid();
std::string uid = std::to_string(pid);
uid.append("-");
uid.append(std::to_string(std::rand()));
llvm::sys::path::append(outputPath, uid);
return std::string(outputPath);
};
// macOS sometimes fails to create new directories, so we have to retry a few
// times.
for (size_t i = 0; i < 5; i++) {
auto pathString = new_path();
auto ec = std::error_code();
llvm::errs() << "TestProgram: create temporary directory(" << pathString
<< ")\n";
if (!std::filesystem::create_directory(pathString, ec)) {
llvm::errs() << "TestProgram: fail to create temporary directory("
<< pathString << "), ";
if (ec) {
llvm::errs() << "already exists";
} else {
llvm::errs() << "error(" << ec.message() << ")";
}
auto pathString = new_path();
auto ec = std::error_code();
llvm::errs() << "TestProgram: create temporary directory(" << pathString
<< ")\n";
if (!std::filesystem::create_directory(pathString, ec)) {
llvm::errs() << "TestProgram: fail to create temporary directory("
<< pathString << "), ";
if (ec) {
llvm::errs() << "already exists";
} else {
llvm::errs() << "TestProgram: directory(" << pathString
<< ") successfully created\n";
return pathString;
llvm::errs() << "error(" << ec.message() << ")";
}
assert(false);
} else {
llvm::errs() << "TestProgram: directory(" << pathString
<< ") successfully created\n";
return pathString;
}
llvm::errs() << "Failed to create temp directory 5 times. Aborting...\n";
assert(false);
}
};

View File

@@ -12,6 +12,7 @@
#include "concretelang/Common/Keysets.h"
#include "concretelang/Dialect/FHE/IR/FHEOpsDialect.h.inc"
#include "concretelang/Runtime/DFRuntime.hpp"
#include "concretelang/Runtime/GPUDFG.hpp"
#include "concretelang/ServerLib/ServerLib.h"
#include "concretelang/Support/logging.h"
#include <llvm/Support/Debug.h>
@@ -462,6 +463,14 @@ void initDataflowParallelization() {
mlir::concretelang::dfr::_dfr_set_required(true);
}
bool checkGPURuntimeEnabled() {
return mlir::concretelang::gpu_dfg::check_cuda_runtime_enabled();
}
bool checkCudaDeviceAvailable() {
return mlir::concretelang::gpu_dfg::check_cuda_device_available();
}
std::string roundTrip(const char *module) {
std::shared_ptr<mlir::concretelang::CompilationContext> ccx =
mlir::concretelang::CompilationContext::createShared();
@@ -673,6 +682,8 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
m.def("terminate_df_parallelization", &terminateDataflowParallelization);
m.def("init_df_parallelization", &initDataflowParallelization);
m.def("check_gpu_runtime_enabled", &checkGPURuntimeEnabled);
m.def("check_cuda_device_available", &checkCudaDeviceAvailable);
pybind11::enum_<mlir::concretelang::Backend>(m, "Backend")
.value("CPU", mlir::concretelang::Backend::CPU)
@@ -957,6 +968,7 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
::concretelang::serverlib::ServerLambda lambda,
::concretelang::clientlib::PublicArguments &publicArguments,
::concretelang::clientlib::EvaluationKeys &evaluationKeys) {
pybind11::gil_scoped_release release;
SignalGuard signalGuard;
return library_server_call(support, lambda, publicArguments,
evaluationKeys);
@@ -1238,6 +1250,7 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
::concretelang::clientlib::PublicArguments &publicArguments,
::concretelang::clientlib::EvaluationKeys &evaluationKeys) {
SignalGuard signalGuard;
pybind11::gil_scoped_release release;
auto keyset = evaluationKeys.keyset;
auto values = publicArguments.values;
GET_OR_THROW_RESULT(auto output, circuit.call(keyset, values));
@@ -1268,6 +1281,7 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
[](::concretelang::clientlib::ValueExporter &exporter,
size_t position, int64_t value) {
SignalGuard signalGuard;
pybind11::gil_scoped_release release;
auto info = exporter.circuit.getCircuitInfo()
.asReader()
@@ -1288,6 +1302,7 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
size_t position, std::vector<int64_t> values,
std::vector<int64_t> shape) {
SignalGuard signalGuard;
pybind11::gil_scoped_release release;
std::vector<size_t> dimensions(shape.begin(), shape.end());
auto info =
exporter.circuit.getCircuitInfo().asReader().getInputs()[position];
@@ -1363,6 +1378,7 @@ void mlir::concretelang::python::populateCompilerAPISubmodule(
size_t position,
::concretelang::clientlib::SharedScalarOrTensorData &value) {
SignalGuard signalGuard;
pybind11::gil_scoped_release release;
auto result =
decrypter.circuit.processOutput(value.value, position);

View File

@@ -8,6 +8,8 @@ import atexit
from mlir._mlir_libs._concretelang._compiler import (
terminate_df_parallelization as _terminate_df_parallelization,
init_df_parallelization as _init_df_parallelization,
check_gpu_runtime_enabled as _check_gpu_runtime_enabled,
check_cuda_device_available as _check_cuda_device_available,
)
from mlir._mlir_libs._concretelang._compiler import round_trip as _round_trip
from mlir._mlir_libs._concretelang._compiler import (
@@ -49,6 +51,18 @@ def init_dfr():
_init_df_parallelization()
def check_gpu_enabled() -> bool:
"""Check whether the compiler and runtime support GPU offloading.
GPU offloading is not always available, in particular in non-GPU wheels."""
return _check_gpu_runtime_enabled()
def check_gpu_available() -> bool:
"""Check whether a CUDA device is available and online."""
return _check_cuda_device_available()
# Cleanly terminate the dataflow runtime if it has been initialized
# (does nothing otherwise)
atexit.register(_terminate_df_parallelization)

View File

@@ -403,13 +403,19 @@ struct LowerSDFGPut
"SDFG streams only support memrefs and integers.");
funcName = stream_emulator_put_uint64;
}
// Add the data ownership flag, which indicates whether the put operation
// takes ownership of the memref data; it is set to 0 by default.
mlir::SmallVector<mlir::Value> operands(putOp->getOperands());
operands.push_back(rewriter.create<mlir::arith::ConstantOp>(
putOp.getLoc(), rewriter.getI64IntegerAttr(0)));
if (insertGenericForwardDeclaration(putOp, rewriter, funcName,
putOp->getOperandTypes(),
mlir::ValueRange{operands}.getTypes(),
putOp->getResultTypes())
.failed())
return ::mlir::failure();
mlir::SmallVector<mlir::Value> newOps;
castDynamicTensorOps(putOp, rewriter, putOp->getOperands(), newOps);
castDynamicTensorOps(putOp, rewriter, operands, newOps);
rewriter.replaceOpWithNewOp<mlir::func::CallOp>(
putOp, funcName, putOp->getResultTypes(), newOps);
return ::mlir::success();

View File

@@ -3,6 +3,7 @@
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#include "concretelang/Dialect/Tracing/IR/TracingOps.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -686,6 +687,37 @@ struct ZeroTensorOpPattern
};
};
struct TraceCiphertextOpPattern
: public mlir::OpConversionPattern<Tracing::TraceCiphertextOp> {
TraceCiphertextOpPattern(mlir::MLIRContext *context,
mlir::TypeConverter &typeConverter)
: mlir::OpConversionPattern<Tracing::TraceCiphertextOp>(
typeConverter, context,
mlir::concretelang::DEFAULT_PATTERN_BENEFIT) {}
::mlir::LogicalResult
matchAndRewrite(Tracing::TraceCiphertextOp traceCiphertextOp,
Tracing::TraceCiphertextOp::Adaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
Tracing::TracePlaintextOp ptOp =
rewriter.replaceOpWithNewOp<Tracing::TracePlaintextOp>(
traceCiphertextOp, mlir::TypeRange{}, adaptor.getCiphertext());
if (auto msg = traceCiphertextOp.getMsg())
ptOp.setMsg(msg);
if (auto nmsb = traceCiphertextOp.getNmsb())
ptOp.setNmsb(nmsb);
auto inputWidth =
ptOp.getPlaintext().getType().cast<mlir::IntegerType>().getWidth();
ptOp->setAttr("input_width", rewriter.getI64IntegerAttr(inputWidth));
return ::mlir::success();
};
};
struct SimulateTFHEPass : public SimulateTFHEBase<SimulateTFHEPass> {
bool enableOverflowDetection;
SimulateTFHEPass(bool enableOverflowDetection)
@@ -704,7 +736,8 @@ void SimulateTFHEPass::runOnOperation() {
target.addLegalOp<mlir::func::CallOp, mlir::memref::GetGlobalOp,
mlir::memref::CastOp, mlir::bufferization::AllocTensorOp,
mlir::tensor::CastOp, mlir::LLVM::GlobalOp,
mlir::LLVM::AddressOfOp, mlir::LLVM::GEPOp>();
mlir::LLVM::AddressOfOp, mlir::LLVM::GEPOp,
Tracing::TracePlaintextOp>();
// Make sure that no ops from `TFHE` remain after the lowering
target.addIllegalDialect<TFHE::TFHEDialect>();
@@ -742,11 +775,11 @@ void SimulateTFHEPass::runOnOperation() {
mlir::tensor::InsertOp, mlir::tensor::InsertSliceOp,
mlir::tensor::ParallelInsertSliceOp, mlir::tensor::FromElementsOp,
mlir::tensor::ExpandShapeOp, mlir::tensor::CollapseShapeOp,
mlir::bufferization::AllocTensorOp, mlir::tensor::EmptyOp>(
[&](mlir::Operation *op) {
return converter.isLegal(op->getResultTypes()) &&
converter.isLegal(op->getOperandTypes());
});
mlir::bufferization::AllocTensorOp, mlir::tensor::EmptyOp,
Tracing::TraceCiphertextOp>([&](mlir::Operation *op) {
return converter.isLegal(op->getResultTypes()) &&
converter.isLegal(op->getOperandTypes());
});
// Make sure that no ops `linalg.generic` that have illegal types
target
.addDynamicallyLegalOp<mlir::linalg::GenericOp, mlir::tensor::GenerateOp>(
@@ -778,8 +811,8 @@ void SimulateTFHEPass::runOnOperation() {
patterns.insert<ZeroOpPattern, ZeroTensorOpPattern, KeySwitchGLWEOpPattern,
WopPBSGLWEOpPattern, EncodeLutForCrtWopPBSOpPattern,
EncodePlaintextWithCrtOpPattern, NegOpPattern>(&getContext(),
converter);
EncodePlaintextWithCrtOpPattern, NegOpPattern,
TraceCiphertextOpPattern>(&getContext(), converter);
patterns.insert<SubIntGLWEOpPattern>(&getContext());
// if overflow detection is enable, then rewrite to CAPI functions that

View File

@@ -6,11 +6,13 @@
#include <chrono>
#include <cmath>
#include <initializer_list>
#include <iostream>
#include <optional>
#include <vector>
#include "boost/outcome.h"
#include "concretelang/Dialect/FHE/Interfaces/FHEInterfaces.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -34,6 +36,9 @@
#define GEN_PASS_CLASSES
#include "concretelang/Dialect/FHE/Analysis/ConcreteOptimizer.h.inc"
using concrete_optimizer::utils::location_from_string;
using concrete_optimizer::utils::location_unknown;
namespace mlir {
namespace concretelang {
namespace optimizer {
@@ -110,7 +115,9 @@ struct FunctionToDag {
}
auto precision = fhe::utils::getEintPrecision(arg);
auto shape = getShape(arg);
auto opI = dagBuilder.add_input(precision, slice(shape));
auto opI =
dagBuilder.add_input(precision, slice(shape),
*loc_to_location(arg.getParentRegion()->getLoc()));
index[arg] = opI;
return opI;
}
@@ -190,6 +197,11 @@ struct FunctionToDag {
} else if (auto matmulEintEint = asMatmulEintEint(op)) {
addEncMatMulTensor(matmulEintEint, encrypted_inputs, precision);
return;
} else if (auto zero = asZeroNoise(op)) {
// special case, as zero ops are rewritten into several optimizer nodes
index = addZeroNoise(zero);
} else if (auto additive = asAdditiveNoise(op)) {
index = addAdditiveNoise(additive, encrypted_inputs);
} else {
index = addLevelledOp(op, encrypted_inputs);
}
@@ -201,6 +213,7 @@ struct FunctionToDag {
void addLut(mlir::Operation &op, FHE::FheIntegerInterface inputType,
Inputs &encrypted_inputs, int precision) {
auto val = op.getResult(0);
auto loc = loc_to_location(op.getLoc());
assert(encrypted_inputs.size() == 1);
// No need to distinguish different lut kind until we do approximate
// paradigm on outputs
@@ -209,13 +222,14 @@ struct FunctionToDag {
std::vector<int32_t> operatorIndexes;
if (inputType.isSigned()) {
// std::vector<std::int64_t> weights_vector{1};
auto addIndex = dagBuilder.add_dot(
slice(encrypted_inputs), concrete_optimizer::weights::number(1));
auto addIndex =
dagBuilder.add_dot(slice(encrypted_inputs),
concrete_optimizer::weights::number(1), *loc);
encrypted_input = addIndex;
operatorIndexes.push_back(addIndex.index);
}
auto lutIndex =
dagBuilder.add_lut(encrypted_input, slice(unknowFunction), precision);
auto lutIndex = dagBuilder.add_lut(encrypted_input, slice(unknowFunction),
precision, *loc);
operatorIndexes.push_back(lutIndex.index);
mlir::Builder builder(op.getContext());
if (setOptimizerID)
@@ -229,7 +243,8 @@ struct FunctionToDag {
// No need to distinguish different lut kind until we do approximate
// paradigm on outputs
auto encrypted_input = encrypted_inputs[0];
index[val] = dagBuilder.add_round_op(encrypted_input, rounded_precision);
index[val] = dagBuilder.add_round_op(encrypted_input, rounded_precision,
*loc_to_location(val.getLoc()));
return index[val];
}
@@ -238,7 +253,8 @@ struct FunctionToDag {
int new_precision) {
assert(encrypted_inputs.size() == 1);
auto encrypted_input = encrypted_inputs[0];
index[val] = dagBuilder.add_unsafe_cast_op(encrypted_input, new_precision);
index[val] = dagBuilder.add_unsafe_cast_op(encrypted_input, new_precision,
*loc_to_location(val.getLoc()));
return index[val];
}
@@ -247,8 +263,8 @@ struct FunctionToDag {
std::vector<std::int64_t> &weights_vector) {
assert(encrypted_inputs.size() == 1);
auto weights = concrete_optimizer::weights::vector(slice(weights_vector));
index[val] =
dagBuilder.add_dot(slice(encrypted_inputs), std::move(weights));
index[val] = dagBuilder.add_dot(slice(encrypted_inputs), std::move(weights),
*loc_to_location(val.getLoc()));
return index[val];
}
@@ -256,32 +272,110 @@ struct FunctionToDag {
std::string loc;
llvm::raw_string_ostream loc_stream(loc);
location.print(loc_stream);
loc.erase(0, 4); // erase the loc( prefix
loc.pop_back(); // erase the ) postfix
loc.erase(std::remove(loc.begin(), loc.end(), '"'),
loc.end()); // erase the " characters
return loc;
}
concrete_optimizer::dag::OperatorIndex
addZeroNoise(concretelang::FHE::ZeroNoise &op) {
auto val = op->getOpResult(0);
auto outShape = getShape(val);
auto loc = loc_to_location(op.getLoc());
// Trivial encrypted constants encoding
// These are converted to input + levelled op
auto precision = fhe::utils::getEintPrecision(val);
auto opI = dagBuilder.add_input(precision, slice(outShape), *loc);
auto inputs = Inputs{opI};
// Default complexity is negligible
double const fixedCost = NEGLIGIBLE_COMPLEXITY;
double const lweDimCostFactor = NEGLIGIBLE_COMPLEXITY;
auto comment = std::string(op->getName().getStringRef()) + " " +
loc_to_string(op.getLoc());
auto weights = std::vector<double>{1.};
index[val] = dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor,
fixedCost, slice(weights),
slice(outShape), comment, *loc);
return index[val];
}
concrete_optimizer::dag::OperatorIndex
addAdditiveNoise(concretelang::FHE::AdditiveNoise &op, Inputs &inputs) {
auto val = op->getResult(0);
auto out_shape = getShape(val);
// Default complexity is negligible
double fixed_cost = NEGLIGIBLE_COMPLEXITY;
double lwe_dim_cost_factor = NEGLIGIBLE_COMPLEXITY;
auto comment = std::string(op->getName().getStringRef()) + " " +
loc_to_string(op.getLoc());
auto loc = loc_to_location(op.getLoc());
auto weights = std::vector<double>(inputs.size(), 1.);
index[val] = dagBuilder.add_levelled_op(slice(inputs), lwe_dim_cost_factor,
fixed_cost, slice(weights),
slice(out_shape), comment, *loc);
return index[val];
}
rust::Box<concrete_optimizer::Location>
loc_to_location(mlir::Location location) {
return location_from_string(loc_to_string(location));
}
concrete_optimizer::dag::OperatorIndex addLevelledOp(mlir::Operation &op,
Inputs &inputs) {
auto val = op.getResult(0);
auto out_shape = getShape(val);
auto loc = loc_to_location(op.getLoc());
if (inputs.empty()) {
// Trivial encrypted constants encoding
// These are converted to input + levelled op
auto precision = fhe::utils::getEintPrecision(val);
auto opI = dagBuilder.add_input(precision, slice(out_shape));
auto opI = dagBuilder.add_input(precision, slice(out_shape), *loc);
inputs.push_back(opI);
}
// Default complexity is negligible
double fixed_cost = NEGLIGIBLE_COMPLEXITY;
double lwe_dim_cost_factor = NEGLIGIBLE_COMPLEXITY;
auto smanp_int = op.getAttrOfType<mlir::IntegerAttr>("SMANP");
auto loc = loc_to_string(op.getLoc());
assert(smanp_int && "Missing manp value on a crypto operation");
// TODO: use APIFloat.sqrt when it's available
double manp = sqrt(smanp_int.getValue().roundToDouble());
auto comment = std::string(op.getName().getStringRef()) + " " + loc;
index[val] =
dagBuilder.add_levelled_op(slice(inputs), lwe_dim_cost_factor,
fixed_cost, manp, slice(out_shape), comment);
auto comment = std::string(op.getName().getStringRef()) + " " +
loc_to_string(op.getLoc());
size_t n_inputs = 0;
double sq_sum = 0;
for (auto input : op.getOperands()) {
if (!fhe::utils::isEncryptedValue(input)) {
continue;
}
n_inputs += 1;
if (input.isa<mlir::BlockArgument>()) {
sq_sum += 1.0;
} else {
auto inpSmanpInt =
input.getDefiningOp()->getAttrOfType<mlir::IntegerAttr>("SMANP");
const double inpSManp = inpSmanpInt.getValue().roundToDouble();
sq_sum += inpSManp;
}
}
assert(inputs.size() == n_inputs);
double weight;
if (sq_sum == 0) {
// The max input manp is zero, meaning the inputs are all zero tensors
// with no noise. In this case the weight does not matter since it will
// multiply zero.
weight = 1.;
} else {
double smanp_dbl = smanp_int.getValue().roundToDouble();
weight = std::max(sqrt(smanp_dbl / sq_sum), 1.0);
assert(!std::isnan(weight));
}
auto weights = std::vector<double>(n_inputs, weight);
index[val] = dagBuilder.add_levelled_op(slice(inputs), lwe_dim_cost_factor,
fixed_cost, slice(weights),
slice(out_shape), comment, *loc);
return index[val];
}
@@ -297,17 +391,19 @@ struct FunctionToDag {
assert(encrypted_inputs.size() == 1);
auto input = lsbOp.getInput();
auto result = lsbOp.getResult();
auto loc = loc_to_location(lsbOp.getLoc());
auto input_precision = fhe::utils::getEintPrecision(input);
auto output_precision = fhe::utils::getEintPrecision(result);
auto lsb_shiffted_as_1bit_wop = dagBuilder.add_dot(
slice(encrypted_inputs),
concrete_optimizer::weights::number(1 << input_precision));
concrete_optimizer::weights::number(1 << input_precision), *loc);
std::vector<std::uint64_t> unknownFunction;
auto overflow_bit_precision = 0;
auto lsb_as_0_bits = dagBuilder.add_unsafe_cast_op(
lsb_shiffted_as_1bit_wop, overflow_bit_precision); // id for rotation
auto lsb_as_0_bits = dagBuilder.add_unsafe_cast_op(lsb_shiffted_as_1bit_wop,
overflow_bit_precision,
*loc); // id for rotation
auto lsb_result = dagBuilder.add_lut(lsb_as_0_bits, slice(unknownFunction),
output_precision);
output_precision, *loc);
auto lsb_result_corrected = idPlaceholder(lsb_result);
index[result] = lsb_result_corrected;
@@ -336,46 +432,23 @@ struct FunctionToDag {
mlir::Value result = mulOp.getResult();
const std::vector<uint64_t> resultShape = getShape(result);
Operation *xOp = mulOp.getLhs().getDefiningOp();
Operation *yOp = mulOp.getRhs().getDefiningOp();
const double fixedCost = NEGLIGIBLE_COMPLEXITY;
const double lweDimCostFactor = NEGLIGIBLE_COMPLEXITY;
llvm::APInt xSmanp = llvm::APInt{1, 1, false};
if (xOp != nullptr) {
const auto xSmanpAttr = xOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(xSmanpAttr && "Missing SMANP value on a crypto operation");
xSmanp = xSmanpAttr.getValue();
}
llvm::APInt ySmanp = llvm::APInt{1, 1, false};
if (yOp != nullptr) {
const auto ySmanpAttr = yOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(ySmanpAttr && "Missing SMANP value on a crypto operation");
ySmanp = ySmanpAttr.getValue();
}
auto loc = loc_to_string(mulOp.getLoc());
auto comment = std::string(mulOp->getName().getStringRef()) + " " + loc;
// (x + y) and (x - y)
const double addSubManp =
sqrt(xSmanp.roundToDouble() + ySmanp.roundToDouble());
// tlu(v)
const double tluManp = 1;
// tlu(v1) - tlu(v2)
const double tluSubManp = sqrt(tluManp + tluManp);
auto comment = std::string(mulOp->getName().getStringRef()) + " " +
loc_to_string(mulOp.getLoc());
auto loc = loc_to_location(mulOp.getLoc());
// for tlus
const std::vector<std::uint64_t> unknownFunction;
// tlu(x + y)
auto addNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
addSubManp, slice(resultShape), comment);
auto addWeights = std::vector<double>{1, 1};
auto addNode = dagBuilder.add_levelled_op(
slice(inputs), lweDimCostFactor, fixedCost, slice(addWeights),
slice(resultShape), comment, *loc);
std::optional<concrete_optimizer::dag::OperatorIndex> lhsCorrectionNode;
if (isSignedEint(mulOp.getType())) {
// If signed mul we need to add the addition node for correction of the
@@ -383,31 +456,37 @@ struct FunctionToDag {
addNode = dagBuilder.add_dot(
slice(std::vector<concrete_optimizer::dag::OperatorIndex>{addNode}),
concrete_optimizer::weights::vector(
slice(std::vector<std::int64_t>{1})));
slice(std::vector<std::int64_t>{1})),
*loc);
lhsCorrectionNode = addNode;
}
auto lhsTluNode =
dagBuilder.add_lut(addNode, slice(unknownFunction), precision);
dagBuilder.add_lut(addNode, slice(unknownFunction), precision, *loc);
// tlu(x - y)
auto subNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
addSubManp, slice(resultShape), comment);
auto subWeights = std::vector<double>{1, 1};
auto subNode = dagBuilder.add_levelled_op(
slice(inputs), lweDimCostFactor, fixedCost, slice(subWeights),
slice(resultShape), comment, *loc);
// This is a signed tlu so we need to also add the addition for correction
// signed tlu
auto rhsCorrectionNode = dagBuilder.add_dot(
slice(std::vector<concrete_optimizer::dag::OperatorIndex>{subNode}),
concrete_optimizer::weights::vector(
slice(std::vector<std::int64_t>{1})));
auto rhsTluNode = dagBuilder.add_lut(rhsCorrectionNode,
slice(unknownFunction), precision);
slice(std::vector<std::int64_t>{1})),
*loc);
auto rhsTluNode = dagBuilder.add_lut(
rhsCorrectionNode, slice(unknownFunction), precision, *loc);
// tlu(x + y) - tlu(x - y)
auto resultWeights = std::vector<double>{1, 1};
const std::vector<concrete_optimizer::dag::OperatorIndex> subInputs = {
lhsTluNode, rhsTluNode};
auto resultNode = dagBuilder.add_levelled_op(
slice(subInputs), lweDimCostFactor, fixedCost, tluSubManp,
slice(resultShape), comment);
slice(subInputs), lweDimCostFactor, fixedCost, slice(resultWeights),
slice(resultShape), comment, *loc);
index[result] = resultNode;
mlir::Builder builder(mulOp.getContext());
@@ -430,6 +509,7 @@ struct FunctionToDag {
addTensorInnerProductEncEnc(InnerProductOp &innerProductOp, Inputs &inputs,
int precision) {
mlir::Value result = innerProductOp.getResult();
auto loc = loc_to_location(innerProductOp.getLoc());
const std::vector<uint64_t> resultShape = getShape(result);
// We assume a first tensorized matmul step
@@ -512,34 +592,11 @@ struct FunctionToDag {
// 1. (x + y) and (x - y) -> supposing broadcasting is used
// to tensorize this operation
Operation *xOp = innerProductOp.getLhs().getDefiningOp();
Operation *yOp = innerProductOp.getRhs().getDefiningOp();
const double fixedCost = NEGLIGIBLE_COMPLEXITY;
const double lweDimCostFactor = NEGLIGIBLE_COMPLEXITY;
llvm::APInt xSmanp = llvm::APInt{1, 1, false};
if (xOp != nullptr) {
const auto xSmanpAttr = xOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(xSmanpAttr && "Missing SMANP value on a crypto operation");
xSmanp = xSmanpAttr.getValue();
}
llvm::APInt ySmanp = llvm::APInt{1, 1, false};
if (yOp != nullptr) {
const auto ySmanpAttr = yOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(ySmanpAttr && "Missing SMANP value on a crypto operation");
ySmanp = ySmanpAttr.getValue();
}
auto loc = loc_to_string(innerProductOp.getLoc());
auto comment =
std::string(innerProductOp->getName().getStringRef()) + " " + loc;
// (x + y) and (x - y)
const double addSubManp =
sqrt(xSmanp.roundToDouble() + ySmanp.roundToDouble());
auto comment = std::string(innerProductOp->getName().getStringRef()) + " " +
loc_to_string(innerProductOp.getLoc());
// tlu(v)
const double tluManp = 1;
@@ -551,9 +608,11 @@ struct FunctionToDag {
const std::vector<std::uint64_t> unknownFunction;
// tlu(x + y)
auto addNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
addSubManp, slice(pairMatrixShape), comment);
auto addWeights = std::vector<double>{1, 1};
auto addNode = dagBuilder.add_levelled_op(
slice(inputs), lweDimCostFactor, fixedCost, slice(addWeights),
slice(pairMatrixShape), comment, *loc);
std::optional<concrete_optimizer::dag::OperatorIndex> lhsCorrectionNode;
if (isSignedEint(innerProductOp.getType())) {
// If signed mul we need to add the addition node for correction of the
@@ -561,31 +620,36 @@ struct FunctionToDag {
addNode = dagBuilder.add_dot(
slice(std::vector<concrete_optimizer::dag::OperatorIndex>{addNode}),
concrete_optimizer::weights::vector(
slice(std::vector<std::int64_t>{1})));
slice(std::vector<std::int64_t>{1})),
*loc);
lhsCorrectionNode = addNode;
}
auto lhsTluNode =
dagBuilder.add_lut(addNode, slice(unknownFunction), precision);
dagBuilder.add_lut(addNode, slice(unknownFunction), precision, *loc);
// tlu(x - y)
auto subNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
addSubManp, slice(pairMatrixShape), comment);
auto subWeights = std::vector<double>{1, 1};
auto subNode = dagBuilder.add_levelled_op(
slice(inputs), lweDimCostFactor, fixedCost, slice(subWeights),
slice(pairMatrixShape), comment, *loc);
// This is a signed tlu so we need to also add the addition for correction
// signed tlu
auto rhsCorrectionNode = dagBuilder.add_dot(
slice(std::vector<concrete_optimizer::dag::OperatorIndex>{subNode}),
concrete_optimizer::weights::vector(
slice(std::vector<std::int64_t>{1})));
auto rhsTluNode = dagBuilder.add_lut(rhsCorrectionNode,
slice(unknownFunction), precision);
slice(std::vector<std::int64_t>{1})),
*loc);
auto rhsTluNode = dagBuilder.add_lut(
rhsCorrectionNode, slice(unknownFunction), precision, *loc);
// tlu(x + y) - tlu(x - y)
auto resultWeights = std::vector<double>{1, 1};
const std::vector<concrete_optimizer::dag::OperatorIndex> subInputs = {
lhsTluNode, rhsTluNode};
auto resultNode = dagBuilder.add_levelled_op(
slice(subInputs), lweDimCostFactor, fixedCost, tluSubManp,
slice(pairMatrixShape), comment);
slice(subInputs), lweDimCostFactor, fixedCost, slice(resultWeights),
slice(pairMatrixShape), comment, *loc);
// 3. Sum(tlu(x + y) - tlu(x - y))
// Create a leveled op that simulates concatenation. It takes
@@ -606,9 +670,10 @@ struct FunctionToDag {
// TODO: use APIFloat.sqrt when it's available
double manp = sqrt(smanp_int.getValue().roundToDouble());
auto weights = std::vector<double>(sumOperands.size(), manp / tluSubManp);
index[result] = dagBuilder.add_levelled_op(
slice(sumOperands), lwe_dim_cost_factor, fixed_cost, manp,
slice(resultShape), comment);
slice(sumOperands), lwe_dim_cost_factor, fixed_cost, slice(weights),
slice(resultShape), comment, *loc);
// Create the TFHE.OId attributes
// The first elements of the vector are nodes for the encrypted
@@ -647,49 +712,31 @@ struct FunctionToDag {
void addMax(FHE::MaxEintOp &maxOp, Inputs &inputs, int precision) {
mlir::Value result = maxOp.getResult();
auto loc = loc_to_location(maxOp.getLoc());
const std::vector<uint64_t> resultShape = getShape(result);
Operation *xOp = maxOp.getX().getDefiningOp();
Operation *yOp = maxOp.getY().getDefiningOp();
const double fixedCost = NEGLIGIBLE_COMPLEXITY;
const double lweDimCostFactor = NEGLIGIBLE_COMPLEXITY;
llvm::APInt xSmanp = llvm::APInt{1, 1, false};
if (xOp != nullptr) {
const auto xSmanpAttr = xOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(xSmanpAttr && "Missing SMANP value on a crypto operation");
xSmanp = xSmanpAttr.getValue();
}
auto comment = std::string(maxOp->getName().getStringRef()) + " " +
loc_to_string(maxOp.getLoc());
llvm::APInt ySmanp = llvm::APInt{1, 1, false};
if (yOp != nullptr) {
const auto ySmanpAttr = yOp->getAttrOfType<mlir::IntegerAttr>("SMANP");
assert(ySmanpAttr && "Missing SMANP value on a crypto operation");
ySmanp = ySmanpAttr.getValue();
}
auto subWeights = std::vector<double>{1, 1};
auto subNode = dagBuilder.add_levelled_op(
slice(inputs), lweDimCostFactor, fixedCost, slice(subWeights),
slice(resultShape), comment, *loc);
const double subManp =
sqrt(xSmanp.roundToDouble() + ySmanp.roundToDouble());
auto loc = loc_to_string(maxOp.getLoc());
auto comment = std::string(maxOp->getName().getStringRef()) + " " + loc;
auto subNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
subManp, slice(resultShape), comment);
const double tluNodeManp = 1;
const std::vector<std::uint64_t> unknownFunction;
auto tluNode =
dagBuilder.add_lut(subNode, slice(unknownFunction), precision);
dagBuilder.add_lut(subNode, slice(unknownFunction), precision, *loc);
const double addManp = sqrt(tluNodeManp + ySmanp.roundToDouble());
const std::vector<concrete_optimizer::dag::OperatorIndex> addInputs = {
tluNode, inputs[1]};
auto addWeights = std::vector<double>{1, 1};
auto resultNode = dagBuilder.add_levelled_op(
slice(addInputs), lweDimCostFactor, fixedCost, addManp,
slice(resultShape), comment);
slice(addInputs), lweDimCostFactor, fixedCost, slice(addWeights),
slice(resultShape), comment, *loc);
index[result] = resultNode;
// Set attribute on the MLIR node
@@ -705,6 +752,7 @@ struct FunctionToDag {
int precision) {
mlir::Value result = maxpool2dOp.getResult();
const std::vector<uint64_t> resultShape = getShape(result);
auto loc = loc_to_location(maxpool2dOp.getLoc());
// all TLUs are flattened into a dimension
// to create a single TLU node in the optimizer DAG
@@ -732,25 +780,29 @@ struct FunctionToDag {
const double subManp = sqrt(2 * inputSmanp.roundToDouble() + 1);
auto loc = loc_to_string(maxpool2dOp.getLoc());
auto comment =
std::string(maxpool2dOp->getName().getStringRef()) + " " + loc;
auto comment = std::string(maxpool2dOp->getName().getStringRef()) + " " +
loc_to_string(maxpool2dOp.getLoc());
auto subNode =
dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor, fixedCost,
subManp, slice(fakeShape), comment);
auto subWeights = std::vector<double>(
inputs.size(), subManp / sqrt(inputSmanp.roundToDouble()));
auto subNode = dagBuilder.add_levelled_op(slice(inputs), lweDimCostFactor,
fixedCost, slice(subWeights),
slice(fakeShape), comment, *loc);
const std::vector<std::uint64_t> unknownFunction;
auto tluNode =
dagBuilder.add_lut(subNode, slice(unknownFunction), precision);
dagBuilder.add_lut(subNode, slice(unknownFunction), precision, *loc);
const double addManp = sqrt(inputSmanp.roundToDouble() + 1);
const std::vector<concrete_optimizer::dag::OperatorIndex> addInputs = {
tluNode, inputs[0]};
auto resultWeights = std::vector<double>(
addInputs.size(), addManp / sqrt(inputSmanp.roundToDouble()));
auto resultNode = dagBuilder.add_levelled_op(
slice(addInputs), lweDimCostFactor, fixedCost, addManp,
slice(resultShape), comment);
slice(addInputs), lweDimCostFactor, fixedCost, slice(resultWeights),
slice(resultShape), comment, *loc);
index[result] = resultNode;
// Set attribute on the MLIR node
mlir::Builder builder(maxpool2dOp.getContext());
@@ -771,7 +823,8 @@ struct FunctionToDag {
idPlaceholder(concrete_optimizer::dag::OperatorIndex input) {
std::vector inputs = {input};
return dagBuilder.add_dot(slice(inputs),
concrete_optimizer::weights::number(1));
concrete_optimizer::weights::number(1),
*location_unknown());
}
Inputs encryptedInputs(mlir::Operation &op) {
@@ -852,6 +905,14 @@ struct FunctionToDag {
return llvm::dyn_cast<mlir::concretelang::FHELinalg::MulEintOp>(op);
}
mlir::concretelang::FHE::ZeroNoise asZeroNoise(mlir::Operation &op) {
return llvm::dyn_cast<mlir::concretelang::FHE::ZeroNoise>(op);
}
mlir::concretelang::FHE::AdditiveNoise asAdditiveNoise(mlir::Operation &op) {
return llvm::dyn_cast<mlir::concretelang::FHE::AdditiveNoise>(op);
}
mlir::concretelang::FHE::MaxEintOp asMax(mlir::Operation &op) {
return llvm::dyn_cast<mlir::concretelang::FHE::MaxEintOp>(op);
}

View File

@@ -794,14 +794,12 @@ public:
std::optional<llvm::APInt>
norm2SqEquivFromOp(Operation *op, ArrayRef<const MANPLattice *> operands) {
std::optional<llvm::APInt> norm2SqEquiv;
if (auto cstNoiseOp =
llvm::dyn_cast<mlir::concretelang::FHE::ConstantNoise>(op)) {
if (llvm::isa<mlir::concretelang::FHE::ZeroEintOp,
mlir::concretelang::FHE::ZeroTensorOp>(op)) {
norm2SqEquiv = llvm::APInt{1, 0, false};
} else {
norm2SqEquiv = llvm::APInt{1, 1, false};
}
if (auto zeroNoiseOp =
llvm::dyn_cast<mlir::concretelang::FHE::ZeroNoise>(op)) {
norm2SqEquiv = llvm::APInt{1, 0, false};
} else if (auto cstNoiseOp =
llvm::dyn_cast<mlir::concretelang::FHE::ConstantNoise>(op)) {
norm2SqEquiv = llvm::APInt{1, 1, false};
} else if (llvm::isa<mlir::concretelang::FHE::ToBoolOp>(op) ||
llvm::isa<mlir::concretelang::FHE::FromBoolOp>(op)) {
norm2SqEquiv = getNoOpSqMANP(operands);

View File

@@ -306,19 +306,6 @@ static void lowerDataflowTaskOp(RT::DataflowTaskOp DFTOp,
DFTOp.erase();
}
static void registerWorkFunction(mlir::func::FuncOp parentFunc,
mlir::func::FuncOp workFunction) {
OpBuilder builder(parentFunc.getBody());
builder.setInsertionPointToStart(&parentFunc.getBody().front());
auto fnptr = builder.create<mlir::func::ConstantOp>(
parentFunc.getLoc(), workFunction.getFunctionType(),
SymbolRefAttr::get(builder.getContext(), workFunction.getName()));
builder.create<RT::RegisterTaskWorkFunctionOp>(parentFunc.getLoc(),
fnptr.getResult());
}
static func::FuncOp getCalledFunction(CallOpInterface callOp) {
SymbolRefAttr sym = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
if (!sym)
@@ -333,8 +320,6 @@ struct LowerDataflowTasksPass
void runOnOperation() override {
auto module = getOperation();
SmallVector<func::FuncOp, 4> workFunctions;
SmallVector<func::FuncOp, 1> entryPoints;
module.walk([&](mlir::func::FuncOp func) {
static int wfn_id = 0;
@@ -357,58 +342,12 @@ struct LowerDataflowTasksPass
outliningMap.push_back(
std::pair<RT::DataflowTaskOp, func::FuncOp>(op, outlinedFunc));
symbolTable.insert(outlinedFunc);
workFunctions.push_back(outlinedFunc);
return WalkResult::advance();
});
// Lower the DF task ops to RT dialect ops.
for (auto mapping : outliningMap)
lowerDataflowTaskOp(mapping.first, mapping.second);
// Gather all entry points (assuming no recursive calls to entry points)
// Main is always an entry-point - otherwise check if this
// function is called within the module. TODO: we assume no
// recursion.
if (func.getName() == "main")
entryPoints.push_back(func);
else {
bool found = false;
module.walk([&](mlir::func::CallOp op) {
if (getCalledFunction(op) == func)
found = true;
});
if (!found)
entryPoints.push_back(func);
}
});
for (auto entryPoint : entryPoints) {
// If this is a JIT invocation and we're not on the root node,
// we do not need to do any computation, only register all work
// functions with the runtime system
if (!workFunctions.empty()) {
if (!dfr::_dfr_is_root_node()) {
entryPoint.eraseBody();
Block *b = new Block;
FunctionType funTy = entryPoint.getFunctionType();
SmallVector<Location> locations(funTy.getInputs().size(),
entryPoint.getLoc());
b->addArguments(funTy.getInputs(), locations);
entryPoint.getBody().push_front(b);
for (int i = funTy.getNumInputs() - 1; i >= 0; --i)
entryPoint.eraseArgument(i);
for (int i = funTy.getNumResults() - 1; i >= 0; --i)
entryPoint.eraseResult(i);
OpBuilder builder(entryPoint.getBody());
builder.setInsertionPointToEnd(&entryPoint.getBody().front());
builder.create<mlir::func::ReturnOp>(entryPoint.getLoc());
}
}
// Generate code to register all work-functions with the
// runtime.
for (auto wf : workFunctions)
registerWorkFunction(entryPoint, wf);
}
}
LowerDataflowTasksPass(bool debug) : debug(debug){};
@@ -428,29 +367,27 @@ struct StartStopPass : public StartStopBase<StartStopPass> {
void runOnOperation() override {
auto module = getOperation();
int useDFR = 0;
SmallVector<func::FuncOp, 1> entryPoints;
// Gather all entry points in the module.
module.walk([&](mlir::func::FuncOp func) {
// Do not add start/stop to work functions - but if any are
// present, then we need to activate the runtime
if (func->getAttr("_dfr_work_function_attribute")) {
useDFR = 1;
} else {
// Main is always an entry-point - otherwise check if this
// function is called within the module. TODO: we assume no
// recursion.
if (func.getName() == "main")
// Work functions are never allowed to be an entry point.
if (func->getAttr("_dfr_work_function_attribute"))
return;
// Main is always an entry-point - otherwise check if this
// function is called within the module. TODO: we assume no
// recursion.
if (func.getName() == "main")
entryPoints.push_back(func);
else {
bool found = false;
module.walk([&](mlir::func::CallOp op) {
if (getCalledFunction(op) == func)
found = true;
});
if (!found)
entryPoints.push_back(func);
else {
bool found = false;
module.walk([&](mlir::func::CallOp op) {
if (getCalledFunction(op) == func)
found = true;
});
if (!found)
entryPoints.push_back(func);
}
}
});
@@ -459,7 +396,7 @@ struct StartStopPass : public StartStopBase<StartStopPass> {
OpBuilder builder(entryPoint.getBody());
builder.setInsertionPointToStart(&entryPoint.getBody().front());
Value useDFRVal = builder.create<arith::ConstantOp>(
entryPoint.getLoc(), builder.getI64IntegerAttr(useDFR));
entryPoint.getLoc(), builder.getI64IntegerAttr(1));
// Check if this entry point uses a context
Value ctx = nullptr;

View File

@@ -2,12 +2,14 @@ add_mlir_dialect_library(
ConcretelangSDFGTransforms
BufferizableOpInterfaceImpl.cpp
SDFGConvertibleOpInterfaceImpl.cpp
SDFGBufferOwnership.cpp
ADDITIONAL_HEADER_DIRS
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/Concrete
${PROJECT_SOURCE_DIR}/include/concretelang/Dialect/SDFG
DEPENDS
mlir-headers
SDFGDialect
ConcretelangSDFGTransformsPassIncGen
ConcretelangSDFGInterfaces
LINK_LIBS
PUBLIC

View File

@@ -0,0 +1,112 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Bufferization/Transforms/BufferUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Operation.h"
#include "concretelang/Conversion/Tools.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteDialect.h"
#include "concretelang/Dialect/Concrete/IR/ConcreteOps.h"
#include "concretelang/Dialect/SDFG/IR/SDFGDialect.h"
#include "concretelang/Dialect/SDFG/IR/SDFGOps.h"
#include "concretelang/Dialect/SDFG/IR/SDFGTypes.h"
#include "concretelang/Dialect/SDFG/Transforms/BufferizableOpInterfaceImpl.h"
#include "concretelang/Support/CompilerEngine.h"
#include <concretelang/Dialect/SDFG/Transforms/Passes.h>
#include <mlir/IR/AffineExpr.h>
#include <mlir/IR/AffineMap.h>
#include <mlir/IR/BuiltinTypes.h>
using namespace mlir;
using namespace mlir::bufferization;
using namespace mlir::tensor;
namespace SDFG = mlir::concretelang::SDFG;
namespace mlir {
namespace concretelang {
namespace {
static void getAliasedUses(Value val, DenseSet<OpOperand *> &aliasedUses) {
for (auto &use : val.getUses()) {
aliasedUses.insert(&use);
if (dyn_cast<ViewLikeOpInterface>(use.getOwner()))
getAliasedUses(use.getOwner()->getResult(0), aliasedUses);
}
}
static func::FuncOp getCalledFunction(CallOpInterface callOp) {
SymbolRefAttr sym = callOp.getCallableForCallee().dyn_cast<SymbolRefAttr>();
if (!sym)
return nullptr;
return dyn_cast_or_null<func::FuncOp>(
SymbolTable::lookupNearestSymbolFrom(callOp, sym));
}
struct SDFGBufferOwnershipPass
: public SDFGBufferOwnershipBase<SDFGBufferOwnershipPass> {
void runOnOperation() override {
auto module = getOperation();
std::vector<Operation *> deallocOps;
    // Find all SDFG put operations whose buffer is used only by that
    // operation and then deallocated. In such cases there is no need
    // to copy the data again in the runtime: we can take ownership of
    // the buffer instead, removing the deallocation and letting the
    // runtime deallocate when appropriate.
module.walk([&](mlir::memref::DeallocOp op) {
DominanceInfo domInfo(op);
Value alloc = op.getOperand();
DenseSet<OpOperand *> aliasedUses;
getAliasedUses(alloc, aliasedUses);
// Check if this memref is used in a SDFG put operation
for (auto use : aliasedUses) {
if (isa<mlir::func::CallOp>(use->getOwner())) {
mlir::func::CallOp callOp = cast<func::CallOp>(use->getOwner());
mlir::func::FuncOp funcOp = getCalledFunction(callOp);
std::string putName = "stream_emulator_put_memref";
if (funcOp.getName().str().compare(0, putName.size(), putName) == 0) {
// If the put operation dominates the deallocation, then
// ownership of the data can be transferred to the runtime
// and deallocation can be removed. We mark the ownership
// flag in the PUT operation to notify the runtime that it
// gets ownership.
if (domInfo.properlyDominates(callOp, op)) {
deallocOps.push_back(op);
OpBuilder builder(callOp);
mlir::Value cst1 = builder.create<mlir::arith::ConstantOp>(
callOp.getLoc(), builder.getI64IntegerAttr(1));
callOp->setOperand(2, cst1);
}
return;
}
}
}
});
for (auto dop : deallocOps) {
dop->erase();
}
}
};
} // end anonymous namespace
std::unique_ptr<mlir::Pass> createSDFGBufferOwnershipPass() {
return std::make_unique<SDFGBufferOwnershipPass>();
}
} // end namespace concretelang
} // end namespace mlir
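A minimal sketch of the ownership contract this pass establishes, using a hypothetical helper rather than the actual runtime code: the data_ownership flag that the pass sets on the put call tells the runtime whether it may keep the caller's buffer instead of making its own copy (the runtime-side counterpart appears later in this diff in stream_emulator_put_memref).

// Sketch only (hypothetical helper name, not the actual runtime code).
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

uint64_t *acquire_put_buffer(uint64_t *aligned, size_t bytes,
                             uint64_t data_ownership) {
  if (data_ownership)
    return aligned; // the pass removed the dealloc; the runtime frees it later
  uint64_t *copy = static_cast<uint64_t *>(std::malloc(bytes));
  std::memcpy(copy, aligned, bytes); // caller keeps its buffer; runtime owns the copy
  return copy;
}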

View File

@@ -997,6 +997,9 @@ public:
return mlir::failure();
}
if (oldOp->getAttr("_dfr_work_function_attribute"))
newOp->setAttr("_dfr_work_function_attribute", rewriter.getUnitAttr());
return mlir::success();
}

View File

@@ -1,12 +1,28 @@
add_compile_options(-fsized-deallocation)
if(CONCRETELANG_CUDA_SUPPORT)
add_library(ConcretelangRuntime SHARED context.cpp simulation.cpp wrappers.cpp DFRuntime.cpp key_manager.cpp
GPUDFG.cpp)
add_library(
ConcretelangRuntime SHARED
context.cpp
utils.cpp
simulation.cpp
wrappers.cpp
DFRuntime.cpp
key_manager.cpp
GPUDFG.cpp
time_util.cpp)
target_link_libraries(ConcretelangRuntime PRIVATE hwloc)
else()
add_library(ConcretelangRuntime SHARED context.cpp simulation.cpp wrappers.cpp DFRuntime.cpp key_manager.cpp
StreamEmulator.cpp)
add_library(
ConcretelangRuntime SHARED
context.cpp
utils.cpp
simulation.cpp
wrappers.cpp
DFRuntime.cpp
key_manager.cpp
GPUDFG.cpp
time_util.cpp)
endif()
add_dependencies(ConcretelangRuntime concrete_cpu concrete_cpu_noise_model concrete-protocol)

View File

@@ -36,6 +36,10 @@ static size_t num_nodes = 0;
static struct timespec init_timer, broadcast_timer, compute_timer, whole_timer;
#endif
} // namespace
void *dl_handle = nullptr;
WorkFunctionRegistry *_dfr_node_level_work_function_registry;
} // namespace dfr
} // namespace concretelang
} // namespace mlir
@@ -99,7 +103,6 @@ void _dfr_create_async_task(wfnptr wfn, void *ctx, size_t num_params,
param_types.push_back(va_arg(args, uint64_t));
}
va_end(args);
dfr_create_async_task_impl(wfn, ctx, refcounted_futures, param_sizes,
param_types, outputs, output_sizes, output_types);
}
@@ -179,6 +182,7 @@ bool _dfr_is_jit() { return is_jit_p; }
bool _dfr_is_root_node() { return is_root_node_p; }
bool _dfr_use_omp() { return use_omp_p; }
bool _dfr_is_distributed() { return num_nodes > 1; }
void _dfr_register_lib(void *dlh) { dl_handle = dlh; }
} // namespace dfr
} // namespace concretelang
} // namespace mlir
@@ -210,8 +214,10 @@ static inline void _dfr_stop_impl() {
}
static inline void _dfr_start_impl(int argc, char *argv[]) {
CONCRETELANG_ENABLE_TIMING();
BEGIN_TIME(&init_timer);
dl_handle = dlopen(nullptr, RTLD_NOW);
if (dl_handle == nullptr)
dl_handle = dlopen(nullptr, RTLD_NOW);
// If OpenMP is to be used, we need to force its initialization
// before thread binding occurs. Otherwise OMP threads will be bound
@@ -315,7 +321,7 @@ static inline void _dfr_start_impl(int argc, char *argv[]) {
is_root_node_p = (hpx::find_here() == hpx::find_root_locality());
num_nodes = hpx::get_num_localities().get();
new WorkFunctionRegistry();
_dfr_node_level_work_function_registry = new WorkFunctionRegistry();
char *env = getenv("DFR_LAZY_KEY_TRANSFER");
bool lazy = false;
@@ -324,7 +330,7 @@ static inline void _dfr_start_impl(int argc, char *argv[]) {
!strncmp(env, "On", 2) || !strncmp(env, "on", 2) ||
!strncmp(env, "1", 1))
lazy = true;
new RuntimeContextManager(lazy);
_dfr_node_level_runtime_context_manager = new RuntimeContextManager(lazy);
_dfr_jit_phase_barrier = new hpx::distributed::barrier(
"phase_barrier", num_nodes, hpx::get_locality_id());
@@ -346,6 +352,7 @@ static inline void _dfr_start_impl(int argc, char *argv[]) {
JIT invocation). These serve to pause/resume the runtime
scheduler and to clean up used resources. */
void _dfr_start(int64_t use_dfr_p, void *ctx) {
CONCRETELANG_ENABLE_TIMING();
BEGIN_TIME(&whole_timer);
if (use_dfr_p) {
// The first invocation will initialise the runtime. As each call to
@@ -365,12 +372,10 @@ void _dfr_start(int64_t use_dfr_p, void *ctx) {
if (num_nodes > 1) {
BEGIN_TIME(&broadcast_timer);
_dfr_node_level_runtime_context_manager->setContext(ctx);
}
if (_dfr_is_root_node())
_dfr_startup_barrier->wait();
if (num_nodes > 1 && ctx) {
END_TIME(&broadcast_timer, "Key broadcasting");
if (ctx) {
END_TIME(&broadcast_timer, "Key broadcasting");
}
}
}
BEGIN_TIME(&compute_timer);
@@ -383,24 +388,29 @@ void _dfr_start(int64_t use_dfr_p, void *ctx) {
void _dfr_stop(int64_t use_dfr_p) {
if (use_dfr_p) {
if (num_nodes > 1) {
// Non-root nodes synchronize here with the root to mark the point
// where the root is free to send work out (only needed in JIT).
if (!_dfr_is_root_node())
_dfr_startup_barrier->wait();
// The barrier is only needed to synchronize the different
// computation phases when the compute nodes need to generate and
// register new work functions in each phase.
_dfr_jit_phase_barrier->wait();
_dfr_node_level_runtime_context_manager->clearContext();
_dfr_node_level_work_function_registry->clearRegistry();
_dfr_jit_phase_barrier->wait();
}
}
END_TIME(&compute_timer, "Compute");
END_TIME(&whole_timer, "Total execution");
}
namespace mlir {
namespace concretelang {
namespace dfr {
void _dfr_run_remote_scheduler() {
_dfr_start(1, nullptr);
_dfr_stop(1);
}
} // namespace dfr
} // namespace concretelang
} // namespace mlir
void _dfr_try_initialize() {
// Initialize and immediately suspend the HPX runtime if not yet done.
uint64_t expected = uninitialised;
@@ -484,6 +494,8 @@ bool _dfr_is_jit() { return is_jit_p; }
bool _dfr_is_root_node() { return true; }
bool _dfr_use_omp() { return use_omp_p; }
bool _dfr_is_distributed() { return num_nodes > 1; }
void _dfr_run_remote_scheduler() {}
void _dfr_register_lib(void *dlh) {}
} // namespace dfr
} // namespace concretelang
@@ -491,8 +503,8 @@ bool _dfr_is_distributed() { return num_nodes > 1; }
using namespace mlir::concretelang::dfr;
void _dfr_start(int64_t use_dfr_p, void *ctx) { BEGIN_TIME(&compute_timer); }
void _dfr_stop(int64_t use_dfr_p) { END_TIME(&compute_timer, "Compute"); }
void _dfr_start(int64_t use_dfr_p, void *ctx) {}
void _dfr_stop(int64_t use_dfr_p) {}
void _dfr_terminate() {}
#endif

View File

@@ -3,6 +3,7 @@
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#ifdef CONCRETELANG_CUDA_SUPPORT
#include <atomic>
#include <cmath>
#include <cstdarg>
@@ -18,15 +19,11 @@
#include <utility>
#include <vector>
#include <concretelang/Runtime/GPUDFG.hpp>
#include <concretelang/Runtime/stream_emulator_api.h>
#include <concretelang/Runtime/time_util.h>
#include <concretelang/Runtime/wrappers.h>
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
using RuntimeContext = mlir::concretelang::RuntimeContext;
namespace mlir {
@@ -34,6 +31,10 @@ namespace concretelang {
namespace gpu_dfg {
namespace {
#if CONCRETELANG_TIMING_ENABLED
static struct timespec init_timer, blocking_get_timer, acc1, acc2;
#endif
using MemRef2 = MemRefDescriptor<2>;
// When not using all accelerators on the machine, we distribute work
@@ -93,6 +94,7 @@ union Context {
// across multiple locations.
static const int32_t host_location = -1;
static const int32_t split_location = -2;
static const int32_t invalid_location = -3;
// Similarly dependence chunks are either indexed (which does not
// always correlate to the device index on which they are located) or
// this dependence is split further.
@@ -216,17 +218,12 @@ struct GPU_DFG {
polynomial_size,
input_lwe_ciphertext_count);
}
void free_streams() {
streams.sort();
streams.unique();
for (auto s : streams)
delete s;
}
inline void *get_gpu_stream(int32_t loc) {
if (loc < 0)
return nullptr;
return gpus[loc].get_gpu_stream();
}
void free_streams();
private:
std::list<void *> to_free_list;
@@ -253,6 +250,7 @@ struct Dependence {
int32_t chunk_id;
size_t stream_generation;
std::vector<Dependence *> chunks;
std::vector<size_t> chunking_schedule;
Dependence(int32_t l, MemRef2 hd, void *dd, bool ohr, bool alloc = false,
int32_t chunk_id = single_chunk, size_t gen = 0)
: location(l), host_data(hd), device_data(dd), onHostReady(ohr),
@@ -266,17 +264,41 @@ struct Dependence {
// Split a dependence into a number of chunks either to run on
// multiple GPUs or execute concurrently on the host.
void split_dependence(size_t num_chunks, size_t num_gpu_chunks,
size_t chunk_dim, bool constant) {
size_t chunk_dim, bool constant,
size_t gpu_chunk_factor, GPU_DFG *dfg) {
// If this dependence is already split, check that the split
// matches the new request
if (chunk_id == split_chunks) {
if (num_chunks + num_gpu_chunks != chunks.size())
warnx("WARNING: requesting to split dependence across different number "
"of chunks (%lu) than it already is split (%lu) which would "
"require remapping. This is not supported.",
num_chunks + num_gpu_chunks, chunks.size());
assert(num_chunks + num_gpu_chunks == chunks.size());
return;
if (num_chunks + num_gpu_chunks != chunks.size()) {
// If this is not available on host, then we need to merge on
// host and re-split
if (!onHostReady) {
size_t data_size = 0;
size_t num_samples = 0;
for (auto c : chunks) {
move_chunk_off_device(c->chunk_id, dfg);
data_size += memref_get_data_size(c->host_data);
num_samples += c->host_data.sizes[chunk_dim];
sdfg_gpu_debug_print_mref("Chunk", c->host_data);
}
host_data = chunks[0]->host_data;
host_data.allocated = host_data.aligned =
(uint64_t *)malloc(data_size);
host_data.sizes[chunk_dim] = num_samples;
size_t pos = 0;
for (auto c : chunks) {
memcpy(((char *)host_data.aligned) + pos, c->host_data.aligned,
memref_get_data_size(c->host_data));
pos += memref_get_data_size(c->host_data);
}
for (auto c : chunks)
free_chunk_host_data(c->chunk_id, dfg);
onHostReady = true;
hostAllocated = true;
}
} else {
return;
}
}
if (!chunks.empty()) {
for (auto c : chunks)
@@ -301,8 +323,8 @@ struct Dependence {
return;
}
size_t chunk_size =
num_samples / (num_chunks + num_gpu_chunks * device_compute_factor);
size_t gpu_chunk_size = chunk_size * device_compute_factor;
num_samples / (num_chunks + num_gpu_chunks * gpu_chunk_factor);
size_t gpu_chunk_size = chunk_size * gpu_chunk_factor;
chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;
size_t chunk_remainder =
(num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;
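To make the split arithmetic above concrete, here is a worked example with hypothetical numbers; the variable names mirror the code, the values do not come from the project.

// Worked example: 1000 samples split over 8 host chunks and 2 GPU chunks,
// with gpu_chunk_factor = 8 (one GPU chunk is worth 8 host chunks of work).
#include <cstddef>

void chunking_example() {
  size_t num_samples = 1000, num_chunks = 8, num_gpu_chunks = 2,
         gpu_chunk_factor = 8;
  size_t chunk_size =
      num_samples / (num_chunks + num_gpu_chunks * gpu_chunk_factor); // 1000 / 24 = 41
  size_t gpu_chunk_size = chunk_size * gpu_chunk_factor;              // 328 samples per GPU chunk
  chunk_size =
      (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;   // 344 / 8 = 43 per host chunk
  size_t chunk_remainder =
      (num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;   // 0 samples left over
  (void)chunk_remainder;
}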
@@ -331,62 +353,43 @@ struct Dependence {
chunk_id = split_chunks;
location = split_location;
}
void merge_dependence(GPU_DFG *dfg) {
void finalize_merged_dependence(GPU_DFG *dfg) {
assert(!chunks.empty() && "Cannot merge dependence with no chunks");
size_t data_size = 0;
size_t num_samples = 0;
for (auto c : chunks) {
data_size += memref_get_data_size(c->host_data);
num_samples += c->host_data.sizes[0];
}
uint64_t *data = (uint64_t *)malloc(data_size);
MemRef2 output = {data,
data,
0,
{num_samples, chunks.front()->host_data.sizes[1]},
{chunks.front()->host_data.sizes[1], 1}};
std::list<cudaStream_t *> custreams_used;
for (auto c : chunks) {
// Write out the piece in the final target dependence
size_t csize = memref_get_data_size(c->host_data);
if (c->onHostReady) {
memcpy(((char *)output.aligned) + output.offset, c->host_data.aligned,
csize);
} else {
assert(c->location > host_location);
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(c->location);
cuda_memcpy_async_to_cpu(((char *)output.aligned) + output.offset,
c->device_data, csize, s, c->location);
custreams_used.push_back(s);
}
output.offset += csize;
}
output.offset = 0;
for (auto c : chunks)
c->free_data(dfg, true);
assert(host_data.allocated != nullptr);
chunks.clear();
custreams_used.sort();
custreams_used.unique();
for (auto s : custreams_used)
cudaStreamSynchronize(*s);
location = host_location;
onHostReady = true;
assert(host_data.allocated == nullptr);
host_data = output;
assert(device_data == nullptr);
hostAllocated = true;
chunk_id = single_chunk;
}
void copy_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
if (chunks[chunk_id]->onHostReady)
return;
chunks[chunk_id]->copy(host_location, dfg, false);
}
void move_chunk_off_device(int32_t chunk_id, GPU_DFG *dfg) {
chunks[chunk_id]->copy(host_location, dfg);
copy_chunk_off_device(chunk_id, dfg);
chunks[chunk_id]->location = host_location;
if (chunks[chunk_id]->device_data == nullptr)
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->location = host_location;
chunks[chunk_id]->device_data = nullptr;
}
void merge_output_off_device(int32_t chunk_id, GPU_DFG *dfg) {
assert(chunks[chunk_id]->location > host_location);
size_t data_offset = 0;
for (int32_t c = 0; c < chunk_id; ++c)
data_offset +=
chunking_schedule[c] * host_data.sizes[1] * sizeof(uint64_t);
size_t csize = memref_get_data_size(chunks[chunk_id]->host_data);
cudaStream_t *s =
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location);
cuda_memcpy_async_to_cpu(((char *)host_data.aligned) + data_offset,
chunks[chunk_id]->device_data, csize, s,
chunks[chunk_id]->location);
}
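As a worked example of the offset computation above, with made-up numbers: the chunking_schedule records how many samples each chunk holds, so the byte offset of a chunk in the merged output is the combined size of all preceding chunks.

// Illustrative only: schedule {43, 43, 43, 328} samples per chunk and
// 1025 u64 words per sample (host_data.sizes[1] in the code above).
#include <cstddef>
#include <cstdint>

size_t merged_offset_example() {
  size_t chunking_schedule[] = {43, 43, 43, 328};
  size_t row_words = 1025;
  size_t data_offset = 0;
  for (int c = 0; c < 2; ++c) // byte offset of chunk 2
    data_offset += chunking_schedule[c] * row_words * sizeof(uint64_t);
  return data_offset; // (43 + 43) * 1025 * 8 bytes
}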
void free_chunk_host_data(int32_t chunk_id, GPU_DFG *dfg) {
assert(chunks[chunk_id]->location == host_location &&
@@ -397,16 +400,18 @@ struct Dependence {
chunks[chunk_id]->onHostReady = false;
}
void free_chunk_device_data(int32_t chunk_id, GPU_DFG *dfg) {
assert(chunks[chunk_id]->location > host_location &&
chunks[chunk_id]->device_data != nullptr);
if (chunks[chunk_id]->device_data == nullptr)
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location =
(chunks[chunk_id]->onHostReady) ? host_location : invalid_location;
}
inline void free_data(GPU_DFG *dfg, bool immediate = false) {
if (location >= 0 && device_data != nullptr) {
if (device_data != nullptr) {
cuda_drop_async(device_data,
(cudaStream_t *)dfg->get_gpu_stream(location), location);
}
@@ -414,17 +419,19 @@ struct Dependence {
// As streams are not synchronized aside from the GET operation,
// we cannot free host-side data until after the synchronization
// point as it could still be used by an asynchronous operation.
if (immediate)
if (immediate) {
free(host_data.allocated);
else
host_data.allocated = nullptr;
} else {
dfg->register_stream_order_dependent_allocation(host_data.allocated);
}
}
for (auto c : chunks)
c->free_data(dfg, immediate);
chunks.clear();
delete (this);
}
inline void copy(int32_t loc, GPU_DFG *dfg) {
inline void copy(int32_t loc, GPU_DFG *dfg, bool synchronize = true) {
size_t data_size = memref_get_data_size(host_data);
if (loc == location)
return;
@@ -438,7 +445,8 @@ struct Dependence {
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(location);
cuda_memcpy_async_to_cpu(host_data.aligned, device_data, data_size, s,
location);
cudaStreamSynchronize(*s);
if (synchronize)
cudaStreamSynchronize(*s);
onHostReady = true;
} else {
assert(onHostReady &&
@@ -508,6 +516,7 @@ struct Stream {
bool ct_stream;
bool pt_stream;
size_t generation;
std::atomic<size_t> uses = {0};
const char *name;
Stream(stream_type t, const char *sname = nullptr)
: dep(nullptr), type(t), producer(nullptr), dfg(nullptr),
@@ -524,7 +533,7 @@ struct Stream {
}
~Stream() {
if (dep != nullptr)
dep->free_data(dfg);
dep->free_data(dfg, true);
if (producer != nullptr)
delete producer;
}
@@ -547,6 +556,7 @@ struct Stream {
dep = d;
}
dep->stream_generation = generation;
uses = 0;
}
// For a given dependence, traverse the DFG backwards to extract the lattice
// of kernels required to execute to produce this data
@@ -579,18 +589,22 @@ struct Stream {
size_t subgraph_bootstraps = 0;
for (auto p : queue) {
is_batched_subgraph |= p->batched_process;
subgraph_bootstraps +=
(p->fun == memref_bootstrap_lwe_u64_process) ? 1 : 0;
subgraph_bootstraps += (p->fun == memref_bootstrap_lwe_u64_process ||
p->fun == memref_keyswitch_lwe_u64_process)
? 1
: 0;
}
// If this subgraph is not batched, then use this DFG's allocated
// GPU to offload to. If this does not bootstrap, just execute on
// the host.
if (!is_batched_subgraph) {
for (auto p : queue) {
schedule_kernel(
p, (subgraph_bootstraps > 0) ? dfg->gpu_idx : host_location,
single_chunk, nullptr);
}
int32_t loc = (subgraph_bootstraps > 0) ? dfg->gpu_idx : host_location;
for (auto p : queue)
schedule_kernel(p, loc, single_chunk,
(p == producer) ? out.aligned : nullptr);
if (loc != host_location)
dep->copy(host_location, dfg, true);
dep->onHostReady = true;
return;
}
@@ -645,8 +659,6 @@ struct Stream {
num_real_inputs++;
if (s->dep->host_data.sizes[0] > num_samples)
num_samples = s->dep->host_data.sizes[0];
if (!s->dep->chunks.empty())
num_samples = s->dep->chunks.size();
} else {
mem_per_sample += sizeof(uint64_t);
}
@@ -657,7 +669,7 @@ struct Stream {
(num_real_inputs ? num_real_inputs : 1);
size_t num_chunks = 1;
size_t num_gpu_chunks = 0;
int32_t num_devices_to_use = 0;
size_t gpu_chunk_factor = device_compute_factor;
// If the subgraph does not have sufficient computational
// intensity (which we approximate by whether it bootstraps), then
// we assume (TODO: confirm with profiling) that it is not
@@ -681,16 +693,20 @@ struct Stream {
(available_mem - const_mem_per_sample) /
((mem_per_sample ? mem_per_sample : 1) * gpu_memory_inflation_factor);
if (num_samples < num_cores + device_compute_factor * num_devices) {
num_devices_to_use = 0;
while (gpu_chunk_factor > 4) {
if (num_samples < num_cores + gpu_chunk_factor * num_devices)
gpu_chunk_factor >>= 1;
else
break;
}
if (num_samples < num_cores + gpu_chunk_factor * num_devices) {
num_chunks = std::min(num_cores, num_samples);
} else {
num_devices_to_use = num_devices;
size_t compute_resources =
num_cores + num_devices * device_compute_factor;
size_t compute_resources = num_cores + num_devices * gpu_chunk_factor;
size_t gpu_chunk_size =
std::ceil((double)num_samples / compute_resources) *
device_compute_factor;
gpu_chunk_factor;
size_t scale_factor =
std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
num_chunks = num_cores * scale_factor;
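The factor-halving heuristic above can be read as a standalone loop; the machine parameters below are hypothetical (64 cores, 2 GPUs, a starting factor of 16, 90 samples) and are not taken from the project.

// Equivalent standalone form of the halving loop, illustrative values only.
#include <cstddef>

size_t pick_gpu_chunk_factor(size_t num_samples = 90, size_t num_cores = 64,
                             size_t num_devices = 2) {
  size_t gpu_chunk_factor = 16;
  while (gpu_chunk_factor > 4 &&
         num_samples < num_cores + gpu_chunk_factor * num_devices)
    gpu_chunk_factor >>= 1; // 16 -> 8 because 90 < 96; stops at 8 because 90 >= 80
  return gpu_chunk_factor;
}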
@@ -702,7 +718,8 @@ struct Stream {
for (auto i : inputs)
i->dep->split_dependence(num_chunks, num_gpu_chunks,
(i->ct_stream) ? 0 : 1, i->const_stream);
(i->ct_stream) ? 0 : 1, i->const_stream,
gpu_chunk_factor, dfg);
for (auto iv : intermediate_values) {
if (iv->need_new_gen()) {
iv->put(new Dependence(split_location,
@@ -712,21 +729,67 @@ struct Stream {
}
}
for (auto o : outputs) {
if (o->need_new_gen()) {
o->put(new Dependence(split_location,
{nullptr, nullptr, 0, {0, 0}, {0, 0}}, nullptr,
false, false, split_chunks));
o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
if (!o->need_new_gen())
continue;
std::function<uint64_t(Stream *)> get_output_size =
[&](Stream *s) -> uint64_t {
uint64_t res = 0;
// If this stream is not produced within SDFG, we could use
// the input size. For now return 0.
if (s->producer == nullptr)
return 0;
// If the producer process has an output size registered,
// return it.
if (s->producer->output_size.val > 0)
return s->producer->output_size.val;
        // Finally, look for sizes among the producer's inputs when no
        // size is registered, since the polynomial size does not change
        // in operators that do not register a size.
for (auto p : s->producer->input_streams) {
uint64_t p_size = get_output_size(p);
if (p_size == 0)
continue;
if (res == 0)
res = get_output_size(p);
else
assert(res == p_size);
}
return res;
};
MemRef2 out_mref;
bool allocated = false;
if (o == this) {
out_mref = out;
} else {
uint64_t output_size = get_output_size(o);
out_mref = {0, 0, 0, {num_samples, output_size}, {output_size, 1}};
size_t data_size = memref_get_data_size(out_mref);
out_mref.allocated = out_mref.aligned = (uint64_t *)malloc(data_size);
allocated = true;
}
o->put(new Dependence(split_location, out_mref, nullptr, false, allocated,
split_chunks));
o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
}
for (auto o : outputs) {
o->dep->chunking_schedule.clear();
for (auto i : inputs) {
size_t cdim = (i->ct_stream) ? 0 : 1;
if (i->dep->host_data.sizes[cdim] == num_samples) {
for (auto c : i->dep->chunks)
o->dep->chunking_schedule.push_back(c->host_data.sizes[cdim]);
break;
}
}
}
// Execute graph
std::list<std::thread> workers;
std::list<std::thread> gpu_schedulers;
std::vector<std::list<size_t>> gpu_chunk_list;
gpu_chunk_list.resize(num_devices);
int32_t dev = 0;
for (size_t c = 0; c < num_chunks + num_gpu_chunks; ++c) {
for (int c = num_chunks + num_gpu_chunks - 1; c >= 0; --c) {
if (!subgraph_bootstraps) {
workers.push_back(std::thread(
[&](std::list<Process *> queue, size_t c, int32_t host_location) {
@@ -742,7 +805,21 @@ struct Stream {
workers.push_back(std::thread(
[&](std::list<Process *> queue, size_t c, int32_t host_location) {
for (auto p : queue) {
schedule_kernel(p, host_location, c, nullptr);
Stream *os = p->output_streams[0];
auto it = std::find(outputs.begin(), outputs.end(), os);
if (it == outputs.end()) {
schedule_kernel(p, host_location, c, nullptr);
} else {
size_t data_offset = 0;
for (int32_t ch = 0; ch < c; ++ch)
data_offset +=
outputs.front()->dep->chunking_schedule[ch] *
os->dep->host_data.sizes[1] * sizeof(uint64_t);
schedule_kernel(
p, host_location, c,
(uint64_t *)(((char *)os->dep->host_data.aligned) +
data_offset));
}
}
for (auto iv : intermediate_values)
if (iv->consumers.size() == 1)
@@ -765,15 +842,35 @@ struct Stream {
assert(status == cudaSuccess);
cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
assert(status == cudaSuccess);
for (auto p : queue)
for (auto p : queue) {
schedule_kernel(p, dev, c, nullptr);
for (auto out_str : p->output_streams) {
                    // For each output stream of this process, if it is
                    // a subgraph output, schedule a copy out of the
                    // data produced by this process.
if (auto it =
std::find(outputs.begin(), outputs.end(), out_str);
it != outputs.end()) {
out_str->dep->merge_output_off_device(c, dfg);
continue;
}
                    // If this is not an output but one of its consumer
                    // processes is not part of this subgraph, we need
                    // to copy the data out.
for (auto cons_proc : out_str->consumers)
if (auto it =
std::find(queue.begin(), queue.end(), cons_proc);
it == queue.end())
out_str->dep->copy_chunk_off_device(c, dfg);
}
}
for (auto i : inputs)
if (++i->uses == i->consumers.size())
i->dep->free_chunk_device_data(c, dfg);
for (auto iv : intermediate_values)
if (iv->consumers.size() > 1)
iv->dep->move_chunk_off_device(c, dfg);
else
iv->dep->free_chunk_device_data(c, dfg);
iv->dep->free_chunk_device_data(c, dfg);
for (auto o : outputs)
o->dep->move_chunk_off_device(c, dfg);
o->dep->free_chunk_device_data(c, dfg);
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
}
},
@@ -786,65 +883,43 @@ struct Stream {
gs.join();
gpu_schedulers.clear();
// Build output out of the separate chunks processed
for (auto o : outputs) {
assert(o->batched_stream && o->ct_stream &&
"Only operations with ciphertext output supported.");
o->dep->merge_dependence(dfg);
}
for (auto o : outputs)
o->dep->finalize_merged_dependence(dfg);
for (dev = 0; dev < num_devices; ++dev)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
// We will assume that only one subgraph is being processed per
// DFG at a time, so we can safely free these here.
dfg->free_stream_order_dependent_data();
return;
}
Dependence *get_on_host(MemRef2 &out) {
void get_on_host(MemRef2 &out) {
// Schedule the execution of the SDFG subgraph required to compute
// the value requested
schedule_work(out);
assert(dep != nullptr && "GET on empty stream not allowed.");
// If this was already copied to host, copy out
if (dep->onHostReady) {
memref_copy_contiguous(out, dep->host_data);
return dep;
} else if (dep->location == split_location) {
char *pos = (char *)(out.aligned + out.offset);
std::list<int32_t> devices_used;
for (auto c : dep->chunks) {
size_t data_size = memref_get_data_size(c->host_data);
cuda_memcpy_async_to_cpu(
pos, c->device_data, data_size,
(cudaStream_t *)dfg->get_gpu_stream(c->location), c->location);
pos += data_size;
devices_used.push_back(c->location);
}
// We should only synchronize devices that had data chunks
devices_used.sort();
devices_used.unique();
for (auto i : devices_used)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(i));
} else {
size_t data_size = memref_get_data_size(dep->host_data);
cuda_memcpy_async_to_cpu(out.aligned + out.offset, dep->device_data,
data_size, (cudaStream_t *)dfg->gpu_stream,
dep->location);
cudaStreamSynchronize(*(cudaStream_t *)dfg->gpu_stream);
}
// After this synchronization point, all of the host-side
// allocated memory can be freed as we know all asynchronous
// operations have finished.
dfg->free_stream_order_dependent_data();
    // The result should already have been copied to the host, so there
    // is nothing to do here (transfers to the host are synchronized in
    // schedule_work).
assert(dep != nullptr && dep->onHostReady);
// For now we make a copy of this dependence for future use as we
// can't assume that the output location will remain live until
// the next use.
// TODO: eliminate this copy.
if (!dep->hostAllocated)
dep->host_data = memref_copy_alloc(out);
dep->onHostReady = true;
dep->hostAllocated = true;
return dep;
}
Dependence *get(int32_t location, int32_t chunk_id = single_chunk) {
assert(dep != nullptr && "Dependence could not be computed.");
assert(chunk_id != split_chunks);
if (chunk_id != single_chunk) {
Dependence *d = dep->chunks[chunk_id];
d->copy(location, dfg);
d->copy(location, dfg, false);
return d;
}
dep->copy(location, dfg);
dep->copy(location, dfg, false);
return dep;
}
inline bool need_new_gen(int32_t chunk_id = single_chunk) {
@@ -863,6 +938,13 @@ struct Stream {
}
};
void GPU_DFG::free_streams() {
streams.sort();
streams.unique();
for (auto s : streams)
delete s;
}
static inline mlir::concretelang::gpu_dfg::Process *
make_process_1_1(void *dfg, void *sin1, void *sout,
void (*fun)(Process *, int32_t, int32_t, uint64_t *)) {
@@ -882,6 +964,7 @@ make_process_1_1(void *dfg, void *sin1, void *sout,
p->dfg->register_stream(s1);
p->dfg->register_stream(so);
p->batched_process = s1->batched_stream;
p->output_size.val = 0;
return p;
}
@@ -909,6 +992,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
p->dfg->register_stream(s2);
p->dfg->register_stream(so);
p->batched_process = s1->batched_stream;
p->output_size.val = 0;
return p;
}
@@ -957,8 +1041,11 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d) {
uint64_t num_samples = d->host_data.sizes[0];
MemRef2 out = {
0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}};
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, p->output_size.val},
{p->output_size.val, 1}};
size_t data_size = memref_get_data_size(out);
if (loc == host_location) {
// If it is not profitable to offload, schedule kernel on CPU
@@ -972,8 +1059,8 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
d->host_data.strides[1], p->level.val, p->base_log.val,
p->input_lwe_dim.val, p->output_lwe_dim.val, p->sk_index.val,
p->ctx.val);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d->chunk_id);
return dep;
} else {
// Schedule the keyswitch kernel on the GPU
@@ -999,7 +1086,8 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
uint64_t *out_ptr) {
assert(p->output_size.val == p->glwe_dim.val * p->poly_size.val + 1);
if (!p->output_streams[0]->need_new_gen(chunk_id))
return;
Dependence *idep1 = p->input_streams[1]->get(host_location, chunk_id);
MemRef2 &mtlu = idep1->host_data;
uint32_t num_lut_vectors = mtlu.sizes[0];
@@ -1023,8 +1111,11 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
std::vector<size_t> &lut_indexes, cudaStream_t *s,
int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {
0, 0, 0, {num_samples, p->output_size.val}, {p->output_size.val, 1}};
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, p->output_size.val},
{p->output_size.val, 1}};
size_t data_size = memref_get_data_size(out);
// Move test vector indexes to the GPU, the test vector indexes is set of 0
@@ -1065,8 +1156,8 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
d1->host_data.strides[1], p->input_lwe_dim.val, p->poly_size.val,
p->level.val, p->base_log.val, p->glwe_dim.val, p->sk_index.val,
p->ctx.val);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d0->chunk_id);
free(glwe_ct);
return dep;
} else {
@@ -1129,8 +1220,8 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
assert(d0->host_data.sizes[1] == d1->host_data.sizes[1]);
assert(d0->chunk_id == d1->chunk_id);
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {0,
0,
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, d0->host_data.sizes[1]},
{d0->host_data.sizes[1], 1}};
@@ -1148,8 +1239,8 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[0],
d1->host_data.sizes[1], d1->host_data.strides[0],
d1->host_data.strides[1]);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d0->chunk_id);
return dep;
} else {
// Schedule the kernel on the GPU
@@ -1179,8 +1270,8 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
d1->host_data.sizes[1] == 1);
assert(d0->chunk_id == d1->chunk_id);
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {0,
0,
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, d0->host_data.sizes[1]},
{d0->host_data.sizes[1], 1}};
@@ -1206,8 +1297,8 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
d0->host_data.strides[1], d1->host_data.allocated,
d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[1],
d1->host_data.strides[1]);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d0->chunk_id);
return dep;
} else {
// Schedule the kernel on the GPU
@@ -1237,8 +1328,8 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
d1->host_data.sizes[1] == 1);
assert(d0->chunk_id == d1->chunk_id);
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {0,
0,
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, d0->host_data.sizes[1]},
{d0->host_data.sizes[1], 1}};
@@ -1264,8 +1355,8 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
d0->host_data.strides[1], d1->host_data.allocated,
d1->host_data.aligned, d1->host_data.offset, d1->host_data.sizes[1],
d1->host_data.strides[1]);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d0->chunk_id);
return dep;
} else {
// Schedule the keyswitch kernel on the GPU
@@ -1291,8 +1382,8 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, cudaStream_t *s, int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {0,
0,
MemRef2 out = {out_ptr,
out_ptr,
0,
{num_samples, d0->host_data.sizes[1]},
{d0->host_data.sizes[1], 1}};
@@ -1307,8 +1398,8 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
d0->host_data.aligned, d0->host_data.offset, d0->host_data.sizes[0],
d0->host_data.sizes[1], d0->host_data.strides[0],
d0->host_data.strides[1]);
Dependence *dep =
new Dependence(loc, out, nullptr, true, true, d0->chunk_id);
Dependence *dep = new Dependence(loc, out, nullptr, true,
(out_ptr == nullptr), d0->chunk_id);
return dep;
} else {
// Schedule the kernel on the GPU
@@ -1544,12 +1635,14 @@ void *stream_emulator_make_memref_stream(const char *name, stream_type stype) {
}
void stream_emulator_put_memref(void *stream, uint64_t *allocated,
uint64_t *aligned, uint64_t offset,
uint64_t size, uint64_t stride) {
uint64_t size, uint64_t stride,
uint64_t data_ownership = 0) {
assert(stride == 1 && "Strided memrefs not supported");
Stream *s = (Stream *)stream;
MemRef2 m = {allocated, aligned, offset, {1, size}, {size, stride}};
Dependence *dep =
new Dependence(host_location, memref_copy_alloc(m), nullptr, true, true);
new Dependence(host_location, (data_ownership) ? m : memref_copy_alloc(m),
nullptr, true, true);
s->put(dep);
s->generation++;
}
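For illustration, two hypothetical call sites of this function; stream, buf and n are made up, and the last argument is the new data_ownership flag.

// Ownership transferred: the runtime keeps buf and frees it itself; the
// caller must not free or reuse it.
stream_emulator_put_memref(stream, buf, buf, /*offset=*/0, /*size=*/n,
                           /*stride=*/1, /*data_ownership=*/1);
// Default (flag omitted, i.e. 0): the runtime copies the data and the
// caller still owns buf.
stream_emulator_put_memref(stream, buf, buf, 0, n, 1);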
@@ -1573,12 +1666,14 @@ void *stream_emulator_make_memref_batch_stream(const char *name,
void stream_emulator_put_memref_batch(void *stream, uint64_t *allocated,
uint64_t *aligned, uint64_t offset,
uint64_t size0, uint64_t size1,
uint64_t stride0, uint64_t stride1) {
uint64_t stride0, uint64_t stride1,
uint64_t data_ownership = 0) {
assert(stride1 == 1 && "Strided memrefs not supported");
Stream *s = (Stream *)stream;
MemRef2 m = {allocated, aligned, offset, {size0, size1}, {stride0, stride1}};
Dependence *dep =
new Dependence(host_location, memref_copy_alloc(m), nullptr, true, true);
new Dependence(host_location, (data_ownership) ? m : memref_copy_alloc(m),
nullptr, true, true);
s->put(dep);
s->generation++;
}
@@ -1587,6 +1682,10 @@ void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated,
uint64_t out_offset, uint64_t out_size0,
uint64_t out_size1, uint64_t out_stride0,
uint64_t out_stride1) {
static size_t count = 0;
END_TIME_C_ACC(&blocking_get_timer, "Non-GPU section execution", count,
&acc1);
BEGIN_TIME(&blocking_get_timer);
assert(out_stride1 == 1 && "Strided memrefs not supported");
MemRef2 mref = {out_allocated,
out_aligned,
@@ -1595,9 +1694,13 @@ void stream_emulator_get_memref_batch(void *stream, uint64_t *out_allocated,
{out_stride0, out_stride1}};
auto s = (Stream *)stream;
s->get_on_host(mref);
END_TIME_C_ACC(&blocking_get_timer, "GPU section execution", count++, &acc2);
BEGIN_TIME(&blocking_get_timer);
}
void *stream_emulator_init() {
CONCRETELANG_ENABLE_TIMING();
BEGIN_TIME(&init_timer);
int num;
assert(cudaGetDeviceCount(&num) == cudaSuccess);
num_devices = num;
@@ -1646,9 +1749,42 @@ void *stream_emulator_init() {
if (num_cores < 1)
num_cores = 1;
END_TIME(&init_timer, "Initialization of the SDFG runtime");
BEGIN_TIME(&init_timer);
int device = next_device.fetch_add(1) % num_devices;
return new GPU_DFG(device);
}
void stream_emulator_run(void *dfg) {}
void stream_emulator_run(void *dfg) {
END_TIME(&init_timer, "Building the SDFG graph");
BEGIN_TIME(&blocking_get_timer);
}
void stream_emulator_delete(void *dfg) { delete (GPU_DFG *)dfg; }
#endif
namespace mlir {
namespace concretelang {
namespace gpu_dfg {
bool check_cuda_device_available() {
#ifdef CONCRETELANG_CUDA_SUPPORT
int num;
if (cudaGetDeviceCount(&num) != cudaSuccess)
return false;
return num > 0;
#else
return false;
#endif
}
bool check_cuda_runtime_enabled() {
#ifdef CONCRETELANG_CUDA_SUPPORT
return true;
#else
return false;
#endif
}
} // namespace gpu_dfg
} // namespace concretelang
} // namespace mlir
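A minimal sketch of how a caller can guard GPU code paths with these two helpers; the real use is in the CompilerEngine change further down in this diff.

if (!mlir::concretelang::gpu_dfg::check_cuda_runtime_enabled() ||
    !mlir::concretelang::gpu_dfg::check_cuda_device_available()) {
  // Fall back to CPU-only code generation / execution.
}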

View File

@@ -41,14 +41,17 @@ RuntimeContext::RuntimeContext(ServerKeyset serverKeyset)
}
#ifdef CONCRETELANG_CUDA_SUPPORT
assert(cudaGetDeviceCount(&num_devices) == cudaSuccess);
bsk_gpu.resize(num_devices);
ksk_gpu.resize(num_devices);
for (int i = 0; i < num_devices; ++i) {
bsk_gpu[i].resize(serverKeyset.lweBootstrapKeys.size(), nullptr);
ksk_gpu[i].resize(serverKeyset.lweKeyswitchKeys.size(), nullptr);
bsk_gpu_mutex.push_back(std::make_unique<std::mutex>());
ksk_gpu_mutex.push_back(std::make_unique<std::mutex>());
if (cudaGetDeviceCount(&num_devices) == cudaSuccess) {
bsk_gpu.resize(num_devices);
ksk_gpu.resize(num_devices);
for (int i = 0; i < num_devices; ++i) {
bsk_gpu[i].resize(serverKeyset.lweBootstrapKeys.size(), nullptr);
ksk_gpu[i].resize(serverKeyset.lweKeyswitchKeys.size(), nullptr);
bsk_gpu_mutex.push_back(std::make_unique<std::mutex>());
ksk_gpu_mutex.push_back(std::make_unique<std::mutex>());
}
} else {
num_devices = 0;
}
#endif
}

View File

@@ -0,0 +1,21 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#include "concretelang/Runtime/time_util.h"
#if CONCRETELANG_TIMING_ENABLED
namespace mlir {
namespace concretelang {
namespace time_util {
bool timing_enabled = false;
struct timespec timestamp;
} // namespace time_util
} // namespace concretelang
} // namespace mlir
#endif

View File

@@ -0,0 +1,15 @@
// Part of the Concrete Compiler Project, under the BSD3 License with Zama
// Exceptions. See
// https://github.com/zama-ai/concrete/blob/main/LICENSE.txt
// for license information.
#include "concretelang/Runtime/utils.h"
namespace mlir {
namespace concretelang {
void LLVMInitializeNativeTarget() {
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
}
} // namespace concretelang
} // namespace mlir

View File

@@ -112,7 +112,6 @@ void memref_batched_keyswitch_lwe_cuda_u64(
uint64_t ct0_stride0, uint64_t ct0_stride1, uint32_t level,
uint32_t base_log, uint32_t input_lwe_dim, uint32_t output_lwe_dim,
uint32_t ksk_index, mlir::concretelang::RuntimeContext *context) {
assert(ksk_index == 0 && "multiple ksk is not yet implemented on GPU");
assert(out_size0 == ct0_size0);
assert(out_size1 == output_lwe_dim + 1);
assert(ct0_size1 == input_lwe_dim + 1);
@@ -159,7 +158,6 @@ void memref_batched_bootstrap_lwe_cuda_u64(
uint64_t tlu_stride, uint32_t input_lwe_dim, uint32_t poly_size,
uint32_t level, uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
mlir::concretelang::RuntimeContext *context) {
assert(bsk_index == 0 && "multiple bsk is not yet implemented on GPU");
assert(out_size0 == ct0_size0);
assert(out_size1 == glwe_dim * poly_size + 1);
// TODO: Multi GPU
@@ -247,7 +245,6 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
uint32_t input_lwe_dim, uint32_t poly_size, uint32_t level,
uint32_t base_log, uint32_t glwe_dim, uint32_t bsk_index,
mlir::concretelang::RuntimeContext *context) {
assert(bsk_index == 0 && "multiple bsk is not yet implemented on GPU");
assert(out_size0 == ct0_size0);
assert(out_size1 == glwe_dim * poly_size + 1);
assert((out_size0 == tlu_size0 || tlu_size0 == 1) &&

View File

@@ -16,6 +16,7 @@
#include "concretelang/Common/Protocol.h"
#include "concretelang/Common/Transformers.h"
#include "concretelang/Common/Values.h"
#include "concretelang/Runtime/DFRuntime.hpp"
#include "concretelang/Runtime/context.h"
#include "concretelang/ServerLib/ServerLib.h"
#include "concretelang/Support/CompilerEngine.h"
@@ -355,6 +356,7 @@ DynamicModule::open(const std::string &sharedLibPath) {
if (!module->libraryHandle) {
return StringError("Cannot open shared library ") << dlerror();
}
mlir::concretelang::dfr::_dfr_register_lib(module->libraryHandle);
return module;
}
@@ -430,6 +432,13 @@ bool getGateIsSigned(const Message<concreteprotocol::GateInfo> &gateInfo) {
Result<std::vector<TransportValue>>
ServerCircuit::call(const ServerKeyset &serverKeyset,
std::vector<TransportValue> &args) {
std::vector<TransportValue> returns(returnsBuffer.size());
mlir::concretelang::dfr::_dfr_register_lib(dynamicModule->libraryHandle);
if (!mlir::concretelang::dfr::_dfr_is_root_node()) {
mlir::concretelang::dfr::_dfr_run_remote_scheduler();
return returns;
}
if (args.size() != argsBuffer.size()) {
return StringError("Called circuit with wrong number of arguments");
}
@@ -444,7 +453,6 @@ ServerCircuit::call(const ServerKeyset &serverKeyset,
invoke(serverKeyset);
// We process the return values to turn them into transport values.
std::vector<TransportValue> returns(returnsBuffer.size());
for (size_t i = 0; i < returnsBuffer.size(); i++) {
OUTCOME_TRY(returns[i], returnTransformers[i](returnsBuffer[i]));
}

View File

@@ -63,14 +63,10 @@
#include "concretelang/Support/LLVMEmitFile.h"
#include "concretelang/Support/Pipeline.h"
#include "concretelang/Support/Utils.h"
#include <concretelang/Runtime/GPUDFG.hpp>
namespace mlir {
namespace concretelang {
// TODO: should be removed when bufferization is not related to CAPI lowering
// Control whether we should call a cpu of gpu function when lowering
// to CAPI
static bool EMIT_GPU_OPS;
bool getEmitGPUOption() { return EMIT_GPU_OPS; }
/// Creates a new compilation context that can be shared across
/// compilation engines and results
@@ -297,9 +293,6 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
mlir::MLIRContext &mlirContext = *this->compilationContext->getMLIRContext();
// enable/disable usage of gpu functions during bufferization
EMIT_GPU_OPS = options.emitGPUOps;
auto dataflowParallelize =
options.autoParallelize || options.dataflowParallelize;
auto loopParallelize = options.autoParallelize || options.loopParallelize;
@@ -307,6 +300,56 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
if (loopParallelize)
mlir::concretelang::dfr::_dfr_set_use_omp(true);
// Sanity checks for enabling GPU usage: the compiler must have been
// compiled with Cuda support (especially important when building
// python wheels), and at least one device must be available to
// execute on.
if (options.emitGPUOps) {
      // If this compiler was not built with Cuda support, requesting
      // GPU is forbidden - instead of a hard error, issue a warning
      // and disable the GPU option.
if (!mlir::concretelang::gpu_dfg::check_cuda_runtime_enabled()) {
// Allow compilation to complete if only code generation is expected.
if (target != Target::LIBRARY) {
warnx("This instance of the Concrete compiler does not support GPU "
"acceleration."
" Allowing code generation to proceed, but execution will not be "
"possible.");
} else {
warnx("This instance of the Concrete compiler does not support GPU "
"acceleration."
" If you are using Concrete-Python, it means that the module "
"installed is not GPU enabled.\n"
"Continuing without GPU acceleration.");
options.emitGPUOps = false;
options.emitSDFGOps = false;
options.batchTFHEOps = false;
}
} else {
// Ensure that at least one Cuda device is available if GPU option
// is used
if (!mlir::concretelang::gpu_dfg::check_cuda_device_available()) {
warnx("No Cuda device available on this system (either not present or "
"the driver is not online).\n"
"Continuing without GPU acceleration.");
options.emitGPUOps = false;
options.emitSDFGOps = false;
options.batchTFHEOps = false;
}
}
// Finally for now we cannot allow dataflow parallelization at the
// same time as GPU usage. This restriction will be relaxed later.
if (dataflowParallelize) {
warnx("Dataflow parallelization and GPU offloading have both been "
"requested. This is not currently supported. Continuing without "
"dataflow parallelization.");
dataflowParallelize = false;
}
}
  // If dataflow parallelization will proceed, mark the runtime as
  // required so that it gets initialised.
if (dataflowParallelize)
mlir::concretelang::dfr::_dfr_set_required(true);
@@ -594,8 +637,11 @@ CompilerEngine::compile(mlir::ModuleOp moduleOp, Target target,
}
}
// Restrict direct lowering when already generating GPU code through
// the SDFG dialect.
bool lowerDirectlyToGPUOps = (options.emitGPUOps && !options.emitSDFGOps);
if (mlir::concretelang::pipeline::lowerToCAPI(mlirContext, module, enablePass,
options.emitGPUOps)
lowerDirectlyToGPUOps)
.failed()) {
return StreamStringError("Failed to lower to CAPI");
}

View File

@@ -47,8 +47,10 @@
#include "concretelang/Dialect/FHELinalg/Transforms/Tiling.h"
#include "concretelang/Dialect/RT/Analysis/Autopar.h"
#include "concretelang/Dialect/RT/Transforms/Passes.h"
#include "concretelang/Dialect/SDFG/Transforms/Passes.h"
#include "concretelang/Dialect/TFHE/Analysis/ExtractStatistics.h"
#include "concretelang/Dialect/TFHE/Transforms/Transforms.h"
#include "concretelang/Runtime/utils.h"
#include "concretelang/Support/CompilerEngine.h"
#include "concretelang/Support/Error.h"
#include "concretelang/Support/Pipeline.h"
@@ -361,6 +363,7 @@ mlir::LogicalResult batchTFHE(mlir::MLIRContext &context,
pm, mlir::concretelang::createCollapseParallelLoops(), enablePass);
addPotentiallyNestedPass(
pm, mlir::concretelang::createBatchingPass(maxBatchSize), enablePass);
addPotentiallyNestedPass(pm, mlir::createCanonicalizerPass(), enablePass);
return pm.run(module.getOperation());
}
@@ -569,6 +572,8 @@ mlir::LogicalResult lowerToStd(mlir::MLIRContext &context,
enablePass);
addPotentiallyNestedPass(
pm, mlir::concretelang::createFixupBufferDeallocationPass(), enablePass);
addPotentiallyNestedPass(
pm, mlir::concretelang::createSDFGBufferOwnershipPass(), enablePass);
return pm.run(module);
}
@@ -610,8 +615,7 @@ std::unique_ptr<llvm::Module>
lowerLLVMDialectToLLVMIR(mlir::MLIRContext &context,
llvm::LLVMContext &llvmContext,
mlir::ModuleOp &module) {
llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();
mlir::concretelang::LLVMInitializeNativeTarget();
mlir::registerLLVMDialectTranslation(*module->getContext());
mlir::registerOpenMPDialectTranslation(*module->getContext());

View File

@@ -23,39 +23,26 @@ public:
matchAndRewrite(mlir::scf::ForOp forOp,
mlir::PatternRewriter &rewriter) const override {
auto attr = forOp->getAttrOfType<mlir::BoolAttr>("parallel");
if (attr == nullptr) {
if (!attr || !attr.getValue()) {
return mlir::failure();
}
assert(forOp.getRegionIterArgs().size() == 0 &&
"unexpecting iter args when loops are bufferized");
if (attr.getValue()) {
rewriter.replaceOpWithNewOp<mlir::scf::ParallelOp>(
forOp, mlir::ValueRange{forOp.getLowerBound()},
mlir::ValueRange{forOp.getUpperBound()}, forOp.getStep(),
std::nullopt,
[&](mlir::OpBuilder &builder, mlir::Location location,
mlir::ValueRange indVar, mlir::ValueRange iterArgs) {
mlir::IRMapping map;
map.map(forOp.getInductionVar(), indVar.front());
for (auto &op : forOp.getRegion().front()) {
auto newOp = builder.clone(op, map);
map.map(op.getResults(), newOp->getResults());
}
});
} else {
rewriter.replaceOpWithNewOp<mlir::scf::ForOp>(
forOp, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep(),
std::nullopt,
[&](mlir::OpBuilder &builder, mlir::Location location,
mlir::Value indVar, mlir::ValueRange iterArgs) {
mlir::IRMapping map;
map.map(forOp.getInductionVar(), indVar);
for (auto &op : forOp.getRegion().front()) {
auto newOp = builder.clone(op, map);
map.map(op.getResults(), newOp->getResults());
}
});
}
rewriter.replaceOpWithNewOp<mlir::scf::ParallelOp>(
forOp, mlir::ValueRange{forOp.getLowerBound()},
mlir::ValueRange{forOp.getUpperBound()}, forOp.getStep(), std::nullopt,
[&](mlir::OpBuilder &builder, mlir::Location location,
mlir::ValueRange indVar, mlir::ValueRange iterArgs) {
mlir::IRMapping map;
map.map(forOp.getInductionVar(), indVar.front());
for (auto &op : forOp.getRegion().front()) {
auto newOp = builder.clone(op, map);
map.map(op.getResults(), newOp->getResults());
}
});
return mlir::success();
}

View File

@@ -122,8 +122,7 @@ func.func @batch_offset_extract_keyswitch(%arg0: tensor<99x2x3x4x99x99x!TFHE.glw
%c97 = arith.constant 97 : index
%0 = bufferization.alloc_tensor() : tensor<2x3x4x!TFHE.glwe<sk<1,1,750>>>
// CHECK: %[[VDROP1DIMS:.*]] = tensor.collapse_shape [[ARG:.*]] {{\[\[0, 1\], \[2\], \[3, 4, 5\]\]}} : tensor<1x2x3x4x1x1x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<2x3x4x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK: %[[V0:.*]] = tensor.collapse_shape %[[VDROP1DIMS]] {{\[\[0, 1, 2\]\]}} : tensor<2x3x4x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<24x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK: %[[V0:.*]] = tensor.collapse_shape %[[SLICE:.*]] {{\[\[0, 1, 2, 3, 4, 5\]\]}} : tensor<1x2x3x4x1x1x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<24x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK: %[[V1:.*]] = "TFHE.batched_keyswitch_glwe"(%[[V0]]) {key = #TFHE<ksk{{\[}}[[KSK:.*]]{{\]}}<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>, sk{{\[}}[[SK_OUT]]{{\]}}<1,750>, 3, 4>>} : (tensor<24x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>) -> tensor<24x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>>
// CHECK: %[[V2:.*]] = tensor.expand_shape %[[V1]] {{\[\[0, 1, 2\]\]}} : tensor<24x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>> into tensor<2x3x4x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>>
// CHECK: return %[[V2]]
@@ -161,8 +160,7 @@ func.func @batch_offset_shifted_bounds_nonunitstep_extract_keyswitch(%arg0: tens
%0 = bufferization.alloc_tensor() : tensor<2x2x2x!TFHE.glwe<sk<1,1,750>>>
// CHECK: %[[V1:.*]] = tensor.extract_slice %arg0{{\[0, 3, 7, 9, 97, 1\] \[1, 2, 2, 2, 1, 1\] \[1, 2, 1, 7, 1, 1\]}} : tensor<99x20x30x40x99x99x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> to tensor<1x2x2x2x1x1x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK-NEXT: %[[V2:.*]] = tensor.collapse_shape %[[V1]] {{\[\[0, 1\], \[2\], \[3, 4, 5\]\]}} : tensor<1x2x2x2x1x1x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<2x2x2x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK-NEXT: %[[V3:.*]] = tensor.collapse_shape %[[V2]] {{\[\[0, 1, 2\]\]}} : tensor<2x2x2x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<8x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK-NEXT: %[[V3:.*]] = tensor.collapse_shape %[[V1]] {{\[\[0, 1, 2, 3, 4, 5\]\]}} : tensor<1x2x2x2x1x1x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>> into tensor<8x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>
// CHECK-NEXT: %[[V4:.*]] = "TFHE.batched_keyswitch_glwe"(%[[V3]]) {key = #TFHE<ksk{{\[}}[[KSK:.*]]{{\]}}<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>, sk{{\[}}[[SK_OUT]]{{\]}}<1,750>, 3, 4>>} : (tensor<8x!TFHE.glwe<sk{{\[}}[[SK_IN]]{{\]}}<1,2048>>>) -> tensor<8x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>>
// CHECK-NEXT: %[[V5:.*]] = tensor.expand_shape %[[V4]] {{\[\[0, 1, 2\]\]}} : tensor<8x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>> into tensor<2x2x2x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>>
// CHECK-NEXT: return %[[V5]] : tensor<2x2x2x!TFHE.glwe<sk{{\[}}[[SK_OUT]]{{\]}}<1,750>>>

View File

@@ -0,0 +1,18 @@
// RUN: concretecompiler --split-input-file --action=dump-std --parallelize --parallelize-loops --skip-program-info --passes=for-loop-to-parallel %s 2>&1| FileCheck %s
func.func @bar() -> () {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%i0 = arith.constant 0 : i32
%i1 = arith.constant 1 : i32
// CHECK-NOT: scf.parallel
%0 = scf.for %iv = %c0 to %c4 step %c1 iter_args(%ia = %i0) -> i32 {
"Tracing.trace_plaintext"(%i0) : (i32) -> ()
%yld = arith.addi %ia, %i1 : i32
scf.yield %yld : i32
} {"parallel" = false }
return
}

View File

@@ -54,15 +54,13 @@ static void BM_ExportArguments(benchmark::State &state,
inputArguments.reserve(test.inputs.size());
auto client = tc.getClientCircuit().value();
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
for (auto _ : state) {
for (size_t i = 0; i < test.inputs.size(); i++) {
auto input = client.prepareInput(test.inputs[i].getValue(), i).value();
inputArguments.push_back(input);
}
for (auto _ : state) {
for (size_t i = 0; i < test.inputs.size(); i++) {
auto input = client.prepareInput(test.inputs[i].getValue(), i).value();
inputArguments.push_back(input);
}
inputArguments.resize(0);
}
inputArguments.resize(0);
}
/// Benchmark time of the program evaluation
@@ -78,12 +76,10 @@ static void BM_Evaluate(benchmark::State &state, EndToEndDesc description,
auto inputArguments = std::vector<TransportValue>();
inputArguments.reserve(test.inputs.size());
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
for (size_t i = 0; i < test.inputs.size(); i++) {
auto input =
clientCircuit.prepareInput(test.inputs[i].getValue(), i).value();
inputArguments.push_back(input);
}
for (size_t i = 0; i < test.inputs.size(); i++) {
auto input =
clientCircuit.prepareInput(test.inputs[i].getValue(), i).value();
inputArguments.push_back(input);
}
auto serverCircuit = tc.getServerCircuit().value();

View File

@@ -26,14 +26,14 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-2, -1, -1, -2, -2, -1, -1, -1, -1, -1, -2, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -2, -2, -2]
- tensor: [-1, -1, -2, -2, -1, -2, -2, -1, -2, -1, -2, -1, -1, -2, -2, -2, -2, -1, -2, -1, -1, -2, -1, -1]
shape: [2,3,4]
signed: True
- tensor: [-2, -1, -2, -1, -1, -1, -2, -2, -1, -2, -2, -1, -1, -1, -2, -1]
- tensor: [-1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -2, -1, -2, -1, -2]
shape: [2,4,2]
signed: True
outputs:
- tensor: [11, 8, 9, 6, 8, 6, 9, 7, 6, 5, 12, 10]
- tensor: [8, 8, 8, 8, 8, 8, 9, 13, 7, 10, 7, 9]
shape: [2,3,2]
signed: True
---
@@ -63,14 +63,14 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-1, -2, -2, -1, -2, -1, -2, -2, -1, -1, -1, -1]
- tensor: [-2, -1, -1, -1, -1, -2, -1, -1, -1, -1, -2, -2]
shape: [3,4]
signed: True
- tensor: [-2, -2, -2, -1, -1, -2, -1, -1]
- tensor: [-2, -2, -1, -1, -1, -2, -2, -2]
shape: [4,2]
signed: True
outputs:
- tensor: [9, 9, 10, 11, 6, 6]
- tensor: [8, 9, 7, 8, 9, 11]
shape: [3,2]
signed: True
---
@@ -100,14 +100,14 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-1, -2, -1]
- tensor: [-1, -2, -2]
shape: [3]
signed: True
- tensor: [-2, -1, -2, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -1, -2, -2, -1, -1, -1, -2, -2, -1]
- tensor: [-2, -2, -1, -2, -1, -1, -2, -1, -2, -2, -1, -2, -2, -2, -2, -1, -1, -2, -1, -1, -1, -1, -2, -1]
shape: [4,3,2]
signed: True
outputs:
- tensor: [8, 7, 6, 5, 8, 6, 5, 6]
- tensor: [6, 8, 8, 9, 8, 8, 7, 5]
shape: [4,2]
signed: True
---
@@ -137,14 +137,14 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-1, -2, -1, -2, -2, -2, -2, -1, -2, -2, -2, -2, -1, -2, -1, -1, -2, -2, -1, -1, -2, -2, -2, -2]
- tensor: [-2, -1, -1, -2, -2, -1, -1, -2, -1, -2, -2, -2, -1, -2, -2, -2, -2, -1, -1, -2, -2, -2, -1, -1]
shape: [2,3,4]
signed: True
- tensor: [-1, -2, -1, -1]
- tensor: [-2, -2, -2, -2]
shape: [4]
signed: True
outputs:
- tensor: [8, 9, 10, 7, 8, 10]
- tensor: [12, 12, 14, 14, 12, 12]
shape: [2,3]
signed: True
---
@@ -174,14 +174,14 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-2, -1, -2, -1, -1, -1, -2, -2, -2, -2, -2, -1, -2, -1, -2, -1, -1, -1, -1, -2, -2, -1, -2, -2]
- tensor: [-2, -2, -2, -1, -1, -1, -2, -1, -1, -2, -2, -2, -1, -1, -2, -2, -2, -1, -1, -1, -1, -2, -1, -2]
shape: [2,1,3,4]
signed: True
- tensor: [-2, -1, -2, -2, -2, -1, -1, -1, -2, -2, -1, -1, -2, -2, -2, -1, -2, -1, -2, -2, -2, -1, -2, -2, -1, -2, -2, -2, -2, -2, -1, -1, -1, -2, -2, -1, -2, -1, -1, -1]
- tensor: [-2, -1, -2, -1, -2, -2, -1, -2, -1, -2, -2, -2, -2, -2, -1, -2, -1, -1, -1, -2, -2, -1, -2, -2, -2, -2, -2, -1, -2, -1, -1, -2, -1, -2, -1, -1, -1, -1, -2, -1]
shape: [5,4,2]
signed: True
outputs:
- tensor: [11, 7, 10, 7, 13, 9, 11, 10, 11, 9, 12, 11, 12, 8, 12, 9, 14, 10, 9, 11, 9, 10, 11, 13, 9, 8, 9, 7, 11, 9, 11, 7, 8, 6, 12, 8, 11, 10, 9, 7, 13, 11, 12, 8, 10, 8, 14, 10, 9, 11, 7, 8, 10, 12, 9, 8, 7, 6, 10, 9]
- tensor: [13, 10, 9, 8, 12, 11, 11, 14, 8, 10, 11, 14, 10, 10, 8, 7, 11, 11, 13, 10, 9, 7, 12, 10, 8, 9, 6, 6, 9, 8, 10, 10, 9, 7, 10, 9, 9, 12, 7, 10, 9, 12, 10, 9, 7, 7, 9, 10, 10, 9, 9, 8, 10, 9, 8, 7, 6, 7, 8, 7]
shape: [2,5,3,2]
signed: True
---
@@ -212,10 +212,10 @@ program: |
p-error: 1e-06
tests:
- inputs:
- tensor: [-1, -2, -2]
- tensor: [-2, -2, -1]
shape: [3]
signed: True
- tensor: [-1, -1, -2]
- tensor: [-2, -1, -1]
shape: [3]
signed: True
outputs:

View File

@@ -28,9 +28,3 @@ add_concretecompiler_unittest(end_to_end_jit_test end_to_end_jit_test.cc globals
add_concretecompiler_unittest(end_to_end_test end_to_end_test.cc globals.cc)
add_concretecompiler_unittest(end_to_end_jit_lambda end_to_end_jit_lambda.cc globals.cc)
if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
add_concretecompiler_unittest(end_to_end_jit_auto_parallelization end_to_end_jit_auto_parallelization.cc globals.cc)
add_concretecompiler_unittest(end_to_end_jit_distributed end_to_end_jit_distributed.cc globals.cc)
add_concretecompiler_unittest(end_to_end_jit_aes_short end_to_end_jit_aes_short.cc globals.cc)
endif()

View File

@@ -1,167 +0,0 @@
#include <concretelang/Runtime/DFRuntime.hpp>
#include <cstdint>
#include <gtest/gtest.h>
#include <tuple>
#include <type_traits>
#include "concretelang/TestLib/TestProgram.h"
#include "end_to_end_jit_test.h"
#include "tests_tools/GtestEnvironment.h"
///////////////////////////////////////////////////////////////////////////////
// Auto-parallelize independent FHE ops /////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
TEST(ParallelizeAndRunFHE, add_eint_tree) {
checkedJit(testCircuit, R"XXX(
func.func @main(%arg0: !FHE.eint<7>, %arg1: !FHE.eint<7>, %arg2: !FHE.eint<7>, %arg3: !FHE.eint<7>) -> !FHE.eint<7> {
%1 = "FHE.add_eint"(%arg0, %arg1): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%2 = "FHE.add_eint"(%arg0, %arg2): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%3 = "FHE.add_eint"(%arg0, %arg3): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%4 = "FHE.add_eint"(%arg1, %arg2): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%5 = "FHE.add_eint"(%arg1, %arg3): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%6 = "FHE.add_eint"(%arg2, %arg3): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%7 = "FHE.add_eint"(%1, %2): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%8 = "FHE.add_eint"(%1, %3): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%9 = "FHE.add_eint"(%1, %4): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%10 = "FHE.add_eint"(%1, %5): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%11 = "FHE.add_eint"(%1, %6): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%12 = "FHE.add_eint"(%2, %3): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%13 = "FHE.add_eint"(%2, %4): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%14 = "FHE.add_eint"(%2, %5): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%15 = "FHE.add_eint"(%2, %6): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%16 = "FHE.add_eint"(%3, %4): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%17 = "FHE.add_eint"(%3, %5): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%18 = "FHE.add_eint"(%3, %6): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%19 = "FHE.add_eint"(%4, %5): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%20 = "FHE.add_eint"(%4, %6): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%21 = "FHE.add_eint"(%5, %6): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%22 = "FHE.add_eint"(%7, %8): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%23 = "FHE.add_eint"(%9, %10): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%24 = "FHE.add_eint"(%11, %12): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%25 = "FHE.add_eint"(%13, %14): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%26 = "FHE.add_eint"(%15, %16): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%27 = "FHE.add_eint"(%17, %18): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%28 = "FHE.add_eint"(%19, %20): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%29 = "FHE.add_eint"(%22, %23): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%30 = "FHE.add_eint"(%24, %25): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%31 = "FHE.add_eint"(%26, %27): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%32 = "FHE.add_eint"(%21, %28): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%33 = "FHE.add_eint"(%29, %30): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%34 = "FHE.add_eint"(%31, %32): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
%35 = "FHE.add_eint"(%33, %34): (!FHE.eint<7>, !FHE.eint<7>) -> (!FHE.eint<7>)
return %35: !FHE.eint<7>
}
)XXX",
"main", false, true, false, false, 1e-40);
auto lambda = [&](std::vector<concretelang::values::Value> args) {
return testCircuit.call(args)
.value()[0]
.template getTensor<uint64_t>()
.value()[0];
};
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
ASSERT_EQ(lambda({Tensor<uint64_t>(1), Tensor<uint64_t>(2),
Tensor<uint64_t>(3), Tensor<uint64_t>(4)}),
(uint64_t)150);
ASSERT_EQ(lambda({Tensor<uint64_t>(4), Tensor<uint64_t>(5),
Tensor<uint64_t>(6), Tensor<uint64_t>(7)}),
(uint64_t)74);
ASSERT_EQ(lambda({Tensor<uint64_t>(1), Tensor<uint64_t>(1),
Tensor<uint64_t>(1), Tensor<uint64_t>(1)}),
(uint64_t)60);
ASSERT_EQ(lambda({Tensor<uint64_t>(5), Tensor<uint64_t>(7),
Tensor<uint64_t>(11), Tensor<uint64_t>(13)}),
(uint64_t)28);
} else {
ASSERT_OUTCOME_HAS_FAILURE(testCircuit.call({}));
ASSERT_OUTCOME_HAS_FAILURE(testCircuit.call({}));
ASSERT_OUTCOME_HAS_FAILURE(testCircuit.call({}));
ASSERT_OUTCOME_HAS_FAILURE(testCircuit.call({}));
}
}
std::vector<uint64_t> parallel_results;
TEST(ParallelizeAndRunFHE, nn_small_parallel) {
checkedJit(lambda, R"XXX(
func.func @main(%arg0: tensor<4x5x!FHE.eint<5>>) -> tensor<4x7x!FHE.eint<5>> {
%cst = arith.constant dense<[[0, 0, 1, 0, 1, 1, 0], [1, 1, 1, 0, 1, 0, 0], [1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1]]> : tensor<4x7xi6>
%cst_0 = arith.constant dense<[[1, 0, 1, 1, 0, 1, 1], [0, 1, 0, 0, 0, 0, 1], [0, 1, 1, 1, 1, 0, 0], [0, 1, 1, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 1]]> : tensor<5x7xi6>
%0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : (tensor<4x5x!FHE.eint<5>>, tensor<5x7xi6>) -> tensor<4x7x!FHE.eint<5>>
%1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<4x7x!FHE.eint<5>>, tensor<4x7xi6>) -> tensor<4x7x!FHE.eint<5>>
%cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52, 56, 59, 63, 66, 70, 73, 77, 80, 84, 87, 91, 94, 98, 101, 105, 108]> : tensor<32xi64>
%2 = "FHELinalg.apply_lookup_table"(%1, %cst_1) : (tensor<4x7x!FHE.eint<5>>, tensor<32xi64>) -> tensor<4x7x!FHE.eint<5>>
return %2 : tensor<4x7x!FHE.eint<5>>
}
)XXX",
"main", false, true, true);
const size_t dim0 = 4;
const size_t dim1 = 5;
const size_t dim2 = 7;
const std::vector<size_t> inputShape({dim0, dim1});
const std::vector<size_t> outputShape({dim0, dim2});
std::vector<uint64_t> values;
values.reserve(dim0 * dim1);
for (size_t i = 0; i < dim0 * dim1; ++i) {
values.push_back(i % 17 % 4);
}
auto input = Tensor<uint64_t>(values, inputShape);
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
auto maybeResult = lambda.call({input});
ASSERT_OUTCOME_HAS_VALUE(maybeResult);
auto result = maybeResult.value()[0].template getTensor<uint64_t>().value();
ASSERT_EQ(result.dimensions, outputShape);
parallel_results = result.values;
} else {
ASSERT_OUTCOME_HAS_FAILURE(lambda.call({}));
}
}
TEST(ParallelizeAndRunFHE, nn_small_sequential) {
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
checkedJit(lambda, R"XXX(
func.func @main(%arg0: tensor<4x5x!FHE.eint<5>>) -> tensor<4x7x!FHE.eint<5>> {
%cst = arith.constant dense<[[0, 0, 1, 0, 1, 1, 0], [1, 1, 1, 0, 1, 0, 0], [1, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 1]]> : tensor<4x7xi6>
%cst_0 = arith.constant dense<[[1, 0, 1, 1, 0, 1, 1], [0, 1, 0, 0, 0, 0, 1], [0, 1, 1, 1, 1, 0, 0], [0, 1, 1, 0, 0, 0, 0], [0, 1, 1, 0, 0, 0, 1]]> : tensor<5x7xi6>
%0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : (tensor<4x5x!FHE.eint<5>>, tensor<5x7xi6>) -> tensor<4x7x!FHE.eint<5>>
%1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<4x7x!FHE.eint<5>>, tensor<4x7xi6>) -> tensor<4x7x!FHE.eint<5>>
%cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52, 56, 59, 63, 66, 70, 73, 77, 80, 84, 87, 91, 94, 98, 101, 105, 108]> : tensor<32xi64>
%2 = "FHELinalg.apply_lookup_table"(%1, %cst_1) : (tensor<4x7x!FHE.eint<5>>, tensor<32xi64>) -> tensor<4x7x!FHE.eint<5>>
return %2 : tensor<4x7x!FHE.eint<5>>
}
)XXX",
"main", false, false, false);
const size_t dim0 = 4;
const size_t dim1 = 5;
const size_t dim2 = 7;
const std::vector<size_t> inputShape({dim0, dim1});
const std::vector<size_t> outputShape({dim0, dim2});
std::vector<uint64_t> values;
values.reserve(dim0 * dim1);
for (size_t i = 0; i < dim0 * dim1; ++i) {
values.push_back(i % 17 % 4);
}
auto input = Tensor<uint64_t>(values, inputShape);
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
auto maybeResult = lambda.call({input});
ASSERT_OUTCOME_HAS_VALUE(maybeResult);
auto result =
maybeResult.value()[0].template getTensor<uint64_t>().value();
for (size_t i = 0; i < dim0 * dim2; i++)
EXPECT_EQ(parallel_results[i], result.values[i])
<< "result differ at pos " << i;
}
}
}

View File

@@ -1,148 +0,0 @@
#include <concretelang/Runtime/DFRuntime.hpp>
#include <cstdint>
#include <gtest/gtest.h>
#include <tuple>
#include <type_traits>
#include "concretelang/TestLib/TestProgram.h"
#include "end_to_end_jit_test.h"
#include "tests_tools/GtestEnvironment.h"
///////////////////////////////////////////////////////////////////////////////
// Auto-parallelize independent FHE ops /////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
std::vector<uint64_t> distributed_results;
TEST(Distributed, nn_med_nested) {
checkedJit(lambda, R"XXX(
func.func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>> {
%cst = arith.constant dense<"0x01010100010100000001010101000101010101010101010001000101000001010001010100000101000001000001010001000001010100010001000000010100010001010001000001000101010101000100010001000000000100010001000101000001000101010100010001000000000101000100000000000001000100000100000100000001010000010001000101000100010001000100000100000100010101010000000000000000010001010000000100000100010100000100000000010001000101000100000000000101010101000101010101010100010100010100000000000101010100000100010100000001000101000000010101000101000100000101010100010101010000010101010100010000000000000001010101000100010101000001010001010000010001010101000000000000000001000001000000010100000100000101010100010001000000000000010100010101000000010100000100010001010001000000000100010001000101010100010100000001010100010101010100010100010001000001000000000101000101010001000100000101010100000101010100000100010101000100000101000101010100010001000101010100010001010001010000010000010001010000000001000101010001000000000101000000010000010100010001000001000001010101000100010001010100000101000000010001000000000101000101000000010000000001000101010100010001000000000001010000010001000001010101000101010101010100000000000001000100000100000001000000010101010101000000000101010101000100000101000100000000000001000100000101000101010100010000000101000000000100000100000101010000010100000000010000000000010001000100000101010001010101000000000000010000010101010001000000010001010001010000000000000101000000010101010101000001010101000001000001010100000000010001010100000100000101000101010100010001010001000001000100000101000100010100000100010000000101000000010000010001010101010000000101000000010101000001010100000100010001000000000001010000000100010000000000000000000000000001010101010101010101000001010101000001010100000001000101010101010000010101000101010100010101010000010101010100000100000000000101010000000000010101010000000001000000010100000100000001000101010000000001000001000001010001010000010001000101010001010001010101000100010000000100000100010101000000000101010101010001000100000000000101010000010101000001010001010000000001010100000101000001010000000001010101000100010000010101000000000001000101000001010101000101000001000001000000010100010001000101010100010001010000000101000000010001000001000100000101010001000001000001000101010000010001000001000101000000000000000101010000010000000101010100010100010001010101010000000000010001000101010000000001010100000000010001010100010001000001000101000000010100010000010000010001010100010000010001010100010000010100010101010001000100010100010101000100000101010100000100010100000100000000010101000000010001000001010000000101000100000100010101000000010100000101000001010001010100010000000101010000000001010001000000010100010101010001000100010001000001010101000000010001000100000100010101000000000000010100010000000100000000010100010000000100000101010000010101000100010000010100000001000100000000000100000001010101010101000100010001000000010101010100000001000001000001010001000101010100000001010001010100010101000101000000010001010100010101000100000101000101000001000001000001000101010100010001010000000100000101010100000001000000000000010101000100010001000001000001000000000000010100000100000001"> : tensor<200x8xi5>
%cst_0 = arith.constant dense<[[1, 0, 0, 0, 1, 0, 0, 1], [0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 0, 0, 1, 0, 1, 1]]> : tensor<4x8xi5>
%cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52]> : tensor<16xi64>
%0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : (tensor<200x4x!FHE.eint<4>>, tensor<4x8xi5>) -> tensor<200x8x!FHE.eint<4>>
%1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<200x8x!FHE.eint<4>>, tensor<200x8xi5>) -> tensor<200x8x!FHE.eint<4>>
%res = "FHE.zero_tensor"() : () -> tensor<200x8x!FHE.eint<4>>
%slice_A = tensor.extract_slice %1[0, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_B = tensor.extract_slice %1[25, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_C = tensor.extract_slice %1[50, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_D = tensor.extract_slice %1[75, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_E = tensor.extract_slice %1[100, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_F = tensor.extract_slice %1[125, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_G = tensor.extract_slice %1[150, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_H = tensor.extract_slice %1[175, 0][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_I = tensor.extract_slice %1[0, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_J = tensor.extract_slice %1[25, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_K = tensor.extract_slice %1[50, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_L = tensor.extract_slice %1[75, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_M = tensor.extract_slice %1[100, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_N = tensor.extract_slice %1[125, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_O = tensor.extract_slice %1[150, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%slice_P = tensor.extract_slice %1[175, 4][25, 4][1, 1] : tensor<200x8x!FHE.eint<4>> to tensor<25x4x!FHE.eint<4>>
%part_A = "FHELinalg.apply_lookup_table"(%slice_A, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_B = "FHELinalg.apply_lookup_table"(%slice_B, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_C = "FHELinalg.apply_lookup_table"(%slice_C, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_D = "FHELinalg.apply_lookup_table"(%slice_D, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_E = "FHELinalg.apply_lookup_table"(%slice_E, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_F = "FHELinalg.apply_lookup_table"(%slice_F, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_G = "FHELinalg.apply_lookup_table"(%slice_G, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_H = "FHELinalg.apply_lookup_table"(%slice_H, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_I = "FHELinalg.apply_lookup_table"(%slice_I, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_J = "FHELinalg.apply_lookup_table"(%slice_J, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_K = "FHELinalg.apply_lookup_table"(%slice_K, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_L = "FHELinalg.apply_lookup_table"(%slice_L, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_M = "FHELinalg.apply_lookup_table"(%slice_M, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_N = "FHELinalg.apply_lookup_table"(%slice_N, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_O = "FHELinalg.apply_lookup_table"(%slice_O, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%part_P = "FHELinalg.apply_lookup_table"(%slice_P, %cst_1) : (tensor<25x4x!FHE.eint<4>>, tensor<16xi64>) -> tensor<25x4x!FHE.eint<4>>
%res_A = tensor.insert_slice %part_A into %res [0, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_B = tensor.insert_slice %part_B into %res_A[25, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_C = tensor.insert_slice %part_C into %res_B[50, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_D = tensor.insert_slice %part_D into %res_C[75, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_E = tensor.insert_slice %part_E into %res_D[100, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_F = tensor.insert_slice %part_F into %res_E[125, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_G = tensor.insert_slice %part_G into %res_F[150, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_H = tensor.insert_slice %part_H into %res_G[175, 0][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_I = tensor.insert_slice %part_I into %res_H[0, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_J = tensor.insert_slice %part_J into %res_I[25, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_K = tensor.insert_slice %part_K into %res_J[50, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_L = tensor.insert_slice %part_L into %res_K[75, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_M = tensor.insert_slice %part_M into %res_L[100, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_N = tensor.insert_slice %part_N into %res_M[125, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_O = tensor.insert_slice %part_O into %res_N[150, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
%res_P = tensor.insert_slice %part_P into %res_O[175, 4][25, 4][1, 1] : tensor<25x4x!FHE.eint<4>> into tensor<200x8x!FHE.eint<4>>
return %res_P : tensor<200x8x!FHE.eint<4>>
}
)XXX",
"main", false, true, true, DEFAULT_batchTFHEOps,
DEFAULT_global_p_error, DEFAULT_chunkedIntegers, DEFAULT_chunkSize,
DEFAULT_chunkWidth, false);
const size_t dim0 = 200;
const size_t dim1 = 4;
const size_t dim2 = 8;
const std::vector<size_t> inputShape({dim0, dim1});
const std::vector<size_t> outputShape({dim0, dim2});
std::vector<uint64_t> values;
values.reserve(dim0 * dim1);
for (size_t i = 0; i < dim0 * dim1; ++i) {
values.push_back(i % 17 % 4);
}
auto input = Tensor<uint64_t>(values, inputShape);
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
auto maybeResult = lambda.call({input});
ASSERT_OUTCOME_HAS_VALUE(maybeResult);
auto result = maybeResult.value()[0].template getTensor<uint64_t>().value();
ASSERT_EQ(result.dimensions, outputShape);
distributed_results = result.values;
} else {
ASSERT_OUTCOME_HAS_VALUE(lambda.call({}));
}
}
TEST(Distributed, nn_med_sequential) {
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
checkedJit(lambda, R"XXX(
func.func @main(%arg0: tensor<200x4x!FHE.eint<4>>) -> tensor<200x8x!FHE.eint<4>> {
%cst = arith.constant dense<"0x01010100010100000001010101000101010101010101010001000101000001010001010100000101000001000001010001000001010100010001000000010100010001010001000001000101010101000100010001000000000100010001000101000001000101010100010001000000000101000100000000000001000100000100000100000001010000010001000101000100010001000100000100000100010101010000000000000000010001010000000100000100010100000100000000010001000101000100000000000101010101000101010101010100010100010100000000000101010100000100010100000001000101000000010101000101000100000101010100010101010000010101010100010000000000000001010101000100010101000001010001010000010001010101000000000000000001000001000000010100000100000101010100010001000000000000010100010101000000010100000100010001010001000000000100010001000101010100010100000001010100010101010100010100010001000001000000000101000101010001000100000101010100000101010100000100010101000100000101000101010100010001000101010100010001010001010000010000010001010000000001000101010001000000000101000000010000010100010001000001000001010101000100010001010100000101000000010001000000000101000101000000010000000001000101010100010001000000000001010000010001000001010101000101010101010100000000000001000100000100000001000000010101010101000000000101010101000100000101000100000000000001000100000101000101010100010000000101000000000100000100000101010000010100000000010000000000010001000100000101010001010101000000000000010000010101010001000000010001010001010000000000000101000000010101010101000001010101000001000001010100000000010001010100000100000101000101010100010001010001000001000100000101000100010100000100010000000101000000010000010001010101010000000101000000010101000001010100000100010001000000000001010000000100010000000000000000000000000001010101010101010101000001010101000001010100000001000101010101010000010101000101010100010101010000010101010100000100000000000101010000000000010101010000000001000000010100000100000001000101010000000001000001000001010001010000010001000101010001010001010101000100010000000100000100010101000000000101010101010001000100000000000101010000010101000001010001010000000001010100000101000001010000000001010101000100010000010101000000000001000101000001010101000101000001000001000000010100010001000101010100010001010000000101000000010001000001000100000101010001000001000001000101010000010001000001000101000000000000000101010000010000000101010100010100010001010101010000000000010001000101010000000001010100000000010001010100010001000001000101000000010100010000010000010001010100010000010001010100010000010100010101010001000100010100010101000100000101010100000100010100000100000000010101000000010001000001010000000101000100000100010101000000010100000101000001010001010100010000000101010000000001010001000000010100010101010001000100010001000001010101000000010001000100000100010101000000000000010100010000000100000000010100010000000100000101010000010101000100010000010100000001000100000000000100000001010101010101000100010001000000010101010100000001000001000001010001000101010100000001010001010100010101000101000000010001010100010101000100000101000101000001000001000001000101010100010001010000000100000101010100000001000000000000010101000100010001000001000001000000000000010100000100000001"> : tensor<200x8xi5>
%cst_0 = arith.constant dense<[[1, 0, 0, 0, 1, 0, 0, 1], [0, 0, 1, 1, 0, 0, 0, 0], [1, 1, 0, 1, 1, 0, 1, 1], [1, 1, 0, 0, 1, 0, 1, 1]]> : tensor<4x8xi5>
%0 = "FHELinalg.matmul_eint_int"(%arg0, %cst_0) : (tensor<200x4x!FHE.eint<4>>, tensor<4x8xi5>) -> tensor<200x8x!FHE.eint<4>>
%1 = "FHELinalg.add_eint_int"(%0, %cst) : (tensor<200x8x!FHE.eint<4>>, tensor<200x8xi5>) -> tensor<200x8x!FHE.eint<4>>
%cst_1 = arith.constant dense<[0, 3, 7, 10, 14, 17, 21, 24, 28, 31, 35, 38, 42, 45, 49, 52]> : tensor<16xi64>
%2 = "FHELinalg.apply_lookup_table"(%1, %cst_1) : (tensor<200x8x!FHE.eint<4>>, tensor<16xi64>) -> tensor<200x8x!FHE.eint<4>>
return %2 : tensor<200x8x!FHE.eint<4>>
}
)XXX",
"main", false, false, false, DEFAULT_batchTFHEOps,
DEFAULT_global_p_error, DEFAULT_chunkedIntegers,
DEFAULT_chunkSize, DEFAULT_chunkWidth, false);
const size_t dim0 = 200;
const size_t dim1 = 4;
const size_t dim2 = 8;
const std::vector<size_t> inputShape({dim0, dim1});
const std::vector<size_t> outputShape({dim0, dim2});
std::vector<uint64_t> values;
values.reserve(dim0 * dim1);
for (size_t i = 0; i < dim0 * dim1; ++i) {
values.push_back(i % 17 % 4);
}
auto input = Tensor<uint64_t>(values, inputShape);
if (mlir::concretelang::dfr::_dfr_is_root_node()) {
auto maybeResult = lambda.call({input});
ASSERT_OUTCOME_HAS_VALUE(maybeResult);
auto result =
maybeResult.value()[0].template getTensor<uint64_t>().value();
for (size_t i = 0; i < dim0 * dim2; i++)
EXPECT_EQ(distributed_results[i], result.values[i])
<< "result differ at pos " << i;
}
}
}

View File

@@ -1,23 +0,0 @@
#!/bin/bash
#SBATCH --job-name=end_to_end_jit_distributed
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user=antoniu.pop@zama.ai
#SBATCH --nodes=4
#SBATCH --cpus-per-task=8
#SBATCH --time=00:45:00
#SBATCH --output=end_to_end_jit_distributed_%j.log
echo "Date = $(date)"
echo "Hostname = $(hostname -s)"
echo "Working Directory = $(pwd)"
echo ""
echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES"
echo "Number of Tasks Allocated = $SLURM_NTASKS"
echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK"
export OMP_NUM_THREADS=8
export DFR_NUM_THREADS=2
srun ./build/bin/end_to_end_jit_distributed
date

View File

@@ -407,8 +407,10 @@ func.func @main(%arg0: !FHE.eint<3>) -> !FHE.eint<3> {
}
)XXX");
ASSERT_OUTCOME_HAS_FAILURE_WITH_ERRORMSG(
err, "Program can not be composed: Dag is not composable, because of "
"output 1: Partition 0 has input coefficient 4");
err, "Program can not be composed: At -:4:8: please add "
"`fhe.refresh(...)` to guarantee the function composability.\n"
"The noise of the node 0 is contaminated by noise coming straight "
"from the input (partition: 0, coeff: 4.00).");
}
TEST(CompileNotComposable, not_composable_2) {
@@ -428,8 +430,10 @@ func.func @main(%arg0: !FHE.eint<3>) -> (!FHE.eint<3>, !FHE.eint<3>) {
}
)XXX");
ASSERT_OUTCOME_HAS_FAILURE_WITH_ERRORMSG(
err, "Program can not be composed: Dag is not composable, because of "
"output 1: Partition 0 has input coefficient 4");
err, "Program can not be composed: At -:5:8: please add "
"`fhe.refresh(...)` to guarantee the function composability.\n"
"The noise of the node 0 is contaminated by noise coming straight "
"from the input (partition: 0, coeff: 4.00).");
}
TEST(CompileComposable, composable_supported_v0) {

View File

@@ -6,7 +6,6 @@
#include <type_traits>
#include "concretelang/Common/Values.h"
#include "concretelang/Runtime/DFRuntime.hpp"
#include "concretelang/Support/CompilationFeedback.h"
#include "concretelang/TestLib/TestProgram.h"
#include "end_to_end_fixture/EndToEndFixture.h"
@@ -59,18 +58,10 @@ public:
void testOnce() {
for (auto tests_rep = 0; tests_rep <= options.numberOfRetry; tests_rep++) {
// We execute the circuit.
auto maybeRes =
testCircuit->call((mlir::concretelang::dfr::_dfr_is_root_node())
? args
: std::vector<Value>());
if (!mlir::concretelang::dfr::_dfr_is_root_node())
return;
auto maybeRes = testCircuit->call(args);
ASSERT_OUTCOME_HAS_VALUE(maybeRes);
auto result = maybeRes.value();
if (!mlir::concretelang::dfr::_dfr_is_root_node())
return;
/* Check results */
bool allgood = true;
for (size_t i = 0; i < desc.outputs.size(); i++) {

View File

@@ -36,16 +36,16 @@ Result<TestProgram> setupTestProgram(std::string source,
auto outputs = circuitEncoding.asBuilder().initOutputs(1);
circuitEncoding.asBuilder().setName(funcname);
auto encodingInfo = Message<concreteprotocol::EncodingInfo>().asBuilder();
encodingInfo.initShape();
auto integer = encodingInfo.getEncoding().initIntegerCiphertext();
auto encodingInfo = Message<concreteprotocol::EncodingInfo>();
encodingInfo.asBuilder().initShape();
auto integer = encodingInfo.asBuilder().getEncoding().initIntegerCiphertext();
integer.getMode().initNative();
integer.setWidth(3);
integer.setIsSigned(false);
inputs.setWithCaveats(0, encodingInfo);
inputs.setWithCaveats(1, encodingInfo);
outputs.setWithCaveats(0, encodingInfo);
inputs.setWithCaveats(0, encodingInfo.asReader());
inputs.setWithCaveats(1, encodingInfo.asReader());
outputs.setWithCaveats(0, encodingInfo.asReader());
options.encodings = Message<concreteprotocol::ProgramEncodingInfo>();
options.encodings->asBuilder().initCircuits(1).setWithCaveats(
@@ -61,14 +61,14 @@ Result<TestProgram> setupTestProgram(std::string source,
TEST(Encodings_unit_tests, multi_key) {
std::string source = R"(
func.func @main(
%arg0: !TFHE.glwe<sk<1,1,2048>>,
%arg0: !TFHE.glwe<sk<1,1,2048>>,
%arg1: !TFHE.glwe<sk<2,1,2048>>
) -> !TFHE.glwe<sk<2,1,2048>> {
%0 = "TFHE.keyswitch_glwe"(%arg0) {key=#TFHE.ksk<sk<1,1,2048>, sk<2, 1,2048>, 7, 2>} : (!TFHE.glwe<sk<1, 1, 2048>>) -> !TFHE.glwe<sk<2, 1, 2048>>
%1 = "TFHE.add_glwe"(%arg1, %0) : (!TFHE.glwe<sk<2,1,2048>>, !TFHE.glwe<sk<2,1,2048>>) -> !TFHE.glwe<sk<2,1,2048>>
return %1 : !TFHE.glwe<sk<2,1,2048>>
}
)";
ASSERT_ASSIGN_OUTCOME_VALUE(circuit, setupTestProgram(source));

View File

@@ -1,3 +1,6 @@
#![allow(clippy::boxed_local)]
#![allow(clippy::too_many_arguments)]
use concrete_optimizer::computing_cost::cpu::CpuComplexity;
use concrete_optimizer::config;
use concrete_optimizer::config::ProcessingUnit;
@@ -42,7 +45,7 @@ fn caches_from(options: ffi::Options) -> decomposition::PersistDecompCaches {
decomposition::cache(
options.security_level,
processing_unit,
Some(ProcessingUnit::Cpu.complexity_model()),
Some(processing_unit.complexity_model()),
options.cache_on_disk,
options.ciphertext_modulus_log,
options.fft_precision,
@@ -612,12 +615,19 @@ impl Dag {
pub struct DagBuilder<'dag>(unparametrized::DagBuilder<'dag>);
impl<'dag> DagBuilder<'dag> {
fn add_input(&mut self, out_precision: Precision, out_shape: &[u64]) -> ffi::OperatorIndex {
fn add_input(
&mut self,
out_precision: Precision,
out_shape: &[u64],
location: &Location,
) -> ffi::OperatorIndex {
let out_shape = Shape {
dimensions_size: out_shape.to_owned(),
};
self.0.add_input(out_precision, out_shape).into()
self.0
.add_input(out_precision, out_shape, location.0.clone())
.into()
}
fn add_lut(
@@ -625,12 +635,15 @@ impl<'dag> DagBuilder<'dag> {
input: ffi::OperatorIndex,
table: &[u64],
out_precision: Precision,
location: &Location,
) -> ffi::OperatorIndex {
let table = FunctionTable {
values: table.to_owned(),
};
self.0.add_lut(input.into(), table, out_precision).into()
self.0
.add_lut(input.into(), table, out_precision, location.0.clone())
.into()
}
#[allow(clippy::boxed_local)]
@@ -638,10 +651,11 @@ impl<'dag> DagBuilder<'dag> {
&mut self,
inputs: &[ffi::OperatorIndex],
weights: Box<Weights>,
location: &Location,
) -> ffi::OperatorIndex {
let inputs: Vec<OperatorIndex> = inputs.iter().copied().map(Into::into).collect();
self.0.add_dot(inputs, weights.0).into()
self.0.add_dot(inputs, weights.0, location.0.clone()).into()
}
fn add_levelled_op(
@@ -649,10 +663,12 @@ impl<'dag> DagBuilder<'dag> {
inputs: &[ffi::OperatorIndex],
lwe_dim_cost_factor: f64,
fixed_cost: f64,
manp: f64,
weights: &[f64],
out_shape: &[u64],
comment: &str,
location: &Location,
) -> ffi::OperatorIndex {
debug_assert!(weights.len() == inputs.len());
let inputs: Vec<OperatorIndex> = inputs.iter().copied().map(Into::into).collect();
let out_shape = Shape {
@@ -665,7 +681,14 @@ impl<'dag> DagBuilder<'dag> {
};
self.0
.add_levelled_op(inputs, complexity, manp, out_shape, comment)
.add_levelled_op(
inputs,
complexity,
weights,
out_shape,
comment,
location.0.clone(),
)
.into()
}
@@ -673,16 +696,22 @@ impl<'dag> DagBuilder<'dag> {
&mut self,
input: ffi::OperatorIndex,
rounded_precision: Precision,
location: &Location,
) -> ffi::OperatorIndex {
self.0.add_round_op(input.into(), rounded_precision).into()
self.0
.add_round_op(input.into(), rounded_precision, location.0.clone())
.into()
}
fn add_unsafe_cast_op(
&mut self,
input: ffi::OperatorIndex,
new_precision: Precision,
location: &Location,
) -> ffi::OperatorIndex {
self.0.add_unsafe_cast(input.into(), new_precision).into()
self.0
.add_unsafe_cast(input.into(), new_precision, location.0.clone())
.into()
}
fn tag_operator_as_output(&mut self, op: ffi::OperatorIndex) {
@@ -694,6 +723,30 @@ impl<'dag> DagBuilder<'dag> {
}
}
#[derive(Clone)]
pub struct Location(operator::Location);
fn location_unknown() -> Box<Location> {
Box::new(Location(operator::Location::Unknown))
}
fn location_from_string(string: &str) -> Box<Location> {
let location: Vec<&str> = string.split(':').collect();
match location[..] {
[file] => Box::new(Location(operator::Location::File(file.into()))),
[file, line] => Box::new(Location(operator::Location::Line(
file.into(),
line.parse().unwrap(),
))),
[file, line, column] => Box::new(Location(operator::Location::LineColumn(
file.into(),
line.parse().unwrap(),
column.parse().unwrap(),
))),
_ => Box::new(Location(operator::Location::Unknown)),
}
}
pub struct Weights(operator::Weights);
fn vector(weights: &[i64]) -> Box<Weights> {
@@ -748,6 +801,14 @@ mod ffi {
type DagBuilder<'dag>;
type Location;
#[namespace = "concrete_optimizer::utils"]
fn location_unknown() -> Box<Location>;
#[namespace = "concrete_optimizer::utils"]
fn location_from_string(string: &str) -> Box<Location>;
#[namespace = "concrete_optimizer::dag"]
fn empty() -> Box<Dag>;
@@ -761,6 +822,7 @@ mod ffi {
self: &mut DagBuilder<'_>,
out_precision: u8,
out_shape: &[u64],
location: &Location,
) -> OperatorIndex;
unsafe fn add_lut(
@@ -768,12 +830,14 @@ mod ffi {
input: OperatorIndex,
table: &[u64],
out_precision: u8,
location: &Location,
) -> OperatorIndex;
unsafe fn add_dot(
self: &mut DagBuilder<'_>,
inputs: &[OperatorIndex],
weights: Box<Weights>,
location: &Location,
) -> OperatorIndex;
unsafe fn add_levelled_op(
@@ -781,21 +845,24 @@ mod ffi {
inputs: &[OperatorIndex],
lwe_dim_cost_factor: f64,
fixed_cost: f64,
manp: f64,
weights: &[f64],
out_shape: &[u64],
comment: &str,
location: &Location,
) -> OperatorIndex;
unsafe fn add_round_op(
self: &mut DagBuilder<'_>,
input: OperatorIndex,
rounded_precision: u8,
location: &Location,
) -> OperatorIndex;
unsafe fn add_unsafe_cast_op(
self: &mut DagBuilder<'_>,
input: OperatorIndex,
rounded_precision: u8,
location: &Location,
) -> OperatorIndex;
unsafe fn tag_operator_as_output(self: &mut DagBuilder<'_>, op: OperatorIndex);
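
As a usage sketch only (not part of the diff): this is roughly how the new `Location` argument is threaded through the C++ side of the cxx bridge. The header name `concrete-optimizer.hpp`, the helper name `add_located_lut`, and the `"main.mlir:12:8"` string are illustrative assumptions; the call signatures follow the declarations added above, and the `*location_unknown()` dereference pattern matches the updated unit tests later in this diff.

// Usage sketch under the assumptions stated above.
#include "concrete-optimizer.hpp"  // assumed name of the cxx-generated header

#include <cstdint>
#include <vector>

concrete_optimizer::dag::OperatorIndex
add_located_lut(concrete_optimizer::DagBuilder &builder) {
  std::vector<uint64_t> shape = {3};
  rust::Slice<const uint64_t> shape_slice(shape.data(), shape.size());

  // Parse a "file[:line[:column]]" string into a Location and attach it
  // to the operator being added...
  auto loc = concrete_optimizer::utils::location_from_string("main.mlir:12:8");
  auto input = builder.add_input(/*out_precision=*/8, shape_slice, *loc);

  // ...or fall back to an unknown location when no source info is available.
  std::vector<uint64_t> table(256, 0);
  rust::Slice<const uint64_t> table_slice(table.data(), table.size());
  return builder.add_lut(input, table_slice, /*out_precision=*/8,
                         *concrete_optimizer::utils::location_unknown());
}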

View File

@@ -943,6 +943,7 @@ struct CircuitKeys;
namespace concrete_optimizer {
struct Dag;
struct DagBuilder;
struct Location;
struct Weights;
enum class Encoding : ::std::uint8_t;
enum class MultiParamStrategy : ::std::uint8_t;
@@ -993,12 +994,12 @@ private:
#define CXXBRIDGE1_STRUCT_concrete_optimizer$DagBuilder
struct DagBuilder final : public ::rust::Opaque {
::rust::String dump() const noexcept;
::concrete_optimizer::dag::OperatorIndex add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape) noexcept;
::concrete_optimizer::dag::OperatorIndex add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights) noexcept;
::concrete_optimizer::dag::OperatorIndex add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, double manp, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment) noexcept;
::concrete_optimizer::dag::OperatorIndex add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, ::rust::Slice<double const> weights, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
void tag_operator_as_output(::concrete_optimizer::dag::OperatorIndex op) noexcept;
~DagBuilder() = delete;
@@ -1011,6 +1012,20 @@ private:
};
#endif // CXXBRIDGE1_STRUCT_concrete_optimizer$DagBuilder
#ifndef CXXBRIDGE1_STRUCT_concrete_optimizer$Location
#define CXXBRIDGE1_STRUCT_concrete_optimizer$Location
struct Location final : public ::rust::Opaque {
~Location() = delete;
private:
friend ::rust::layout;
struct layout {
static ::std::size_t size() noexcept;
static ::std::size_t align() noexcept;
};
};
#endif // CXXBRIDGE1_STRUCT_concrete_optimizer$Location
#ifndef CXXBRIDGE1_STRUCT_concrete_optimizer$Weights
#define CXXBRIDGE1_STRUCT_concrete_optimizer$Weights
struct Weights final : public ::rust::Opaque {
@@ -1288,8 +1303,18 @@ extern "C" {
::std::size_t concrete_optimizer$cxxbridge1$Dag$operator$alignof() noexcept;
::std::size_t concrete_optimizer$cxxbridge1$DagBuilder$operator$sizeof() noexcept;
::std::size_t concrete_optimizer$cxxbridge1$DagBuilder$operator$alignof() noexcept;
::std::size_t concrete_optimizer$cxxbridge1$Location$operator$sizeof() noexcept;
::std::size_t concrete_optimizer$cxxbridge1$Location$operator$alignof() noexcept;
} // extern "C"
namespace utils {
extern "C" {
::concrete_optimizer::Location *concrete_optimizer$utils$cxxbridge1$location_unknown() noexcept;
::concrete_optimizer::Location *concrete_optimizer$utils$cxxbridge1$location_from_string(::rust::Str string) noexcept;
} // extern "C"
} // namespace utils
namespace dag {
extern "C" {
::concrete_optimizer::Dag *concrete_optimizer$dag$cxxbridge1$empty() noexcept;
@@ -1303,17 +1328,17 @@ void concrete_optimizer$cxxbridge1$Dag$dump(::concrete_optimizer::Dag const &sel
void concrete_optimizer$cxxbridge1$DagBuilder$dump(::concrete_optimizer::DagBuilder const &self, ::rust::String *return$) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_input(::concrete_optimizer::DagBuilder &self, ::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_input(::concrete_optimizer::DagBuilder &self, ::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_lut(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_lut(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_dot(::concrete_optimizer::DagBuilder &self, ::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::concrete_optimizer::Weights *weights) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_dot(::concrete_optimizer::DagBuilder &self, ::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::concrete_optimizer::Weights *weights, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_levelled_op(::concrete_optimizer::DagBuilder &self, ::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, double manp, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_levelled_op(::concrete_optimizer::DagBuilder &self, ::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, ::rust::Slice<double const> weights, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_round_op(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_round_op(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_unsafe_cast_op(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex concrete_optimizer$cxxbridge1$DagBuilder$add_unsafe_cast_op(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
void concrete_optimizer$cxxbridge1$DagBuilder$tag_operator_as_output(::concrete_optimizer::DagBuilder &self, ::concrete_optimizer::dag::OperatorIndex op) noexcept;
@@ -1393,6 +1418,24 @@ namespace utils {
return concrete_optimizer$cxxbridge1$DagBuilder$operator$alignof();
}
::std::size_t Location::layout::size() noexcept {
return concrete_optimizer$cxxbridge1$Location$operator$sizeof();
}
::std::size_t Location::layout::align() noexcept {
return concrete_optimizer$cxxbridge1$Location$operator$alignof();
}
namespace utils {
::rust::Box<::concrete_optimizer::Location> location_unknown() noexcept {
return ::rust::Box<::concrete_optimizer::Location>::from_raw(concrete_optimizer$utils$cxxbridge1$location_unknown());
}
::rust::Box<::concrete_optimizer::Location> location_from_string(::rust::Str string) noexcept {
return ::rust::Box<::concrete_optimizer::Location>::from_raw(concrete_optimizer$utils$cxxbridge1$location_from_string(string));
}
} // namespace utils
namespace dag {
::rust::Box<::concrete_optimizer::Dag> empty() noexcept {
return ::rust::Box<::concrete_optimizer::Dag>::from_raw(concrete_optimizer$dag$cxxbridge1$empty());
@@ -1415,28 +1458,28 @@ namespace dag {
return ::std::move(return$.value);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_input(*this, out_precision, out_shape);
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_input(*this, out_precision, out_shape, location);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_lut(*this, input, table, out_precision);
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_lut(*this, input, table, out_precision, location);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_dot(*this, inputs, weights.into_raw());
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_dot(*this, inputs, weights.into_raw(), location);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, double manp, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_levelled_op(*this, inputs, lwe_dim_cost_factor, fixed_cost, manp, out_shape, comment);
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, ::rust::Slice<double const> weights, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_levelled_op(*this, inputs, lwe_dim_cost_factor, fixed_cost, weights, out_shape, comment, location);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_round_op(*this, input, rounded_precision);
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_round_op(*this, input, rounded_precision, location);
}
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_unsafe_cast_op(*this, input, rounded_precision);
::concrete_optimizer::dag::OperatorIndex DagBuilder::add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept {
return concrete_optimizer$cxxbridge1$DagBuilder$add_unsafe_cast_op(*this, input, rounded_precision, location);
}
void DagBuilder::tag_operator_as_output(::concrete_optimizer::dag::OperatorIndex op) noexcept {
@@ -1517,6 +1560,10 @@ namespace weights {
} // namespace concrete_optimizer
extern "C" {
::concrete_optimizer::Location *cxxbridge1$box$concrete_optimizer$Location$alloc() noexcept;
void cxxbridge1$box$concrete_optimizer$Location$dealloc(::concrete_optimizer::Location *) noexcept;
void cxxbridge1$box$concrete_optimizer$Location$drop(::rust::Box<::concrete_optimizer::Location> *ptr) noexcept;
::concrete_optimizer::Dag *cxxbridge1$box$concrete_optimizer$Dag$alloc() noexcept;
void cxxbridge1$box$concrete_optimizer$Dag$dealloc(::concrete_optimizer::Dag *) noexcept;
void cxxbridge1$box$concrete_optimizer$Dag$drop(::rust::Box<::concrete_optimizer::Dag> *ptr) noexcept;
@@ -1605,6 +1652,18 @@ void cxxbridge1$rust_vec$concrete_optimizer$dag$InstructionKeys$truncate(::rust:
namespace rust {
inline namespace cxxbridge1 {
template <>
::concrete_optimizer::Location *Box<::concrete_optimizer::Location>::allocation::alloc() noexcept {
return cxxbridge1$box$concrete_optimizer$Location$alloc();
}
template <>
void Box<::concrete_optimizer::Location>::allocation::dealloc(::concrete_optimizer::Location *ptr) noexcept {
cxxbridge1$box$concrete_optimizer$Location$dealloc(ptr);
}
template <>
void Box<::concrete_optimizer::Location>::drop() noexcept {
cxxbridge1$box$concrete_optimizer$Location$drop(this);
}
template <>
::concrete_optimizer::Dag *Box<::concrete_optimizer::Dag>::allocation::alloc() noexcept {
return cxxbridge1$box$concrete_optimizer$Dag$alloc();
}

View File

@@ -924,6 +924,7 @@ struct CircuitKeys;
namespace concrete_optimizer {
struct Dag;
struct DagBuilder;
struct Location;
struct Weights;
enum class Encoding : ::std::uint8_t;
enum class MultiParamStrategy : ::std::uint8_t;
@@ -974,12 +975,12 @@ private:
#define CXXBRIDGE1_STRUCT_concrete_optimizer$DagBuilder
struct DagBuilder final : public ::rust::Opaque {
::rust::String dump() const noexcept;
::concrete_optimizer::dag::OperatorIndex add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape) noexcept;
::concrete_optimizer::dag::OperatorIndex add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights) noexcept;
::concrete_optimizer::dag::OperatorIndex add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, double manp, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment) noexcept;
::concrete_optimizer::dag::OperatorIndex add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision) noexcept;
::concrete_optimizer::dag::OperatorIndex add_input(::std::uint8_t out_precision, ::rust::Slice<::std::uint64_t const> out_shape, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_lut(::concrete_optimizer::dag::OperatorIndex input, ::rust::Slice<::std::uint64_t const> table, ::std::uint8_t out_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_dot(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, ::rust::Box<::concrete_optimizer::Weights> weights, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_levelled_op(::rust::Slice<::concrete_optimizer::dag::OperatorIndex const> inputs, double lwe_dim_cost_factor, double fixed_cost, ::rust::Slice<double const> weights, ::rust::Slice<::std::uint64_t const> out_shape, ::rust::Str comment, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_round_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
::concrete_optimizer::dag::OperatorIndex add_unsafe_cast_op(::concrete_optimizer::dag::OperatorIndex input, ::std::uint8_t rounded_precision, ::concrete_optimizer::Location const &location) noexcept;
void tag_operator_as_output(::concrete_optimizer::dag::OperatorIndex op) noexcept;
~DagBuilder() = delete;
@@ -992,6 +993,20 @@ private:
};
#endif // CXXBRIDGE1_STRUCT_concrete_optimizer$DagBuilder
#ifndef CXXBRIDGE1_STRUCT_concrete_optimizer$Location
#define CXXBRIDGE1_STRUCT_concrete_optimizer$Location
struct Location final : public ::rust::Opaque {
~Location() = delete;
private:
friend ::rust::layout;
struct layout {
static ::std::size_t size() noexcept;
static ::std::size_t align() noexcept;
};
};
#endif // CXXBRIDGE1_STRUCT_concrete_optimizer$Location
#ifndef CXXBRIDGE1_STRUCT_concrete_optimizer$Weights
#define CXXBRIDGE1_STRUCT_concrete_optimizer$Weights
struct Weights final : public ::rust::Opaque {
@@ -1258,6 +1273,10 @@ namespace utils {
::concrete_optimizer::dag::DagSolution convert_to_dag_solution(::concrete_optimizer::v0::Solution const &solution) noexcept;
::concrete_optimizer::dag::CircuitSolution convert_to_circuit_solution(::concrete_optimizer::dag::DagSolution const &solution, ::concrete_optimizer::Dag const &dag) noexcept;
::rust::Box<::concrete_optimizer::Location> location_unknown() noexcept;
::rust::Box<::concrete_optimizer::Location> location_from_string(::rust::Str string) noexcept;
} // namespace utils
namespace dag {

View File

@@ -51,7 +51,7 @@ TEST test_dag_no_lut() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex node1 =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<concrete_optimizer::dag::OperatorIndex> inputs = {node1};
@@ -60,7 +60,7 @@ TEST test_dag_no_lut() {
rust::cxxbridge1::Box<concrete_optimizer::Weights> weights =
concrete_optimizer::weights::vector(slice(weight_vec));
auto id = builder->add_dot(slice(inputs), std::move(weights));
auto id = builder->add_dot(slice(inputs), std::move(weights), *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto solution = dag->optimize(default_options());
@@ -75,10 +75,10 @@ TEST test_dag_lut() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto id = builder->add_lut(input, slice(table), PRECISION_8B);
auto id = builder->add_lut(input, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto solution = dag->optimize(default_options());
@@ -94,10 +94,10 @@ TEST test_dag_lut_wop() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input =
builder->add_input(PRECISION_16B, slice(shape));
builder->add_input(PRECISION_16B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto id = builder->add_lut(input, slice(table), PRECISION_16B);
auto id = builder->add_lut(input, slice(table), PRECISION_16B, *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto solution = dag->optimize(default_options());
@@ -113,10 +113,10 @@ TEST test_dag_lut_force_wop() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto id = builder->add_lut(input, slice(table), PRECISION_8B);
auto id = builder->add_lut(input, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto options = default_options();
@@ -133,10 +133,10 @@ TEST test_multi_parameters_1_precision() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto id = builder->add_lut(input, slice(table), PRECISION_8B);
auto id = builder->add_lut(input, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto options = default_options();
@@ -160,14 +160,14 @@ TEST test_multi_parameters_2_precision() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input1 =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
concrete_optimizer::dag::OperatorIndex input2 =
builder->add_input(PRECISION_1B, slice(shape));
builder->add_input(PRECISION_1B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto lut1 = builder->add_lut(input1, slice(table), PRECISION_8B);
auto lut2 = builder->add_lut(input2, slice(table), PRECISION_8B);
auto lut1 = builder->add_lut(input1, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
auto lut2 = builder->add_lut(input2, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
std::vector<concrete_optimizer::dag::OperatorIndex> inputs = {lut1, lut2};
@@ -176,7 +176,7 @@ TEST test_multi_parameters_2_precision() {
rust::cxxbridge1::Box<concrete_optimizer::Weights> weights =
concrete_optimizer::weights::vector(slice(weight_vec));
auto id = builder->add_dot(slice(inputs), std::move(weights));
auto id = builder->add_dot(slice(inputs), std::move(weights), *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto options = default_options();
@@ -201,14 +201,14 @@ TEST test_multi_parameters_2_precision_crt() {
std::vector<uint64_t> shape = {3};
concrete_optimizer::dag::OperatorIndex input1 =
builder->add_input(PRECISION_8B, slice(shape));
builder->add_input(PRECISION_8B, slice(shape), *concrete_optimizer::utils::location_unknown());
concrete_optimizer::dag::OperatorIndex input2 =
builder->add_input(PRECISION_1B, slice(shape));
builder->add_input(PRECISION_1B, slice(shape), *concrete_optimizer::utils::location_unknown());
std::vector<u_int64_t> table = {};
auto lut1 = builder->add_lut(input1, slice(table), PRECISION_8B);
auto lut2 = builder->add_lut(input2, slice(table), PRECISION_8B);
auto lut1 = builder->add_lut(input1, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
auto lut2 = builder->add_lut(input2, slice(table), PRECISION_8B, *concrete_optimizer::utils::location_unknown());
std::vector<concrete_optimizer::dag::OperatorIndex> inputs = {lut1, lut2};
@@ -217,7 +217,7 @@ TEST test_multi_parameters_2_precision_crt() {
rust::cxxbridge1::Box<concrete_optimizer::Weights> weights =
concrete_optimizer::weights::vector(slice(weight_vec));
auto id = builder->add_dot(slice(inputs), std::move(weights));
auto id = builder->add_dot(slice(inputs), std::move(weights), *concrete_optimizer::utils::location_unknown());
builder->tag_operator_as_output(id);
auto options = default_options();

View File

@@ -0,0 +1,28 @@
use std::{fmt::Display, path::PathBuf};
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Location {
Unknown,
File(PathBuf),
Line(PathBuf, usize),
LineColumn(PathBuf, usize, usize),
}
impl Display for Location {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Unknown => write!(f, "unknown location"),
Self::File(file) => write!(f, "{}", file.file_name().unwrap().to_str().unwrap()),
Self::Line(file, line) => {
write!(f, "{}:{line}", file.file_name().unwrap().to_str().unwrap())
}
Self::LineColumn(file, line, column) => {
write!(
f,
"{}:{line}:{column}",
file.file_name().unwrap().to_str().unwrap()
)
}
}
}
}
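A minimal sketch of a unit test one might place alongside this module to pin down the `Display` behaviour shown above. The test name and its placement in a `tests` submodule are assumptions; the expected strings follow directly from the `fmt` implementation, which prints only the final path component via `file_name()`.

```rust
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn display_prints_file_name_line_and_column() {
        // Only the last path component is shown, per `file_name()` above.
        assert_eq!(Location::Unknown.to_string(), "unknown location");
        assert_eq!(
            Location::File(PathBuf::from("/src/main.rs")).to_string(),
            "main.rs"
        );
        assert_eq!(
            Location::Line(PathBuf::from("/src/main.rs"), 42).to_string(),
            "main.rs:42"
        );
        assert_eq!(
            Location::LineColumn(PathBuf::from("/src/main.rs"), 42, 7).to_string(),
            "main.rs:42:7"
        );
    }
}
```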

View File

@@ -1,8 +1,10 @@
#![allow(clippy::module_inception)]
pub mod dot_kind;
pub mod location;
pub mod operator;
pub mod tensor;
pub use self::dot_kind::*;
pub use self::location::*;
pub use self::operator::*;
pub use self::tensor::*;

View File

@@ -3,6 +3,7 @@ use std::iter::{empty, once};
use std::ops::Deref;
use crate::dag::operator::tensor::{ClearTensor, Shape};
use crate::optimization::dag::multi_parameters::partition_cut::ExternalPartition;
use super::DotKind;
@@ -89,7 +90,7 @@ pub enum Operator {
LevelledOp {
inputs: Vec<OperatorIndex>,
complexity: LevelledComplexity,
manp: f64,
weights: Vec<f64>,
out_shape: Shape,
comment: String,
},
@@ -104,6 +105,11 @@ pub enum Operator {
input: OperatorIndex,
out_precision: Precision,
},
ChangePartition {
input: OperatorIndex,
src_partition: Option<ExternalPartition>,
dst_partition: Option<ExternalPartition>,
},
}
impl Operator {
@@ -114,7 +120,8 @@ impl Operator {
Self::LevelledOp { inputs, .. } | Self::Dot { inputs, .. } => Box::new(inputs.iter()),
Self::UnsafeCast { input, .. }
| Self::Lut { input, .. }
| Self::Round { input, .. } => Box::new(once(input)),
| Self::Round { input, .. }
| Self::ChangePartition { input, .. } => Box::new(once(input)),
}
}
}
@@ -171,7 +178,7 @@ impl fmt::Display for Operator {
}
Self::LevelledOp {
inputs,
manp,
weights,
out_shape,
..
} => {
@@ -182,7 +189,7 @@ impl fmt::Display for Operator {
}
write!(f, "%{}", input.0)?;
}
write!(f, "] : manp={manp} x {out_shape:?}")?;
write!(f, "] : weights={weights:?}, out_shape={out_shape:?}")?;
}
Self::Round {
input,
@@ -190,6 +197,23 @@ impl fmt::Display for Operator {
} => {
write!(f, "ROUND[%{}] : u{out_precision}", input.0)?;
}
Self::ChangePartition {
input,
src_partition,
dst_partition,
} => {
write!(f, "CHANGE_PARTITION[%{}] : {{", input.0)?;
if let Some(partition) = src_partition {
write!(f, "src_partition: {}", partition.name)?;
}
if let Some(partition) = dst_partition {
if src_partition.is_some() {
write!(f, ", ")?;
}
write!(f, "dst_partition: {}", partition.name)?;
}
write!(f, "}}")?;
}
}
Ok(())
}
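To make the new `ChangePartition` rendering concrete, here is a small self-contained sketch that reproduces the formatting logic of the `Display` arm above. It uses a hypothetical `Partition { name }` stand-in for `ExternalPartition` (whose full definition is not part of this diff, only its `name` field is used here), and the operator index and partition names are made up for illustration.

```rust
use std::fmt::Write;

// Hypothetical stand-in for `ExternalPartition`; only `name` matters
// for the CHANGE_PARTITION rendering shown above.
struct Partition {
    name: String,
}

// Mirrors the Display arm: either side is optional, and a comma is
// emitted only when both src and dst partitions are present.
fn format_change_partition(
    input_index: usize,
    src_partition: Option<&Partition>,
    dst_partition: Option<&Partition>,
) -> String {
    let mut out = String::new();
    write!(out, "CHANGE_PARTITION[%{input_index}] : {{").unwrap();
    if let Some(partition) = src_partition {
        write!(out, "src_partition: {}", partition.name).unwrap();
    }
    if let Some(partition) = dst_partition {
        if src_partition.is_some() {
            write!(out, ", ").unwrap();
        }
        write!(out, "dst_partition: {}", partition.name).unwrap();
    }
    write!(out, "}}").unwrap();
    out
}

fn main() {
    let tfhe = Partition { name: "tfhe".into() };
    let native = Partition { name: "native".into() };
    assert_eq!(
        format_change_partition(3, Some(&tfhe), Some(&native)),
        "CHANGE_PARTITION[%3] : {src_partition: tfhe, dst_partition: native}"
    );
    assert_eq!(
        format_change_partition(3, None, Some(&native)),
        "CHANGE_PARTITION[%3] : {dst_partition: native}"
    );
}
```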

Some files were not shown because too many files have changed in this diff.