Add RuntimeManager to centralize runtime management

2026-04-29 03:00:45 -04:00 · 2024-12-24 14:44:44 +00:00
575 changed files with 25457 additions and 31210 deletions
@@ -1,5 +0,0 @@
-frontend/node_modules
-config.toml
-.envrc
-.env
-.git
@@ -1 +0,0 @@
-*.ipynb linguist-vendored
@@ -1,41 +0,0 @@
---
-name: Bug Report
-about: Report a problem with OpenDevin
-title: ''
-labels: 'bug'
-assignees: ''
-
---
-<!-- You MUST fill out this template. We will close issues that don't include enough information to reproduce -->
-#### Describe the bug
-<!-- a short description of the problem -->
-
-#### Setup and configuration
-**Current version**:
-<!-- run `git log -n 1` to see this -->
-```bash
-```
-
-**My operating system**:
-
-<!-- tell us everything about your environment -->
-**My environment vars and other configuration** (be sure to redact API keys):
-```bash
-```
-
-**My model and agent** (you can see these settings in the UI):
-* Model:
-* Agent:
-
-**Commands I ran to install and run OpenDevin**:
-```
-```
-
-**Steps to Reproduce**:
-1.
-2.
-3.
-
-**Logs, error messages, and screenshots**:
-
-#### Additional Context
@@ -1,18 +0,0 @@
---
-name: Feature Request
-about: Suggest an idea for OpenDevin features
-title: ''
-labels: 'enhancement'
-assignees: ''
-
---
-
-**What problem or use case are you trying to solve?**
-
-**Describe the UX of the solution you'd like**
-
-**Do you have thoughts on the technical implementation?**
-
-**Describe alternatives you've considered**
-
-**Additional context**
@@ -1,16 +0,0 @@
---
-name: Question
-about: Use this template to ask a question regarding the project.
-title: ''
-labels: question
-assignees: ''
-
---
-
-## Describe your question
-
-<!--A clear and concise description of what you want to know.-->
-
-## Additional context
-
-<!--Add any other context about the question here, like what you've tried so far.-->
@@ -1,18 +0,0 @@
---
-name: Technical Proposal
-about: Propose a new architecture or technology
-title: ''
-labels: 'proposal'
-assignees: ''
-
---
-
-**Summary**
-
-**Motivation**
-
-**Technical Design**
-
-**Alternatives to Consider**
-
-**Additional context**
@@ -1,47 +0,0 @@
-# CI workflow: builds the project with `make build` and runs the pytest suite
-# on macOS and Linux runners, once per Python version in the matrix.
-name: Build & Run Tests
-
-on: [push, pull_request]
-
-jobs:
-  on-macos:
-    runs-on: macos-latest
-    strategy:
-      matrix:
-        # Quoted so YAML keeps the versions as strings rather than floats.
-        python-version: ["3.11", "3.12"]
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        # Same major version as the Lint workflow; v2 is long outdated.
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      # Docker is provided through colima on this runner.
-      - name: Install & Start Docker
-        run: |
-          brew install colima docker
-          colima start
-      - name: Install and configure Poetry
-        uses: snok/install-poetry@v1
-        with:
-          version: latest
-      - name: Build Environment
-        run: make build
-      - name: Run Tests
-        run: poetry run pytest ./tests
-  on-linux:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.11", "3.12"]
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install Poetry
-        run: curl -sSL https://install.python-poetry.org | python3 -
-      - name: Build Environment
-        run: make build
-      - name: Run Tests
-        run: poetry run pytest ./tests
@@ -1,109 +0,0 @@
-# Runs OpenDevin against an issue that has just been labeled `dogfood-this`,
-# pushes whatever it changed to a fresh branch, opens a pull request for that
-# branch and links the PR back on the issue.
-name: Use OpenDevin to Resolve GitHub Issue
-
-on:
-  issues:
-    types: [labeled]
-
-permissions:
-  contents: write
-  pull-requests: write
-  issues: write
-
-jobs:
-  open-devin:
-    if: github.event.label.name == 'dogfood-this'
-    runs-on: ubuntu-latest
-    container:
-      image: ghcr.io/opendevin/opendevin
-      volumes:
-        - /var/run/docker.sock:/var/run/docker.sock
-
-    steps:
-    - name: install git, github cli
-      # Refresh the package index first; installing against a stale or empty
-      # index fails.
-      run: |
-        apt-get update
-        apt-get install -y git gh
-
-    - name: Checkout Repository
-      uses: actions/checkout@v4
-
-    - name: Write Task File
-      # The issue text is passed through env vars instead of being expanded
-      # inline, so it is never interpreted as shell code.
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-      run: |
-        echo "TITLE:" > task.txt
-        echo "${ISSUE_TITLE}" >> task.txt
-        echo "" >> task.txt
-        echo "BODY:" >> task.txt
-        echo "${ISSUE_BODY}" >> task.txt
-
-    - name: Run OpenDevin
-      env:
-        ISSUE_TITLE: ${{ github.event.issue.title }}
-        ISSUE_BODY: ${{ github.event.issue.body }}
-        LLM_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        SANDBOX_TYPE: exec
-      run: |
-        # `-d` was previously passed twice ("./" and $GITHUB_WORKSPACE); only
-        # the workspace path is kept.
-        python ./opendevin/main.py -i 50 -f task.txt -d "$GITHUB_WORKSPACE"
-        rm task.txt
-
-    - name: Setup Git, Create Branch, and Commit Changes
-      run: |
-        # Setup Git configuration
-        git config --global --add safe.directory "$PWD"
-        git config --global user.name 'OpenDevin'
-        git config --global user.email 'OpenDevin@users.noreply.github.com'
-
-        # Create a unique branch name with a timestamp
-        BRANCH_NAME="fix/${{ github.event.issue.number }}-$(date +%Y%m%d%H%M%S)"
-
-        # Checkout new branch
-        git checkout -b "$BRANCH_NAME"
-
-        # Add all changes to staging, except task.txt
-        git add --all -- ':!task.txt'
-
-        # Stop early when nothing is staged. The index is checked up front
-        # because run scripts execute with errexit: a failing `git commit`
-        # would abort the step before its exit status could be inspected.
-        if git diff --cached --quiet; then
-          echo "No changes to commit."
-          echo "HAS_CHANGES=false" >> "$GITHUB_ENV"
-          exit 0
-        fi
-
-        # Commit and push the changes
-        git commit -m "OpenDevin: Resolve Issue #${{ github.event.issue.number }}"
-        git push --set-upstream origin "$BRANCH_NAME"
-
-        # Hand the branch name to the PR step and mark that there is work
-        # to publish.
-        echo "BRANCH_NAME=$BRANCH_NAME" >> "$GITHUB_ENV"
-        echo "HAS_CHANGES=true" >> "$GITHUB_ENV"
-
-    - name: Fetch Default Branch
-      if: env.HAS_CHANGES == 'true'
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Fetch the default branch using gh cli
-        DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq .defaultBranchRef.name)
-        echo "Default branch is $DEFAULT_BRANCH"
-        echo "DEFAULT_BRANCH=$DEFAULT_BRANCH" >> "$GITHUB_ENV"
-
-    - name: Generate PR
-      if: env.HAS_CHANGES == 'true'
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        # Create PR and capture URL. The issue number, repository and head
-        # branch come from the triggering event and the commit step rather
-        # than being hard-coded; `github.head_ref` is empty for `issues`
-        # events, so the branch pushed above is used as the head.
-        PR_URL=$(gh pr create \
-          --title "OpenDevin: Resolve Issue #${{ github.event.issue.number }}" \
-          --body "This PR was generated by OpenDevin to resolve issue #${{ github.event.issue.number }}" \
-          --repo "${{ github.repository }}" \
-          --head "${{ env.BRANCH_NAME }}" \
-          --base "${{ env.DEFAULT_BRANCH }}" \
-          | grep -o 'https://github.com/[^ ]*')
-
-        # Extract PR number from URL
-        PR_NUMBER=$(echo "$PR_URL" | grep -o '[0-9]\+$')
-
-        # Set environment vars
-        echo "PR_URL=$PR_URL" >> "$GITHUB_ENV"
-        echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_ENV"
-
-    - name: Post Comment
-      if: env.HAS_CHANGES == 'true'
-      env:
-        GH_TOKEN: ${{ github.token }}
-      run: |
-        gh issue comment ${{ github.event.issue.number }} \
-          -b "OpenDevin raised [PR #${{ env.PR_NUMBER }}](${{ env.PR_URL }}) to resolve this issue."
@@ -1,36 +0,0 @@
-# Builds each image in the matrix with ./containers/build.sh and pushes it
-# after logging in to ghcr.io. Runs on every push, or manually when a
-# non-empty reason is supplied.
-name: Publish Docker Image
-
-on:
-  push:
-    branches:
-      - '**'
-  workflow_dispatch:
-    inputs:
-      reason:
-        description: 'Reason for manual trigger'
-        required: true
-        default: ''
-
-jobs:
-  ghcr_build_and_push:
-    runs-on: ubuntu-latest
-    # Manual runs are skipped unless a reason was actually entered.
-    if: github.event_name == 'push' || github.event.inputs.reason != ''
-    strategy:
-      matrix:
-        image:
-          - app
-          - evaluation
-          - sandbox
-
-    steps:
-      - name: checkout
-        uses: actions/checkout@v4
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log-in to ghcr.io
-        # The token is piped on stdin so it does not appear as an argument.
-        run: |
-          echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
-
-      - name: Build and push ${{ matrix.image }}
-        run: |
-          ./containers/build.sh ${{ matrix.image }} --push
@@ -1,38 +0,0 @@
-# Lints the frontend with `npm run lint` and the Python packages with the
-# project's pre-commit configuration.
-name: Lint
-
-on: [push, pull_request]
-
-jobs:
-  lint-frontend:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Node.js 20
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Install dependencies
-        # `--frozen-lockfile` is a yarn/pnpm flag that npm does not implement;
-        # `npm ci` is npm's install-exactly-from-the-lockfile command.
-        run: |
-          cd frontend
-          npm ci
-
-      - name: Lint
-        run: |
-          cd frontend
-          npm run lint
-
-  lint-python:
-    name: Lint python
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up python
-        uses: actions/setup-python@v5
-        with:
-          # Quoted so YAML does not read the version as a float.
-          python-version: "3.11"
-      - name: Install pre-commit
-        run: pip install pre-commit==3.7.0
-      - name: Run pre-commit hooks
-        # Without globstar, bash treats `**` like `*`, so only files exactly
-        # two levels deep would be passed to pre-commit.
-        run: |
-          shopt -s globstar
-          pre-commit run --files opendevin/**/* agenthub/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
@@ -1,20 +0,0 @@
-# Installs the project with Poetry (without the `evaluation` group) and runs
-# the pytest suite on every push.
-name: Run Tests
-
-on: [push]
-
-jobs:
-  test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python
-        # Same major version as the Lint workflow; v2 is long outdated.
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-      - name: Set up environment
-        run: |
-          curl -sSL https://install.python-poetry.org | python3 -
-          poetry install --without evaluation
-      - name: Run tests
-        run: |
-          poetry run pytest ./tests
@@ -1,29 +0,0 @@
-# Marks inactive issues and PRs as stale and later closes them. Two passes:
-# a fast one restricted to items labeled `age-out`, and a slower default one.
-name: 'Close stale issues'
-on:
-  schedule:
-    # Every day at 01:30.
-    - cron: '30 1 * * *'
-
-jobs:
-  stale:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/stale@v9
-        with:
-          # Aggressively close issues that have been explicitly labeled `age-out`
-          any-of-labels: age-out
-          stale-issue-message: 'This issue is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 day.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 7 days with no activity.'
-          stale-pr-message: 'This PR is stale because it has been open for 7 days with no activity. Remove stale label or comment or this will be closed in 1 day.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 7 days with no activity.'
-          days-before-stale: 7
-          days-before-close: 1
-
-      - uses: actions/stale@v9
-        with:
-          # Be more lenient with other issues
-          stale-issue-message: 'This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-issue-message: 'This issue was closed because it has been stalled for over 30 days with no activity.'
-          stale-pr-message: 'This PR is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days.'
-          close-pr-message: 'This PR was closed because it has been stalled for over 30 days with no activity.'
-          days-before-stale: 30
-          days-before-close: 7
@@ -1,204 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-# Anchored to the repository root with a leading slash; gitignore does not
-# understand a `./` prefix, so `./lib/` never matched anything.
-/lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-requirements.txt
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-.python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-# poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-*venv/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
-.vscode/
-
-# evaluation
-evaluation/SWE-bench/data
-
-# frontend
-
-# dependencies
-frontend/node_modules
-frontend/.pnp
-frontend/bun.lockb
-frontend/yarn.lock
-.pnp.js
-
-# testing
-frontend/coverage
-
-# production
-frontend/build
-frontend/dist
-
-# misc
-.DS_Store
-.env.local
-.env.development.local
-.env.test.local
-.env.production.local
-
-npm-debug.log*
-yarn-debug.log*
-yarn-error.log*
-
-logs
-
-# agent
-.envrc
-/workspace
-/debug
-cache
-
-# configuration
-config.toml
@@ -1,89 +0,0 @@
-# Contributing
-
-Thanks for your interest in contributing to OpenDevin! We welcome and appreciate contributions.
-To report bugs, create a [GitHub issue](https://github.com/OpenDevin/OpenDevin/issues/new/choose).
-
-## Contribution Guide
-### 1. Fork the Official Repository
-
-Fork [OpenDevin repository](https://github.com/OpenDevin/OpenDevin) into your own account.
-Clone your own forked repository into your local environment.
-
-```shell
-git clone git@github.com:<YOUR-USERNAME>/OpenDevin.git
-```
-
-### 2. Configure Git
-
-Set the official repository as your [upstream](https://www.atlassian.com/git/tutorials/git-forks-and-upstreams) to synchronize with the latest update in the official repository.
-Add the original repository as upstream
-
-```shell
-cd OpenDevin
-git remote add upstream git@github.com:OpenDevin/OpenDevin.git
-```
-
-Verify that the remote is set.
-```shell
-git remote -v
-```
-You should see both `origin` and `upstream` in the output.
-
-### 3. Synchronize with Official Repository
-Synchronize with the latest commits from the official repository before you start coding.
-
-```shell
-git fetch upstream
-git checkout main
-git merge upstream/main
-git push origin main
-```
-
-### 4. Create a New Branch And Open a Pull Request
-After you finish your implementation, push your new branch to your fork and open a pull request from it. The source branch is your new branch, and the target branch is the `main` branch of `OpenDevin/OpenDevin`. The PR should then appear in [OpenDevin PRs](https://github.com/OpenDevin/OpenDevin/pulls).
-
-The OpenDevin team will then review your code.
-
-## PR Rules
-
-### 1. Pull Request title
-
-As described [here](https://github.com/commitizen/conventional-commit-types/blob/master/index.json), a valid PR title should begin with one of the following prefixes:
-
- `feat`: A new feature
- `fix`: A bug fix
- `doc`: Documentation only changes
- `refactor`: A code change that neither fixes a bug nor adds a feature
- `style`: A refactoring that improves code style
- `perf`: A code change that improves performance
- `test`: Adding missing tests or correcting existing tests
- `ci`: Changes to CI configuration files and scripts (example scopes: `.github`, `ci` (Buildkite))
- `chore`: Other changes that don't modify src or test files
- `revert`: Reverts a previous commit
-
-For example, a PR title could be:
- `refactor: modify package path`
- `feat(frontend): xxxx`, where `(frontend)` means that this PR mainly focuses on the frontend component.
-
-You may also check out previous PRs in the [PR list](https://github.com/OpenDevin/OpenDevin/pulls).
-
-As described [here](https://github.com/OpenDevin/OpenDevin/labels), we have created several labels. Every PR should be tagged with the corresponding labels.
-
-### 2. Pull Request description
-
- If your PR is small (such as a typo fix), you can go brief.
- If it is large and you have changed a lot, it's better to write more details.
-
-
-## How to begin
-Please refer to the README in each module:
- [frontend](./frontend/README.md)
- [agenthub](./agenthub/README.md)
- [evaluation](./evaluation/README.md)
- [opendevin](./opendevin/README.md)
-    - [server](./opendevin/server/README.md)
-    - [mock server](./opendevin/mock/README.md)
-
-## Tests
-TODO: make sure the code passes the tests before submitting.
-
@@ -1,133 +0,0 @@
-
-# Contributor Covenant Code of Conduct
-
-## Our Pledge
-
-We as members, contributors, and leaders pledge to make participation in our
-community a harassment-free experience for everyone, regardless of age, body
-size, visible or invisible disability, ethnicity, sex characteristics, gender
-identity and expression, level of experience, education, socio-economic status,
-nationality, personal appearance, race, caste, color, religion, or sexual
-identity and orientation.
-
-We pledge to act and interact in ways that contribute to an open, welcoming,
-diverse, inclusive, and healthy community.
-
-## Our Standards
-
-Examples of behavior that contributes to a positive environment for our
-community include:
-
-* Demonstrating empathy and kindness toward other people
-* Being respectful of differing opinions, viewpoints, and experiences
-* Giving and gracefully accepting constructive feedback
-* Accepting responsibility and apologizing to those affected by our mistakes,
-  and learning from the experience
-* Focusing on what is best not just for us as individuals, but for the overall
-  community
-
-Examples of unacceptable behavior include:
-
-* The use of sexualized language or imagery, and sexual attention or advances of
-  any kind
-* Trolling, insulting or derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or email address,
-  without their explicit permission
-* Other conduct which could reasonably be considered inappropriate in a
-  professional setting
-
-## Enforcement Responsibilities
-
-Community leaders are responsible for clarifying and enforcing our standards of
-acceptable behavior and will take appropriate and fair corrective action in
-response to any behavior that they deem inappropriate, threatening, offensive,
-or harmful.
-
-Community leaders have the right and responsibility to remove, edit, or reject
-comments, commits, code, wiki edits, issues, and other contributions that are
-not aligned to this Code of Conduct, and will communicate reasons for moderation
-decisions when appropriate.
-
-## Scope
-
-This Code of Conduct applies within all community spaces, and also applies when
-an individual is officially representing the community in public spaces.
-Examples of representing our community include using an official email address,
-posting via an official social media account, or acting as an appointed
-representative at an online or offline event.
-
-## Enforcement
-
-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported to the community leaders responsible for enforcement at
-contact@rbren.io.
-All complaints will be reviewed and investigated promptly and fairly.
-
-All community leaders are obligated to respect the privacy and security of the
-reporter of any incident.
-
-## Enforcement Guidelines
-
-Community leaders will follow these Community Impact Guidelines in determining
-the consequences for any action they deem in violation of this Code of Conduct:
-
-### 1. Correction
-
-**Community Impact**: Use of inappropriate language or other behavior deemed
-unprofessional or unwelcome in the community.
-
-**Consequence**: A private, written warning from community leaders, providing
-clarity around the nature of the violation and an explanation of why the
-behavior was inappropriate. A public apology may be requested.
-
-### 2. Warning
-
-**Community Impact**: A violation through a single incident or series of
-actions.
-
-**Consequence**: A warning with consequences for continued behavior. No
-interaction with the people involved, including unsolicited interaction with
-those enforcing the Code of Conduct, for a specified period of time. This
-includes avoiding interactions in community spaces as well as external channels
-like social media. Violating these terms may lead to a temporary or permanent
-ban.
-
-### 3. Temporary Ban
-
-**Community Impact**: A serious violation of community standards, including
-sustained inappropriate behavior.
-
-**Consequence**: A temporary ban from any sort of interaction or public
-communication with the community for a specified period of time. No public or
-private interaction with the people involved, including unsolicited interaction
-with those enforcing the Code of Conduct, is allowed during this period.
-Violating these terms may lead to a permanent ban.
-
-### 4. Permanent Ban
-
-**Community Impact**: Demonstrating a pattern of violation of community
-standards, including sustained inappropriate behavior, harassment of an
-individual, or aggression toward or disparagement of classes of individuals.
-
-**Consequence**: A permanent ban from any sort of public interaction within the
-community.
-
-## Attribution
-
-This Code of Conduct is adapted from the [Contributor Covenant][homepage],
-version 2.1, available at
-[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
-
-Community Impact Guidelines were inspired by
-[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
-
-For answers to common questions about this code of conduct, see the FAQ at
-[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
-[https://www.contributor-covenant.org/translations][translations].
-
-[homepage]: https://www.contributor-covenant.org
-[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
-[Mozilla CoC]: https://github.com/mozilla/diversity
-[FAQ]: https://www.contributor-covenant.org/faq
-[translations]: https://www.contributor-covenant.org/translations
@@ -1,66 +0,0 @@
-# Development Guide
-This guide is for people working on OpenDevin and editing the source code.
-
-## Start the server for development
-
-### 1. Requirements
-* Linux, Mac OS, or [WSL on Windows](https://learn.microsoft.com/en-us/windows/wsl/install)
-* [Docker](https://docs.docker.com/engine/install/) (for those on macOS, make sure to allow the default Docker socket to be used from advanced settings!)
-* [Python](https://www.python.org/downloads/) >= 3.11
-* [NodeJS](https://nodejs.org/en/download/package-manager) >= 18.17.1
-* [Poetry](https://python-poetry.org/docs/#installing-with-the-official-installer) >= 1.8
-
-Make sure you have all these dependencies installed before moving on to `make build`.
-
-### 2. Build and Setup The Environment
-
- **Build the Project:** Begin by building the project, which includes setting up the environment and installing dependencies. This step ensures that OpenDevin is ready to run smoothly on your system.
-    ```bash
-    make build
-    ```
-
-### 3. Configuring the Language Model
-
-OpenDevin supports a diverse array of Language Models (LMs) through the powerful [litellm](https://docs.litellm.ai) library. By default, we've chosen the mighty GPT-4 from OpenAI as our go-to model, but the world is your oyster! You can unleash the potential of Anthropic's suave Claude, the enigmatic Llama, or any other LM that piques your interest.
-
-To configure the LM of your choice, follow these steps:
-
-1. **Using the Makefile: The Effortless Approach**
-   With a single command, you can have a smooth LM setup for your OpenDevin experience. Simply run:
-   ```bash
-   make setup-config
-   ```
-   This command will prompt you to enter the LLM API key and model name, ensuring that OpenDevin is tailored to your specific needs.
-
-**Note on Alternative Models:**
-Some alternative models may prove more challenging to tame than others. Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest. And if you've already mastered the art of wielding a model other than OpenAI's GPT, we encourage you to [share your setup instructions with us](https://github.com/OpenDevin/OpenDevin/issues/417).
-
-For a full list of the LM providers and models available, please consult the [litellm documentation](https://docs.litellm.ai/docs/providers).
-
-There is also [documentation for running with local models using ollama](./docs/documentation/LOCAL_LLM_GUIDE.md).
-
-### 4. Run the Application
-
- **Run the Application:** Once the setup is complete, launching OpenDevin is as simple as running a single command. This command starts both the backend and frontend servers seamlessly, allowing you to interact with OpenDevin without any hassle.
-    ```bash
-    make run
-    ```
-
-### 5. Individual Server Startup
-
- **Start the Backend Server:** If you prefer, you can start the backend server independently to focus on backend-related tasks or configurations.
-    ```bash
-    make start-backend
-    ```
-
- **Start the Frontend Server:** Similarly, you can start the frontend server on its own to work on frontend-related components or interface enhancements.
-    ```bash
-    make start-frontend
-    ```
-
-### 6. Help
-
- **Get Some Help:** Need assistance or information on available targets and commands? The help command provides all the necessary guidance to ensure a smooth experience with OpenDevin.
-    ```bash
-    make help
-    ```
@@ -1,25 +0,0 @@
-The MIT License (MIT)
-=====================
-
-Copyright © 2023
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation
-files (the “Software”), to deal in the Software without
-restriction, including without limitation the rights to use,
-copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the
-Software is furnished to do so, subject to the following
-conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
@@ -1,190 +0,0 @@
-# Makefile for OpenDevin project
-
-# Variables
-# Image fetched by the `pull-docker-image` target.
-DOCKER_IMAGE = ghcr.io/opendevin/sandbox
-# Port passed to uvicorn when the backend is started.
-BACKEND_PORT = 3000
-# Backend address exported to the frontend by `start-frontend`.
-# Note: the double quotes are part of the make value.
-BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
-# Port used when starting the frontend.
-FRONTEND_PORT = 3001
-# NOTE(review): not referenced in the visible part of this Makefile; presumably
-# used further down. Confirm before removing.
-DEFAULT_WORKSPACE_DIR = "./workspace"
-# Model offered as the default answer by the `setup-config-prompts` target.
-DEFAULT_MODEL = "gpt-3.5-turbo-1106"
-# File written by `setup-config` (via a `.tmp` sibling that is moved in place).
-CONFIG_FILE = config.toml
-# Config handed to `pre-commit install` by `install-precommit-hooks`.
-PRECOMMIT_CONFIG_PATH = "./dev_config/python/.pre-commit-config.yaml"
-
-# ANSI color codes
-# Escape sequences embedded in the `echo` messages of the recipes below.
-GREEN=\033[0;32m
-YELLOW=\033[0;33m
-RED=\033[0;31m
-BLUE=\033[0;34m
-RESET=\033[0m
-
-# Build
-# Full project build. Runs, in order: dependency checks, sandbox image pull,
-# Python dependency install, frontend dependency install, pre-commit hook
-# install, and the frontend build. Each step is a silent (-s) sub-make, so a
-# failing step stops the build at that line.
-build:
-	@echo "$(GREEN)Building project...$(RESET)"
-	@$(MAKE) -s check-dependencies
-	@$(MAKE) -s pull-docker-image
-	@$(MAKE) -s install-python-dependencies
-	@$(MAKE) -s install-frontend-dependencies
-	@$(MAKE) -s install-precommit-hooks
-	@$(MAKE) -s build-frontend
-	@echo "$(GREEN)Build completed successfully.$(RESET)"
-
-# Verify that the required tools are installed by running the individual
-# check-python, check-npm, check-docker and check-poetry targets in turn.
-check-dependencies:
-	@echo "$(YELLOW)Checking dependencies...$(RESET)"
-	@$(MAKE) -s check-python
-	@$(MAKE) -s check-npm
-	@$(MAKE) -s check-docker
-	@$(MAKE) -s check-poetry
-	@echo "$(GREEN)Dependencies checked successfully.$(RESET)"
-
-# Fail with an explanatory message unless `python3` is on PATH; otherwise
-# report the installed version.
-check-python:
-	@echo "$(YELLOW)Checking Python installation...$(RESET)"
-	@if ! command -v python3 > /dev/null; then \
-		echo "$(RED)Python 3 is not installed. Please install Python 3 to continue.$(RESET)"; \
-		exit 1; \
-	fi; \
-	echo "$(BLUE)$(shell python3 --version) is already installed.$(RESET)"
-
-# Fail with an explanatory message unless `npm` is on PATH; otherwise report
-# the installed version.
-check-npm:
-	@echo "$(YELLOW)Checking npm installation...$(RESET)"
-	@if ! command -v npm > /dev/null; then \
-		echo "$(RED)npm is not installed. Please install Node.js to continue.$(RESET)"; \
-		exit 1; \
-	fi; \
-	echo "$(BLUE)npm $(shell npm --version) is already installed.$(RESET)"
-
-# Fail with an explanatory message unless `docker` is on PATH; otherwise
-# report the installed version.
-check-docker:
-	@echo "$(YELLOW)Checking Docker installation...$(RESET)"
-	@if ! command -v docker > /dev/null; then \
-		echo "$(RED)Docker is not installed. Please install Docker to continue.$(RESET)"; \
-		exit 1; \
-	fi; \
-	echo "$(BLUE)$(shell docker --version) is already installed.$(RESET)"
-
-# Fail with install instructions unless `poetry` is on PATH; otherwise report
-# the installed version.
-check-poetry:
-	@echo "$(YELLOW)Checking Poetry installation...$(RESET)"
-	@if ! command -v poetry > /dev/null; then \
-		echo "$(RED)Poetry is not installed. You can install poetry by running the following command, then adding Poetry to your PATH:"; \
-		echo "$(RED) curl -sSL https://install.python-poetry.org | python3 -$(RESET)"; \
-		echo "$(RED)More detail here: https://python-poetry.org/docs/#installing-with-the-official-installer$(RESET)"; \
-		exit 1; \
-	fi; \
-	echo "$(BLUE)$(shell poetry --version) is already installed.$(RESET)"
-
-# Pull the image named by $(DOCKER_IMAGE) from its registry.
-pull-docker-image:
-	@echo "$(YELLOW)Pulling Docker image...$(RESET)"
-	@docker pull $(DOCKER_IMAGE)
-	@echo "$(GREEN)Docker image pulled successfully.$(RESET)"
-
-# Install the Python dependencies with Poetry, leaving out the `evaluation`
-# group. On macOS (uname = Darwin), chroma-hnswlib is installed first with
-# HNSWLIB_NO_NATIVE=1 exported.
-install-python-dependencies:
-	@echo "$(GREEN)Installing Python dependencies...$(RESET)"
-	@# The backticks are escaped: inside double quotes the shell would
-	@# otherwise try to execute `chroma-hnswlib` as a command substitution.
-	@if [ "$(shell uname)" = "Darwin" ]; then \
-		echo "$(BLUE)Installing \`chroma-hnswlib\`...$(RESET)"; \
-		export HNSWLIB_NO_NATIVE=1; \
-		poetry run pip install chroma-hnswlib; \
-	fi
-	@poetry install --without evaluation
-	@echo "$(GREEN)Python dependencies installed successfully.$(RESET)"
-
-# Prepare the frontend: run the Node.js version detection script, then
-# install the npm packages and run the `make-i18n` npm script.
-install-frontend-dependencies:
-	@echo "$(YELLOW)Setting up frontend environment...$(RESET)"
-	@echo "$(YELLOW)Detect Node.js version...$(RESET)"
-	@cd frontend && node ./scripts/detect-node-version.js
-	@set -e; \
-		cd frontend; \
-		echo "$(BLUE)Installing frontend dependencies with npm...$(RESET)"; \
-		npm install; \
-		echo "$(BLUE)Running make-i18n with npm...$(RESET)"; \
-		npm run make-i18n
-	@echo "$(GREEN)Frontend dependencies installed successfully.$(RESET)"
-
-# Install the pre-commit git hooks using the config at
-# $(PRECOMMIT_CONFIG_PATH).
-install-precommit-hooks:
-	@echo "$(YELLOW)Installing pre-commit hooks...$(RESET)"
-	@# Clear any core.hooksPath setting first; `|| true` keeps the target
-	@# going when the setting is not present.
-	@git config --unset-all core.hooksPath || true
-	@poetry run pre-commit install --config $(PRECOMMIT_CONFIG_PATH)
-	@echo "$(GREEN)Pre-commit hooks installed successfully.$(RESET)"
-
-build-frontend:
-	@echo "$(YELLOW)Building frontend...$(RESET)"
-	@cd frontend && npm run build
-
-# Start backend
-start-backend:
-	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT)
-
-# Start frontend
-start-frontend:
-	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && BACKEND_HOST=$(BACKEND_HOST) FRONTEND_PORT=$(FRONTEND_PORT) npm run start
-
-# Run the app
-run:
-	@echo "$(YELLOW)Running the app...$(RESET)"
-	@if [ "$(OS)" = "Windows_NT" ]; then \
-		echo "$(RED)`make run` is not supported on Windows. Please run `make start-frontend` and `make start-backend` separately.$(RESET)"; \
-		exit 1; \
-	fi
-	@mkdir -p logs
-	@echo "$(YELLOW)Starting backend server...$(RESET)"
-	@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) &
-	@echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
-	@until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
-	@echo "$(GREEN)Backend started successfully.$(RESET)"
-	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
-	@echo "$(GREEN)Application started successfully.$(RESET)"
-
-# Setup config.toml
-setup-config:
-	@echo "$(YELLOW)Setting up config.toml...$(RESET)"
-	@$(MAKE) setup-config-prompts
-	@mv $(CONFIG_FILE).tmp $(CONFIG_FILE)
-	@echo "$(GREEN)Config.toml setup completed.$(RESET)"
-
-setup-config-prompts:
-	@read -p "Enter your LLM Model name (see https://docs.litellm.ai/docs/providers for full list) [default: $(DEFAULT_MODEL)]: " llm_model; \
-	 llm_model=$${llm_model:-$(DEFAULT_MODEL)}; \
-	 echo "LLM_MODEL=\"$$llm_model\"" > $(CONFIG_FILE).tmp
-
-	@read -p "Enter your LLM API key: " llm_api_key; \
-	 echo "LLM_API_KEY=\"$$llm_api_key\"" >> $(CONFIG_FILE).tmp
-
-	@read -p "Enter your LLM Base URL [mostly used for local LLMs, leave blank if not needed - example: http://localhost:5001/v1/]: " llm_base_url; \
-	 if [[ ! -z "$$llm_base_url" ]]; then echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; fi
-
-	@echo "Enter your LLM Embedding Model\nChoices are openai, azureopenai, llama2 or leave blank to default to 'BAAI/bge-small-en-v1.5' via huggingface"; \
-	 read -p "> " llm_embedding_model; \
-	 	echo "LLM_EMBEDDING_MODEL=\"$$llm_embedding_model\"" >> $(CONFIG_FILE).tmp; \
-		if [ "$$llm_embedding_model" = "llama2" ]; then \
-			read -p "Enter the local model URL (will overwrite LLM_BASE_URL): " llm_base_url; \
-				echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
-		elif [ "$$llm_embedding_model" = "azureopenai" ]; then \
-			read -p "Enter the Azure endpoint URL (will overwrite LLM_BASE_URL): " llm_base_url; \
-				echo "LLM_BASE_URL=\"$$llm_base_url\"" >> $(CONFIG_FILE).tmp; \
-			read -p "Enter the Azure LLM Deployment Name: " llm_deployment_name; \
-				echo "LLM_DEPLOYMENT_NAME=\"$$llm_deployment_name\"" >> $(CONFIG_FILE).tmp; \
-			read -p "Enter the Azure API Version: " llm_api_version; \
-				echo "LLM_API_VERSION=\"$$llm_api_version\"" >> $(CONFIG_FILE).tmp; \
-		fi
-
-	@read -p "Enter your workspace directory [default: $(DEFAULT_WORKSPACE_DIR)]: " workspace_dir; \
-	 workspace_dir=$${workspace_dir:-$(DEFAULT_WORKSPACE_DIR)}; \
-	 echo "WORKSPACE_BASE=\"$$workspace_dir\"" >> $(CONFIG_FILE).tmp
-
-# Help
-help:
-	@echo "$(BLUE)Usage: make [target]$(RESET)"
-	@echo "Targets:"
-	@echo "  $(GREEN)build$(RESET)               - Build project, including environment setup and dependencies."
-	@echo "  $(GREEN)setup-config$(RESET)        - Setup the configuration for OpenDevin by providing LLM API key,"
-	@echo "                        LLM Model name, and workspace directory."
-	@echo "  $(GREEN)start-backend$(RESET)       - Start the backend server for the OpenDevin project."
-	@echo "  $(GREEN)start-frontend$(RESET)      - Start the frontend server for the OpenDevin project."
-	@echo "  $(GREEN)run$(RESET)                 - Run the OpenDevin application, starting both backend and frontend servers."
-	@echo "                        Backend Log file will be stored in the 'logs' directory."
-	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."
-
-# Phony targets
-.PHONY: build check-dependencies check-python check-npm check-docker check-poetry pull-docker-image install-python-dependencies install-frontend-dependencies install-precommit-hooks start-backend start-frontend run setup-config setup-config-prompts help
@@ -1,247 +0,0 @@
-<a name="readme-top"></a>
-<!--
-*** Thanks for checking out the Best-README-Template. If you have a suggestion
-*** that would make this better, please fork the repo and create a pull request
-*** or simply open an issue with the tag "enhancement".
-*** Don't forget to give the project a star!
-*** Thanks again! Now go create something AMAZING! :D
-->
-
-
-
-<!-- PROJECT SHIELDS -->
-<!--
-*** I'm using markdown "reference style" links for readability.
-*** Reference links are enclosed in brackets [ ] instead of parentheses ( ).
-*** See the bottom of this document for the declaration of the reference variables
-*** for contributors-url, forks-url, etc. This is an optional, concise syntax you may use.
-*** https://www.markdownguide.org/basic-syntax/#reference-style-links
-->
-
-<div align="center">
-  <a href="https://github.com/OpenDevin/OpenDevin/graphs/contributors"><img src="https://img.shields.io/github/contributors/opendevin/opendevin?style=for-the-badge" alt="Contributors"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/network/members"><img src="https://img.shields.io/github/forks/opendevin/opendevin?style=for-the-badge" alt="Forks"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/stargazers"><img src="https://img.shields.io/github/stars/opendevin/opendevin?style=for-the-badge" alt="Stargazers"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/issues"><img src="https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge" alt="Issues"></a>
-  <a href="https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE"><img src="https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge" alt="MIT License"></a>
-  <br>
-  <a href="https://join.slack.com/t/opendevin/shared_invite/zt-2etftj1dd-X1fDL2PYIVpsmJZkqEYANw"><img src="https://img.shields.io/badge/Slack-Join%20Us-red?logo=slack&logoColor=white&style=for-the-badge" alt="Join our Slack community"></a>
-  <a href="https://discord.gg/mBuDGRzzES"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
-</div>
-
-<!-- PROJECT LOGO -->
-<div align="center">
-  <img src="./logo.png" alt="Logo" width="200" height="200">
-  <h1 align="center">OpenDevin: Code Less, Make More</h1>
-</div>
-
-
-
-
-<!-- TABLE OF CONTENTS -->
-<details>
-  <summary>🗂️ Table of Contents</summary>
-  <ol>
-    <li><a href="#-mission">🎯 Mission</a></li>
-    <li><a href="#-what-is-devin">🤔 What is Devin?</a></li>
-    <li><a href="#-why-opendevin">🐚 Why OpenDevin?</a></li>
-    <li><a href="#-project-status">🚧 Project Status</a></li>
-    <li><a href="#-get-started">🚀 Get Started</a>
-      <ul>
-        <li><a href="#1-requirements">1. Requirements</a></li>
-        <li><a href="#2-build-and-setup">2. Build and Setup</a></li>
-        <li><a href="#3-run-the-application">3. Run the Application</a></li>
-        <li><a href="#4-individual-server-startup">4. Individual Server Startup</a></li>
-        <li><a href="#5-help">5. Help</a></li>
-      </ul>
-    </li>
-    <li><a href="#%EF%B8%8F-research-strategy">⭐️ Research Strategy</a></li>
-    <li><a href="#-how-to-contribute">🤝 How to Contribute</a></li>
-    <li><a href="#-join-our-community">🤖 Join Our Community</a></li>
-    <li><a href="#%EF%B8%8F-built-with">🛠️ Built With</a></li>
-    <li><a href="#-license">📜 License</a></li>
-  </ol>
-</details>
-
-## 🎯 Mission
-
-[Project Demo Video](https://github.com/OpenDevin/OpenDevin/assets/38853559/71a472cc-df34-430c-8b1d-4d7286c807c9)
-
-
-Welcome to OpenDevin, an open-source project aiming to replicate Devin, an autonomous AI software engineer who is capable of executing complex engineering tasks and collaborating actively with users on software development projects. This project aspires to replicate, enhance, and innovate upon Devin through the power of the open-source community.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 🤔 What is Devin?
-Devin represents a cutting-edge autonomous agent designed to navigate the complexities of software engineering. It leverages a combination of tools such as a shell, code editor, and web browser, showcasing the untapped potential of LLMs in software development. Our goal is to explore and expand upon Devin's capabilities, identifying both its strengths and areas for improvement, to guide the progress of open code models.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 🐚 Why OpenDevin?
-The OpenDevin project is born out of a desire to replicate, enhance, and innovate beyond the original Devin model. By engaging the open-source community, we aim to tackle the challenges faced by Code LLMs in practical scenarios, producing works that significantly contribute to the community and pave the way for future advancements.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 🚧 Project Status
-
-OpenDevin is currently a work in progress, but you can already run the alpha version to see the end-to-end system in action. The project team is actively working on the following key milestones:
-
- **UI**: Developing a user-friendly interface, including a chat interface, a shell demonstrating commands, and a web browser.
- **Architecture**: Building a stable agent framework with a robust backend that can read, write, and run simple commands.
- **Agent Capabilities**: Enhancing the agent's abilities to generate bash scripts, run tests, and perform other software engineering tasks.
- **Evaluation**: Establishing a minimal evaluation pipeline that is consistent with Devin's evaluation criteria.
-
-After completing the MVP, the team will focus on research in various areas, including foundation models, specialist capabilities, evaluation, and agent studies.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## ⚠️ Caveats and Warnings
-* OpenDevin is still an alpha project. It is changing very quickly and is unstable. We are working on getting a stable release out in the coming weeks.
-* OpenDevin will issue many prompts to the LLM you configure. Most of these LLMs cost money--be sure to set spending limits and monitor usage.
-* OpenDevin runs `bash` commands within a Docker sandbox, so it should not affect your machine. But your workspace directory will be attached to that sandbox, and files in the directory may be modified or deleted.
-* Our default Agent is currently the MonologueAgent, which has limited capabilities, but is fairly stable. We're working on other Agent implementations, including [SWE Agent](https://swe-agent.com/). You can [read about our current set of agents here](./docs/documentation/Agents.md).
-
-## 🚀 Get Started
-
-Getting started with the OpenDevin project is incredibly easy. Follow these simple steps to set up and run OpenDevin on your system:
-
-The easiest way to run OpenDevin is inside a Docker container.
-You can run:
-```bash
-# Your OpenAI API key, or any other LLM API key
-export LLM_API_KEY="sk-..."
-
-# The directory you want OpenDevin to modify. MUST be an absolute path!
-export WORKSPACE_DIR=$(pwd)/workspace
-
-docker run \
-    -e LLM_API_KEY \
-    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_DIR \
-    -v $WORKSPACE_DIR:/opt/workspace_base \
-    -v /var/run/docker.sock:/var/run/docker.sock \
-    -p 3000:3000 \
-    ghcr.io/opendevin/opendevin:latest
-```
-Replace `$(pwd)/workspace` with the path to the code you want OpenDevin to work with.
-
-You can find OpenDevin running at `http://localhost:3000`.
-
-See [Development.md](Development.md) for instructions on running OpenDevin without Docker.
-
-## 🤖 LLM Backends
-OpenDevin can work with any LLM backend.
-For a full list of the LLM providers and models available, please consult the
-[litellm documentation](https://docs.litellm.ai/docs/providers).
-
-The `LLM_MODEL` environment variable controls which model is used in programmatic interactions,
-but choosing a model in the OpenDevin UI will override this setting.
-
-The following environment variables might be necessary for some LLMs:
-* `LLM_API_KEY`
-* `LLM_BASE_URL`
-* `LLM_EMBEDDING_MODEL`
-* `LLM_DEPLOYMENT_NAME`
-* `LLM_API_VERSION`
-
-**Note on Alternative Models:**
-Some alternative models may prove more challenging to tame than others.
-Fear not, brave adventurer! We shall soon unveil LLM-specific documentation to guide you on your quest.
-And if you've already mastered the art of wielding a model other than OpenAI's GPT,
-we encourage you to [share your setup instructions with us](https://github.com/OpenDevin/OpenDevin/issues/417).
-
-There is also [documentation for running with local models using ollama](./docs/documentation/LOCAL_LLM_GUIDE.md).
-
-## ⭐️ Research Strategy
-
-Achieving full replication of production-grade applications with LLMs is a complex endeavor. Our strategy involves:
-
-1. **Core Technical Research:** Focusing on foundational research to understand and improve the technical aspects of code generation and handling.
-2. **Specialist Abilities:** Enhancing the effectiveness of core components through data curation, training methods, and more.
-3. **Task Planning:** Developing capabilities for bug detection, codebase management, and optimization.
-4. **Evaluation:** Establishing comprehensive evaluation metrics to better understand and improve our models.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 🤝 How to Contribute
-
-OpenDevin is a community-driven project, and we welcome contributions from everyone. Whether you're a developer, a researcher, or simply enthusiastic about advancing the field of software engineering with AI, there are many ways to get involved:
-
- **Code Contributions:** Help us develop the core functionalities, frontend interface, or sandboxing solutions.
- **Research and Evaluation:** Contribute to our understanding of LLMs in software engineering, participate in evaluating the models, or suggest improvements.
- **Feedback and Testing:** Use the OpenDevin toolset, report bugs, suggest features, or provide feedback on usability.
-
-For details, please check [this document](./CONTRIBUTING.md).
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 🤖 Join Our Community
-
-We now have both a Slack workspace for collaborating on building OpenDevin and a Discord server for discussing anything related, e.g., this project, LLMs, agents, etc.
-
-* [Slack workspace](https://join.slack.com/t/opendevin/shared_invite/zt-2etftj1dd-X1fDL2PYIVpsmJZkqEYANw)
-* [Discord server](https://discord.gg/mBuDGRzzES)
-
-If you would love to contribute, feel free to join our community (note that now there is no need to fill in the [form](https://forms.gle/758d5p6Ve8r2nxxq6)). Let's simplify software engineering together!
-
-🐚 **Code less, make more with OpenDevin.**
-
-[![Star History Chart](https://api.star-history.com/svg?repos=OpenDevin/OpenDevin&type=Date)](https://star-history.com/#OpenDevin/OpenDevin&Date)
-
-## 🛠️ Built With
-
-OpenDevin is built using a combination of powerful frameworks and libraries, providing a robust foundation for its development. Here are the key technologies used in the project:
-
-![FastAPI](https://img.shields.io/badge/FastAPI-black?style=for-the-badge) ![uvicorn](https://img.shields.io/badge/uvicorn-black?style=for-the-badge) ![LiteLLM](https://img.shields.io/badge/LiteLLM-black?style=for-the-badge) ![Docker](https://img.shields.io/badge/Docker-black?style=for-the-badge) ![Ruff](https://img.shields.io/badge/Ruff-black?style=for-the-badge) ![MyPy](https://img.shields.io/badge/MyPy-black?style=for-the-badge) ![LlamaIndex](https://img.shields.io/badge/LlamaIndex-black?style=for-the-badge) ![React](https://img.shields.io/badge/React-black?style=for-the-badge)
-
-Please note that the selection of these technologies is in progress, and additional technologies may be added or existing ones may be removed as the project evolves. We strive to adopt the most suitable and efficient tools to enhance the capabilities of OpenDevin.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-## 📜 License
-
-Distributed under the MIT License. See [`LICENSE`](./LICENSE) for more information.
-
-<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
-    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
-        ↑ Back to Top ↑
-    </a>
-</p>
-
-[contributors-shield]: https://img.shields.io/github/contributors/opendevin/opendevin?style=for-the-badge
-[contributors-url]: https://github.com/OpenDevin/OpenDevin/graphs/contributors
-[forks-shield]: https://img.shields.io/github/forks/opendevin/opendevin?style=for-the-badge
-[forks-url]: https://github.com/OpenDevin/OpenDevin/network/members
-[stars-shield]: https://img.shields.io/github/stars/opendevin/opendevin?style=for-the-badge
-[stars-url]: https://github.com/OpenDevin/OpenDevin/stargazers
-[issues-shield]: https://img.shields.io/github/issues/opendevin/opendevin?style=for-the-badge
-[issues-url]: https://github.com/OpenDevin/OpenDevin/issues
-[license-shield]: https://img.shields.io/github/license/opendevin/opendevin?style=for-the-badge
-[license-url]: https://github.com/OpenDevin/OpenDevin/blob/main/LICENSE
@@ -0,0 +1,37 @@
+import os
+
+# Distribution name passed to the installed-package metadata lookups in get_version().
+__package_name__ = 'openhands_ai'
+
+
+def get_version():
+    # Try getting the version from pyproject.toml
+    try:
+        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        with open(os.path.join(root_dir, 'pyproject.toml'), 'r') as f:
+            for line in f:
+                if line.startswith('version ='):
+                    return line.split('=')[1].strip().strip('"')
+    except FileNotFoundError:
+        pass
+
+    try:
+        from importlib.metadata import PackageNotFoundError, version
+
+        return version(__package_name__)
+    except (ImportError, PackageNotFoundError):
+        pass
+
+    try:
+        from pkg_resources import DistributionNotFound, get_distribution
+
+        return get_distribution(__package_name__).version
+    except (ImportError, DistributionNotFound):
+        pass
+
+    return 'unknown'
+
+
+# Resolve the version once at import time; any exception raised by
+# get_version() falls back to 'unknown'.
+try:
+    __version__ = get_version()
+except Exception:
+    __version__ = 'unknown'
@@ -1,73 +0,0 @@
-# Agent Framework Research
-
-In this folder, there may exist multiple implementations of `Agent` that will be used by the framework.
-
-For example, `agenthub/monologue_agent`, `agenthub/metagpt_agent`, `agenthub/codeact_agent`, etc.
-Contributors from different backgrounds and interests can choose to contribute to any (or all!) of these directions.
-
-## Constructing an Agent
-
-The abstraction for an agent can be found [here](../opendevin/agent.py).
-
-Agents are run inside of a loop. At each iteration, `agent.step()` is called with a
-[State](../opendevin/state.py) input, and the agent must output an [Action](../opendevin/action).
-
-Every agent also has a `self.llm` which it can use to interact with the LLM configured by the user.
-See the [LiteLLM docs for `self.llm.completion`](https://docs.litellm.ai/docs/completion).
-
-## State
-The `state` contains:
-* A history of actions taken by the agent, as well as any observations (e.g. file content, command output) from those actions
-* A list of actions/observations that have happened since the most recent step
-* A [`plan`](https://github.com/OpenDevin/OpenDevin/blob/main/opendevin/plan.py), which contains the main goal
-  * The agent can add and modify subtasks through the `AddTaskAction` and `ModifyTaskAction`
-
-## Actions
-Here is a list of available Actions, which can be returned by `agent.step()`:
- [`CmdRunAction`](../opendevin/action/bash.py) - Runs a command inside a sandboxed terminal
- [`CmdKillAction`](../opendevin/action/bash.py) - Kills a background command
- [`FileReadAction`](../opendevin/action/fileop.py) - Reads the content of a file
- [`FileWriteAction`](../opendevin/action/fileop.py) - Writes new content to a file
- [`BrowseURLAction`](../opendevin/action/browse.py) - Gets the content of a URL
- [`AgentRecallAction`](../opendevin/action/agent.py) - Searches memory (e.g. a vector database)
- [`AddTaskAction`](../opendevin/action/tasks.py) - Adds a subtask to the plan
- [`ModifyTaskAction`](../opendevin/action/tasks.py) - Changes the state of a subtask
- [`AgentThinkAction`](../opendevin/action/agent.py) - A no-op that allows the agent to add plaintext to the history (as well as the chat log)
- [`AgentFinishAction`](../opendevin/action/agent.py) - Stops the control loop, allowing the user to enter a new task
-
-You can use `action.to_dict()` and `action_from_dict` to serialize and deserialize actions.
-
-## Observations
-There are also several types of Observations. These are typically available in the step following the corresponding Action.
-But they may also appear as a result of asynchronous events (e.g. a message from the user, logs from a command running
-in the background).
-
-Here is a list of available Observations:
- [`CmdOutputObservation`](../opendevin/observation/run.py)
- [`BrowserOutputObservation`](../opendevin/observation/browse.py)
- [`FileReadObservation`](../opendevin/observation/files.py)
- [`FileWriteObservation`](../opendevin/observation/files.py)
- [`UserMessageObservation`](../opendevin/observation/)
- [`AgentRecallObservation`](../opendevin/observation/recall.py)
- [`AgentErrorObservation`](../opendevin/observation/error.py)
-
-You can use `observation.to_dict()` and `observation_from_dict` to serialize and deserialize observations.
-
-## Interface
-Every agent must implement the following methods:
-
-### `step`
-```
-def step(self, state: "State") -> "Action"
-```
-`step` moves the agent forward one step towards its goal. This probably means
-sending a prompt to the LLM, then parsing the response into an `Action`.
-
-### `search_memory`
-```
-def search_memory(self, query: str) -> List[str]:
-```
-`search_memory` should return a list of events that match the query. This will be used
-for the `recall` action.
-
-You can optionally just return `[]` for this method, meaning the agent has no long-term memory.
@@ -1,9 +1,39 @@
 from dotenv import load_dotenv
+
+from openhands.agenthub.micro.agent import MicroAgent
+from openhands.agenthub.micro.registry import all_microagents
+from openhands.controller.agent import Agent
+
 load_dotenv()

-# Import agents after environment variables are loaded
-from . import monologue_agent  # noqa: E402
-from . import codeact_agent  # noqa: E402
-from . import planner_agent  # noqa: E402

-__all__ = ['monologue_agent', 'codeact_agent', 'planner_agent']
+from openhands.agenthub import (  # noqa: E402
+    browsing_agent,
+    codeact_agent,
+    delegator_agent,
+    dummy_agent,
+    planner_agent,
+)
+
+__all__ = [
+    'codeact_agent',
+    'planner_agent',
+    'delegator_agent',
+    'dummy_agent',
+    'browsing_agent',
+]
+
+# Register one generated MicroAgent subclass per entry in `all_microagents`.
+# NOTE: `agent`, `name`, `prompt` and `anon_class` are module-level names and
+# stay bound (to the last entry) after the loop finishes.
+for agent in all_microagents.values():
+    name = agent['name']
+    prompt = agent['prompt']
+
+    # Create a subclass of MicroAgent named after the entry, carrying its
+    # prompt and the whole entry as class attributes.
+    anon_class = type(
+        name,
+        (MicroAgent,),
+        {
+            'prompt': prompt,
+            'agent_definition': agent,
+        },
+    )
+
+    # Register the generated class with Agent under the entry's name.
+    Agent.register(name, anon_class)
@@ -0,0 +1,4 @@
+from openhands.agenthub.browsing_agent.browsing_agent import BrowsingAgent
+from openhands.controller.agent import Agent
+
+Agent.register('BrowsingAgent', BrowsingAgent)
@@ -0,0 +1,223 @@
+import os
+
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.utils.obs import flatten_axtree_to_str
+
+from openhands.agenthub.browsing_agent.response_parser import BrowsingResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.events.action import (
+    Action,
+    AgentFinishAction,
+    BrowseInteractiveAction,
+    MessageAction,
+)
+from openhands.events.event import EventSource
+from openhands.events.observation import BrowserOutputObservation
+from openhands.events.observation.observation import Observation
+from openhands.llm.llm import LLM
+from openhands.runtime.plugins import (
+    PluginRequirement,
+)
+
+USE_NAV = (
+    os.environ.get('USE_NAV', 'true') == 'true'
+)  # only disable NAV actions when running webarena and miniwob benchmarks
+USE_CONCISE_ANSWER = (
+    os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
+)  # only return concise answer when running webarena and miniwob benchmarks
+
+if not USE_NAV and USE_CONCISE_ANSWER:
+    EVAL_MODE = True  # disabled NAV actions and only return concise answer, for webarena and miniwob benchmarks\
+else:
+    EVAL_MODE = False
+
+
+def get_error_prefix(last_browser_action: str) -> str:
+    return f'IMPORTANT! Last action is incorrect:\n{last_browser_action}\nThink again with the current observation of the page.\n'
+
+
+def get_system_message(goal: str, action_space: str) -> str:
+    return f"""\
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+# Goal:
+{goal}
+
+# Action Space
+{action_space}
+"""
+
+
+# Extra worked example that get_prompt() appends to the user prompt when
+# USE_CONCISE_ANSWER is set.
+CONCISE_INSTRUCTION = """\
+
+Here is another example with chain of thought of a valid action when providing a concise answer to user:
+"
+In order to accomplish my goal I need to send the information asked back to the user. This page list the information of HP Inkjet Fax Machine, which is the product identified in the objective. Its price is $279.49. I will send a message back to user with the answer.
+```send_msg_to_user("$279.49")```
+"
+"""
+
+
+def get_prompt(
+    error_prefix: str, cur_url: str, cur_axtree_txt: str, prev_action_str: str
+) -> str:
+    prompt = f"""\
+{error_prefix}
+
+# Current Page URL:
+{cur_url}
+
+# Current Accessibility Tree:
+{cur_axtree_txt}
+
+# Previous Actions
+{prev_action_str}
+
+Here is an example with chain of thought of a valid action when clicking on a button:
+"
+In order to accomplish my goal I need to click on the button with bid 12
+```click("12")```
+"
+""".strip()
+    if USE_CONCISE_ANSWER:
+        prompt += CONCISE_INSTRUCTION
+    return prompt
+
+
+class BrowsingAgent(Agent):
+    VERSION = '1.0'
+    """
+    An agent that interacts with the browser.
+    """
+
+    sandbox_plugins: list[PluginRequirement] = []
+    response_parser = BrowsingResponseParser()
+
+    def __init__(
+        self,
+        llm: LLM,
+        config: AgentConfig,
+    ) -> None:
+        """Initializes a new instance of the BrowsingAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm, config)
+        # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+        # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+        action_subsets = ['chat', 'bid']
+        if USE_NAV:
+            action_subsets.append('nav')
+        self.action_space = HighLevelActionSet(
+            subsets=action_subsets,
+            strict=False,  # less strict on the parsing of the actions
+            multiaction=True,  # enable to agent to take multiple actions at once
+        )
+
+        self.reset()
+
+    def reset(self) -> None:
+        """Resets the Browsing Agent."""
+        super().reset()
+        self.cost_accumulator = 0
+        self.error_accumulator = 0
+
+    def step(self, state: State) -> Action:
+        """Performs one step using the Browsing Agent.
+        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        messages: list[Message] = []
+        prev_actions = []
+        cur_url = ''
+        cur_axtree_txt = ''
+        error_prefix = ''
+        last_obs = None
+        last_action = None
+
+        if EVAL_MODE and len(state.history) == 1:
+            # for webarena and miniwob++ eval, we need to retrieve the initial observation already in browser env
+            # initialize and retrieve the first observation by issuing an noop OP
+            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to desired websites
+            return BrowseInteractiveAction(browser_actions='noop()')
+
+        for event in state.history:
+            if isinstance(event, BrowseInteractiveAction):
+                prev_actions.append(event.browser_actions)
+                last_action = event
+            elif isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                # agent has responded, task finished.
+                return AgentFinishAction(outputs={'content': event.content})
+            elif isinstance(event, Observation):
+                last_obs = event
+
+        if EVAL_MODE:
+            prev_actions = prev_actions[1:]  # remove the first noop action
+
+        prev_action_str = '\n'.join(prev_actions)
+        # if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
+        # we should also send a message back to the user in OpenHands and call it a day
+        if (
+            isinstance(last_action, BrowseInteractiveAction)
+            and last_action.browsergym_send_msg_to_user
+        ):
+            return MessageAction(last_action.browsergym_send_msg_to_user)
+
+        if isinstance(last_obs, BrowserOutputObservation):
+            if last_obs.error:
+                # add error recovery prompt prefix
+                error_prefix = get_error_prefix(last_obs.last_browser_action)
+                self.error_accumulator += 1
+                if self.error_accumulator > 5:
+                    return MessageAction('Too many errors encountered. Task failed.')
+
+            cur_url = last_obs.url
+
+            try:
+                cur_axtree_txt = flatten_axtree_to_str(
+                    last_obs.axtree_object,
+                    extra_properties=last_obs.extra_element_properties,
+                    with_clickable=True,
+                    filter_visible_only=True,
+                )
+            except Exception as e:
+                logger.error(
+                    'Error when trying to process the accessibility tree: %s', e
+                )
+                return MessageAction('Error encountered when browsing.')
+
+        goal, _ = state.get_current_user_intent()
+
+        if goal is None:
+            goal = state.inputs['task']
+
+        system_msg = get_system_message(
+            goal,
+            self.action_space.describe(with_long_description=False, with_examples=True),
+        )
+
+        messages.append(Message(role='system', content=[TextContent(text=system_msg)]))
+
+        prompt = get_prompt(error_prefix, cur_url, cur_axtree_txt, prev_action_str)
+        messages.append(Message(role='user', content=[TextContent(text=prompt)]))
+
+        response = self.llm.completion(
+            messages=self.llm.format_messages_for_llm(messages),
+            stop=[')```', ')\n```'],
+        )
+        return self.response_parser.parse(response)
@@ -0,0 +1,123 @@
+import ast
+import re
+
+from openhands.controller.action_parser import ActionParser, ResponseParser
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action import (
+    Action,
+    BrowseInteractiveAction,
+)
+
+
+class BrowsingResponseParser(ResponseParser):
+    """Turns a raw LLM completion into a browsing `Action`.
+
+    Parsers listed in `action_parsers` are consulted in order; the first one whose
+    `check_condition` accepts the action string handles it, and `default_parser`
+    takes everything else.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Order matters: earlier entries take precedence over later ones
+        self.action_parsers = [BrowsingActionParserMessage()]
+        self.default_parser = BrowsingActionParserBrowseInteractive()
+
+    def parse(self, response: str) -> Action:
+        """Run both stages: extract the action string, then build the Action."""
+        return self.parse_action(self.parse_response(response))
+
+    def parse_response(self, response) -> str:
+        """Return the first choice's message content, stripped and fence-terminated.
+
+        `None` content yields ''. Non-empty content that does not already end
+        with '```' gets ')```' appended, or just '```' when it already ends with ')'.
+        """
+        content = response['choices'][0]['message']['content']
+        if content is None:
+            return ''
+        closed = content.strip()
+        if closed and not closed.endswith('```'):
+            # A trailing ')' already closes the call; adding another would produce
+            # a duplicate parenthesis, e.g. send_msg_to_user('Done'))
+            closed += '```' if closed.endswith(')') else ')```'
+        logger.debug(closed)
+        return closed
+
+    def parse_action(self, action_str: str) -> Action:
+        """Dispatch to the first parser that accepts `action_str`, else the default."""
+        chosen = next(
+            (
+                candidate
+                for candidate in self.action_parsers
+                if candidate.check_condition(action_str)
+            ),
+            self.default_parser,
+        )
+        return chosen.parse(action_str)
+
+
+class BrowsingActionParserMessage(ActionParser):
+    """Handles replies that contain no '```' fence at all.
+
+    Such a reply is not in the expected action format, so the whole text is
+    relayed to the user through a BrowseInteractiveAction that wraps a
+    send_msg_to_user(...) call.
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # Applies only when no code fence is present anywhere in the string
+        return action_str.find('```') == -1
+
+    def parse(self, action_str: str) -> Action:
+        # The raw reply is used three times: as the message argument of the
+        # browser call, as the thought, and as the user-facing message.
+        return BrowseInteractiveAction(
+            browser_actions=f'send_msg_to_user("""{action_str}""")',
+            thought=action_str,
+            browsergym_send_msg_to_user=action_str,
+        )
+
+
+class BrowsingActionParserBrowseInteractive(ActionParser):
+    """Default browsing parser: splits an LLM reply into a thought and browser actions.
+
+    Builds a BrowseInteractiveAction and, when the actions contain a
+    send_msg_to_user(...) call, also extracts the message text from it.
+    """
+
+    def __init__(
+        self,
+    ):
+        pass
+
+    def check_condition(self, action_str: str) -> bool:
+        # Fallback parser: accepts every action string
+        return True
+
+    def parse(self, action_str: str) -> Action:
+        """Parse `action_str` into browser actions, a thought and an optional user message.
+
+        The string is split on '```'. If the text after the first fence is non-empty
+        it is the browser actions and the text before it is the thought; otherwise
+        the text before the fence is the browser actions and the thought is empty.
+        A string without any fence is handled like one with an empty fenced part.
+        """
+        # parse the action string into browser_actions and thought
+        # the LLM can return only one string, or both
+
+        # when both are returned, it looks like this:
+        ### Based on the current state of the page and the goal of finding out the president of the USA, the next action should involve searching for information related to the president.
+        ### To achieve this, we can navigate to a reliable source such as a search engine or a specific website that provides information about the current president of the USA.
+        ### Here is an example of a valid action to achieve this:
+        ### ```
+        ### goto('https://www.whitehouse.gov/about-the-white-house/presidents/'
+        # in practice, BrowsingResponseParser.parse_response also added )``` to the end of the string
+
+        # when the LLM returns only one string, it looks like this:
+        ### goto('https://www.whitehouse.gov/about-the-white-house/presidents/')
+        # and parse_response added )``` to the end of the string
+        parts = action_str.split('```')
+        before_fence = parts[0].strip()
+        # str.split always yields at least one element; guard the second one so a
+        # fence-less string does not raise IndexError
+        fenced = parts[1].strip() if len(parts) > 1 else ''
+        browser_actions = fenced if fenced != '' else before_fence
+        thought = before_fence if fenced != '' else ''
+
+        # if the LLM wants to talk to the user, we extract the message
+        msg_content = ''
+        for sub_action in browser_actions.split('\n'):
+            if 'send_msg_to_user(' in sub_action:
+                try:
+                    tree = ast.parse(sub_action)
+                    args = tree.body[0].value.args  # type: ignore
+                    msg_content = args[0].value
+                except (SyntaxError, ValueError, AttributeError, IndexError):
+                    # SyntaxError/ValueError: the line is not parseable Python.
+                    # AttributeError/IndexError: it parsed, but is not a call with a
+                    # literal first argument (e.g. no arguments, or a variable name).
+                    logger.error(f'Error parsing action: {sub_action}')
+                    # we can still try to get the message with a regex, which needs a
+                    # quoted argument followed by ')',
+                    # e.g. send_msg_to_user('Done')) or a message with an unescaped quote
+                    match = re.search(r'send_msg_to_user\((["\'])(.*?)\1\)', sub_action)
+                    msg_content = match.group(2) if match else ''
+
+        return BrowseInteractiveAction(
+            browser_actions=browser_actions,
+            thought=thought,
+            browsergym_send_msg_to_user=msg_content,
+        )
@@ -0,0 +1,158 @@
+import collections
+import re
+from warnings import warn
+
+import yaml
+
+
+def yaml_parser(message):
+    """Parse a yaml message for the retry function.
+
+    Returns a `(value, valid, retry_message)` tuple: the loaded value with
+    `valid=True` and an empty retry message on success, or `{}` with
+    `valid=False` and a retry instruction when the YAML cannot be loaded.
+    """
+    # saves gpt-3.5 from some yaml parsing errors: a ':' followed by a line break
+    # and then a non-indented line (or another line break) is joined into ': '
+    message = re.sub(r':\s*\n(?=\S|\n)', ': ', message)
+
+    try:
+        parsed = yaml.safe_load(message)
+    except yaml.YAMLError as e:
+        warn(str(e), stacklevel=2)
+        return (
+            {},
+            False,
+            "Your response is not a valid yaml. Please try again and be careful to the format. Don't add any apology or comment, just the answer.",
+        )
+    return parsed, True, ''
+
+
+def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):
+    """Replace repeated chunks of `text` with short identifiers.
+
+    `text` is split on `split_regex` and each chunk is stripped. A chunk that
+    occurs more than once, is longer than 10 characters and is not in
+    `skip_list` is assigned the name `{identifier}-{n}` (numbered in order of
+    first appearance). Returns `(definitions, compressed_text)`, where the
+    compressed text is the chunks joined by single newlines with every defined
+    chunk replaced by its name.
+    """
+    chunks = [piece.strip() for piece in re.split(split_regex, text)]
+    occurrences = collections.Counter(chunks)
+
+    # Counter keeps first-seen order, so numbering follows the text
+    repeated = [
+        chunk
+        for chunk, seen in occurrences.items()
+        if seen > 1 and chunk not in skip_list and len(chunk) > 10
+    ]
+    def_dict = {
+        f'{identifier}-{index}': chunk for index, chunk in enumerate(repeated)
+    }
+
+    # Substitute each repeated chunk by its name in the re-joined text
+    compressed_text = '\n'.join(chunks)
+    for name, chunk in def_dict.items():
+        compressed_text = compressed_text.replace(chunk, name)
+
+    return def_dict, compressed_text
+
+
+def compress_string(text):
+    """Compress a string by replacing redundant paragraphs and lines with identifiers.
+
+    The result is a `<definitions>` section listing every identifier with the
+    text it stands for, followed by the compressed text.
+    """
+    # Pass 1: paragraphs (chunks separated by blank lines) get '§' identifiers
+    paragraph_defs, after_paragraphs = _compress_chunks(
+        text, identifier='§', skip_list=[], split_regex='\n\n+'
+    )
+
+    # Pass 2: single lines get '¶' identifiers; paragraph identifiers are skipped
+    line_defs, compressed_text = _compress_chunks(
+        after_paragraphs, '¶', list(paragraph_defs.keys()), split_regex='\n+'
+    )
+    def_dict = {**paragraph_defs, **line_defs}
+
+    # Render the definitions section ahead of the compressed body
+    definitions = '\n'.join(
+        [
+            '<definitions>',
+            *(f'{key}:\n{value}' for key, value in def_dict.items()),
+            '</definitions>',
+        ]
+    )
+
+    return definitions + '\n' + compressed_text
+
+
+def extract_html_tags(text, keys):
+    """Extract the content within HTML tags for a list of keys.
+
+    Parameters
+    ----------
+    text : str
+        The input string containing the HTML tags.
+    keys : list of str
+        The HTML tags to extract the content from.
+
+    Returns:
+    -------
+    dict
+        A dictionary mapping each key to a list of subset in `text` that match the key.
+        Keys without any match are absent from the dictionary.
+
+    Notes:
+    -----
+    Matching is case-sensitive: neither `text` nor `keys` is lower-cased.
+    Tag names are matched literally and the content may span several lines.
+
+    """
+    content_dict = {}
+    for key in keys:
+        # escape the tag name so regex metacharacters in it are taken literally
+        tag = re.escape(key)
+        pattern = f'<{tag}>(.*?)</{tag}>'
+        matches = re.findall(pattern, text, re.DOTALL)
+        if matches:
+            content_dict[key] = [match.strip() for match in matches]
+    return content_dict
+
+
+class ParseError(Exception):
+    # Raised by parse_html_tags_raise with the retry message as its argument
+    pass
+
+
+def parse_html_tags_raise(text, keys=(), optional_keys=(), merge_multiple=False):
+    """Like parse_html_tags, but raise ParseError instead of returning a validity flag.
+
+    Returns only the content dictionary; when parsing is not valid, the retry
+    message becomes the ParseError's argument.
+    """
+    parsed, ok, problems = parse_html_tags(
+        text, keys, optional_keys, merge_multiple=merge_multiple
+    )
+    if ok:
+        return parsed
+    raise ParseError(problems)
+
+
+def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
+    """Satisfy the parse api, extracts 1 match per key and validates that all keys are present
+
+    Parameters
+    ----------
+    text : str
+        The input string containing the HTML tags.
+    keys : list of str
+        The HTML tags to extract the content from.
+    optional_keys : list of str
+        The HTML tags to extract the content from, but are optional.
+    merge_multiple : bool
+        When a key matches more than once: if True, the matches are joined with
+        newlines; if False, the first match is kept and an error is reported.
+
+    Returns:
+    -------
+    dict
+        A dictionary mapping each key to subset of `text` that match the key.
+    bool
+        Whether the parsing was successful.
+    str
+        A message to be displayed to the agent if the parsing was not successful.
+    """
+    # Drop repeated names (keeping first-seen order): processing a key twice would
+    # index the already-collapsed string instead of the list of matches.
+    all_keys = tuple(dict.fromkeys(tuple(keys) + tuple(optional_keys)))
+    content_dict = extract_html_tags(text, all_keys)
+    retry_messages = []
+
+    for key in all_keys:
+        if key not in content_dict:
+            # only required keys are reported as missing
+            if key not in optional_keys:
+                retry_messages.append(f'Missing the key <{key}> in the answer.')
+            continue
+
+        val = content_dict[key]
+        content_dict[key] = val[0]
+        if len(val) > 1:
+            if not merge_multiple:
+                retry_messages.append(
+                    f'Found multiple instances of the key {key}. You should have only one of them.'
+                )
+            else:
+                # merge the multiple instances
+                content_dict[key] = '\n'.join(val)
+
+    valid = len(retry_messages) == 0
+    retry_message = '\n'.join(retry_messages)
+    return content_dict, valid, retry_message
@@ -1,23 +0,0 @@
-# CodeAct-based Agent Framework
-
-This folder implements the [CodeAct idea](https://arxiv.org/abs/2402.13463) that relies on LLM to autonomously perform actions in a Bash shell. It requires more from the LLM itself: LLM needs to be capable enough to do all the stuff autonomously, instead of stuck in an infinite loop.
-
-**NOTE: This agent is still highly experimental and under active development to reach the capability described in the original paper & [repo](https://github.com/xingyaoww/code-act).**
-
-<video src="https://github.com/xingyaoww/code-act/assets/38853559/62c80ada-62ce-447e-811c-fc801dd4beac"> </video>
-*Demo of the expected capability - work-in-progress.*
-
-```bash
-mkdir workspace
-PYTHONPATH=`pwd`:$PYTHONPATH python3 opendevin/main.py -d ./workspace -c CodeActAgent -t "Please write a flask app that returns 'Hello, World\!' at the root URL, then start the app on port 5000. python3 has already been installed for you."
-```
-
-Example: prompts `gpt-4-0125-preview` to write a flask server, install `flask` library, and start the server.
-
-<img width="951" alt="image" src="https://github.com/OpenDevin/OpenDevin/assets/38853559/325c3115-a343-4cc5-a92b-f1e5d552a077">
-
-<img width="957" alt="image" src="https://github.com/OpenDevin/OpenDevin/assets/38853559/68ad10c1-744a-4e9d-bb29-0f163d665a0a">
-
-Most of the things are working as expected, except at the end, the model did not follow the instruction to stop the interaction by outputting `<execute> exit </execute>` as instructed.
-
-**TODO**: This should be fixable by either (1) including a complete in-context example like [this](https://github.com/xingyaoww/mint-bench/blob/main/mint/tasks/in_context_examples/reasoning/with_tool.txt), OR (2) collect some interaction data like this and fine-tune a model (like [this](https://github.com/xingyaoww/code-act), a more complex route).
@@ -1,4 +1,4 @@
-from opendevin.agent import Agent
-from .codeact_agent import CodeActAgent
+from openhands.agenthub.codeact_agent.codeact_agent import CodeActAgent
+from openhands.controller.agent import Agent

 Agent.register('CodeActAgent', CodeActAgent)
@@ -1,157 +1,504 @@
-import re
-from typing import List, Mapping
+import json
+import os
+from collections import deque

-from opendevin.action import (
+from litellm import ModelResponse
+
+import openhands.agenthub.codeact_agent.function_calling as codeact_function_calling
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import (
    Action,
-    AgentEchoAction,
+    AgentDelegateAction,
    AgentFinishAction,
+    BrowseInteractiveAction,
+    BrowseURLAction,
    CmdRunAction,
+    FileEditAction,
+    IPythonRunCellAction,
+    MessageAction,
 )
-from opendevin.agent import Agent
-from opendevin.llm.llm import LLM
-from opendevin.observation import (
-    AgentMessageObservation,
+from openhands.events.observation import (
+    AgentDelegateObservation,
+    BrowserOutputObservation,
    CmdOutputObservation,
+    FileEditObservation,
+    IPythonRunCellObservation,
+    UserRejectObservation,
 )
-from opendevin.parse_commands import parse_command_file
-from opendevin.state import State
-
-COMMAND_DOCS = parse_command_file()
-COMMAND_SEGMENT = (
-    f"""
-
-Apart from the standard bash commands, you can also use the following special commands:
-{COMMAND_DOCS}
-"""
-    if COMMAND_DOCS is not None
-    else ''
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.observation import Observation
+from openhands.events.serialization.event import truncate_content
+from openhands.llm.llm import LLM
+from openhands.runtime.plugins import (
+    AgentSkillsRequirement,
+    JupyterRequirement,
+    PluginRequirement,
 )
-SYSTEM_MESSAGE = f"""You are a helpful assistant. You will be provided access (as root) to a bash shell to complete user-provided tasks.
-You will be able to execute commands in the bash shell, interact with the file system, install packages, and receive the output of your commands.
-
-DO NOT provide code in ```triple backticks```. Instead, you should execute bash command on behalf of the user by wrapping them with <execute> and </execute>.
-For example:
-
-You can list the files in the current directory by executing the following command:
-<execute>ls</execute>
-
-You can also install packages using pip:
-<execute> pip install numpy </execute>
-
-You can also write a block of code to a file:
-<execute>
-echo "import math
-print(math.pi)" > math.py
-</execute>
-{COMMAND_SEGMENT}
-
-When you are done, execute the following to close the shell and end the conversation:
-<execute>exit</execute>
-"""
-
-INVALID_INPUT_MESSAGE = (
-    "I don't understand your input. \n"
-    'If you want to execute command, please use <execute> YOUR_COMMAND_HERE </execute>.\n'
-    'If you already completed the task, please exit the shell by generating: <execute> exit </execute>.'
-)
-
-
-def parse_response(response) -> str:
-    action = response.choices[0].message.content
-    if '<execute>' in action and '</execute>' not in action:
-        action += '</execute>'
-    return action
+from openhands.utils.prompt import PromptManager


 class CodeActAgent(Agent):
+    VERSION = '2.2'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
+
+    ### Overview
+
+    This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
+
+    The conceptual idea is illustrated below. At each turn, the agent can:
+
+    1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
+    2. **CodeAct**: Choose to perform the task by executing code
+    - Execute any valid Linux `bash` command
+    - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
+
+    ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
+
    """

+    sandbox_plugins: list[PluginRequirement] = [
+        # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
+        # AgentSkillsRequirement provides a lot of Python functions,
+        # and it needs to be initialized before Jupyter for Jupyter to use those functions.
+        AgentSkillsRequirement(),
+        JupyterRequirement(),
+    ]
+
    def __init__(
        self,
        llm: LLM,
+        config: AgentConfig,
    ) -> None:
-        """
-        Initializes a new instance of the CodeActAgent class.
+        """Initializes a new instance of the CodeActAgent class.

        Parameters:
        - llm (LLM): The llm to be used by this agent
        """
-        super().__init__(llm)
-        self.messages: List[Mapping[str, str]] = []
+        super().__init__(llm, config)
+        self.reset()
+
+        self.mock_function_calling = False
+        if not self.llm.is_function_calling_active():
+            logger.info(
+                f'Function calling not enabled for model {self.llm.config.model}. '
+                'Mocking function calling via prompting.'
+            )
+            self.mock_function_calling = True
+
+        # Function calling mode
+        self.tools = codeact_function_calling.get_tools(
+            codeact_enable_browsing=self.config.codeact_enable_browsing,
+            codeact_enable_jupyter=self.config.codeact_enable_jupyter,
+            codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
+        )
+        logger.debug(
+            f'TOOLS loaded for CodeActAgent: {json.dumps(self.tools, indent=2)}'
+        )
+        self.prompt_manager = PromptManager(
+            microagent_dir=os.path.join(os.path.dirname(__file__), 'micro')
+            if self.config.use_microagents
+            else None,
+            prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
+            disabled_microagents=self.config.disabled_microagents,
+        )
+
+        self.pending_actions: deque[Action] = deque()
+
+    def get_action_message(
+        self,
+        action: Action,
+        pending_tool_call_action_messages: dict[str, Message],
+    ) -> list[Message]:
+        """Converts an action into a message format that can be sent to the LLM.
+
+        This method handles different types of actions and formats them appropriately:
+        1. For tool-based actions (AgentDelegate, IPythonRunCell, FileEdit, BrowseInteractive,
+            BrowseURL, and agent-sourced CmdRun): stores the assistant message of the LLM
+            response that issued the call in pending_tool_call_action_messages (keyed by
+            the response id) and returns an empty list
+        2. For AgentFinishAction: creates a message from the action's thought; if tool call
+            metadata is present, the content of that LLM response is first folded into the
+            thought and the metadata is removed from the action
+        3. For MessageActions: Creates a message with the text content and optional image content
+        4. For user-sourced CmdRunAction: Creates a user message stating the executed command
+
+        Args:
+            action (Action): The action to convert. Can be one of:
+                - CmdRunAction: For executing bash commands
+                - IPythonRunCellAction: For running IPython code
+                - FileEditAction: For editing files
+                - BrowseInteractiveAction: For browsing the web
+                - AgentFinishAction: For ending the interaction
+                - MessageAction: For sending messages
+            pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
+                to their corresponding messages. Used in function calling mode to track tool calls
+                that are waiting for their results.
+
+        Returns:
+            list[Message]: A list containing the formatted message(s) for the action.
+                May be empty if the action is handled as a tool call in function calling mode,
+                or if the action is of a type not handled here.
+
+        Note:
+            In function calling mode, tool-based actions are stored in pending_tool_call_action_messages
+            rather than being returned immediately. They will be processed later when all corresponding
+            tool call results are available.
+        """
+        # tool-call actions: record the assistant message that issued the call
+        if isinstance(
+            action,
+            (
+                AgentDelegateAction,
+                IPythonRunCellAction,
+                FileEditAction,
+                BrowseInteractiveAction,
+                BrowseURLAction,
+            ),
+        ) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
+            tool_metadata = action.tool_call_metadata
+            assert tool_metadata is not None, (
+                'Tool call metadata should NOT be None when function calling is enabled. Action: '
+                + str(action)
+            )
+
+            llm_response: ModelResponse = tool_metadata.model_response
+            assistant_msg = llm_response.choices[0].message
+
+            # Add the LLM message (assistant) that initiated the tool calls
+            # (overwrites any previous message with the same response_id)
+            logger.debug(
+                f'Tool calls type: {type(assistant_msg.tool_calls)}, value: {assistant_msg.tool_calls}'
+            )
+            pending_tool_call_action_messages[llm_response.id] = Message(
+                role=assistant_msg.role,
+                # tool call content SHOULD BE a string
+                content=[TextContent(text=assistant_msg.content or '')]
+                if assistant_msg.content is not None
+                else [],
+                tool_calls=assistant_msg.tool_calls,
+            )
+            return []
+        elif isinstance(action, AgentFinishAction):
+            role = 'user' if action.source == 'user' else 'assistant'
+
+            # when agent finishes, it has tool_metadata
+            # which has already been executed, and it doesn't have a response
+            # when the user finishes (/exit), we don't have tool_metadata
+            tool_metadata = action.tool_call_metadata
+            if tool_metadata is not None:
+                # take the response message from the tool call
+                assistant_msg = tool_metadata.model_response.choices[0].message
+                content = assistant_msg.content or ''
+
+                # save content if any, to thought
+                # (appended only when it differs from the existing thought)
+                if action.thought:
+                    if action.thought != content:
+                        action.thought += '\n' + content
+                else:
+                    action.thought = content
+
+                # remove the tool call metadata
+                # (so this branch does not fold the content in again for the same action)
+                action.tool_call_metadata = None
+            return [
+                Message(
+                    role=role,
+                    content=[TextContent(text=action.thought)],
+                )
+            ]
+        elif isinstance(action, MessageAction):
+            role = 'user' if action.source == 'user' else 'assistant'
+            content = [TextContent(text=action.content or '')]
+            # images are attached only when the LLM reports vision as active
+            if self.llm.vision_is_active() and action.image_urls:
+                content.append(ImageContent(image_urls=action.image_urls))
+            return [
+                Message(
+                    role=role,
+                    content=content,
+                )
+            ]
+        elif isinstance(action, CmdRunAction) and action.source == 'user':
+            content = [
+                TextContent(text=f'User executed the command:\n{action.command}')
+            ]
+            return [
+                Message(
+                    role='user',
+                    content=content,
+                )
+            ]
+        # any other action type contributes no message
+        return []
+
+    def get_observation_message(
+        self,
+        obs: Observation,
+        tool_call_id_to_message: dict[str, Message],
+    ) -> list[Message]:
+        """Converts an observation into a message format that can be sent to the LLM.
+
+        This method handles different types of observations and formats them appropriately:
+        - CmdOutputObservation: Formats command execution results with exit codes
+        - IPythonRunCellObservation: Formats IPython cell execution results, replacing base64 images
+        - FileEditObservation: Formats file editing results
+        - BrowserOutputObservation: Uses the text returned by obs.get_agent_obs_text()
+          (not truncated in this method)
+        - AgentDelegateObservation: Formats results from delegated agent tasks
+        - ErrorObservation: Formats error messages from failed actions
+        - UserRejectObservation: Formats user rejection messages
+
+        In function calling mode, observations with tool_call_metadata are stored in
+        tool_call_id_to_message for later processing instead of being returned immediately.
+
+        Args:
+            obs (Observation): The observation to convert
+            tool_call_id_to_message (dict[str, Message]): Dictionary mapping tool call IDs
+                to their corresponding messages (used in function calling mode)
+
+        Returns:
+            list[Message]: A list containing the formatted message(s) for the observation.
+                May be empty if the observation is handled as a tool response in function calling mode.
+
+        Raises:
+            ValueError: If the observation type is unknown
+        """
+        message: Message
+        max_message_chars = self.llm.config.max_message_chars
+        if isinstance(obs, CmdOutputObservation):
+            # if it doesn't have tool call metadata, it was triggered by a user action
+            if obs.tool_call_metadata is None:
+                text = truncate_content(
+                    f'\nObserved result of command executed by user:\n{obs.content}',
+                    max_message_chars,
+                )
+            else:
+                text = truncate_content(
+                    obs.content + obs.interpreter_details, max_message_chars
+                )
+            # appended after truncation, so the exit code line is always present
+            text += f'\n[Command finished with exit code {obs.exit_code}]'
+            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, IPythonRunCellObservation):
+            text = obs.content
+            # replace base64 images with a placeholder
+            # (any line containing an inline PNG is replaced as a whole)
+            splitted = text.split('\n')
+            for i, line in enumerate(splitted):
+                if '![image](data:image/png;base64,' in line:
+                    splitted[i] = (
+                        '![image](data:image/png;base64, ...) already displayed to user'
+                    )
+            text = '\n'.join(splitted)
+            text = truncate_content(text, max_message_chars)
+            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, FileEditObservation):
+            text = truncate_content(str(obs), max_message_chars)
+            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, BrowserOutputObservation):
+            # NOTE(review): unlike the other branches, this text is not passed
+            # through truncate_content here - confirm that is intended
+            text = obs.get_agent_obs_text()
+            message = Message(
+                role='user',
+                content=[TextContent(text=text)],
+            )
+        elif isinstance(obs, AgentDelegateObservation):
+            # a delegate result without a 'content' output becomes an empty message
+            text = truncate_content(
+                obs.outputs['content'] if 'content' in obs.outputs else '',
+                max_message_chars,
+            )
+            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, ErrorObservation):
+            text = truncate_content(obs.content, max_message_chars)
+            text += '\n[Error occurred in processing last action]'
+            message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, UserRejectObservation):
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
+            text += '\n[Last action has been rejected by the user]'
+            message = Message(role='user', content=[TextContent(text=text)])
+        else:
+            # If an observation message is not returned, it will cause an error
+            # when the LLM tries to return the next message
+            raise ValueError(f'Unknown observation type: {type(obs)}')
+
+        # Update the message as tool response properly
+        # (same content, re-wrapped with role 'tool' and keyed by the tool call id)
+        if (tool_call_metadata := obs.tool_call_metadata) is not None:
+            tool_call_id_to_message[tool_call_metadata.tool_call_id] = Message(
+                role='tool',
+                content=message.content,
+                tool_call_id=tool_call_metadata.tool_call_id,
+                name=tool_call_metadata.function_name,
+            )
+            # No need to return the observation message
+            # because it will be added by get_action_message when all the corresponding
+            # tool calls in the SAME request are processed
+            return []
+
+        return [message]
+
+    def reset(self) -> None:
+        """Resets the CodeAct Agent by delegating to the base class reset."""
+        # NOTE(review): nothing agent-specific is cleared here - confirm whether
+        # state such as queued pending actions should survive a reset
+        super().reset()

    def step(self, state: State) -> Action:
-        """
-        Performs one step using the Code Act Agent.
+        """Performs one step using the CodeAct Agent.
        This includes gathering info on previous steps and prompting the model to make a command to execute.

        Parameters:
-        - state (State): used to get updated info and background commands
+        - state (State): used to get updated info

        Returns:
-        - CmdRunAction(command) - command action to run
-        - AgentEchoAction(content=INVALID_INPUT_MESSAGE) - invalid command output
-
-        Raises:
-        - NotImplementedError - for actions other than CmdOutputObservation or AgentMessageObservation
+        - CmdRunAction(command) - bash command to run
+        - IPythonRunCellAction(code) - IPython code to run
+        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
        """
+        # Continue with pending actions if any
+        if self.pending_actions:
+            return self.pending_actions.popleft()

-        if len(self.messages) == 0:
-            assert state.plan.main_goal, 'Expecting instruction to be set'
-            self.messages = [
-                {'role': 'system', 'content': SYSTEM_MESSAGE},
-                {'role': 'user', 'content': state.plan.main_goal},
-            ]
-        updated_info = state.updated_info
-        if updated_info:
-            for prev_action, obs in updated_info:
-                assert isinstance(
-                    prev_action, (CmdRunAction, AgentEchoAction)
-                ), 'Expecting CmdRunAction or AgentEchoAction for Action'
-                if isinstance(
-                    obs, AgentMessageObservation
-                ):  # warning message from itself
-                    self.messages.append(
-                        {'role': 'user', 'content': obs.content})
-                elif isinstance(obs, CmdOutputObservation):
-                    content = 'OBSERVATION:\n' + obs.content
-                    content += f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]]'
-                    self.messages.append({'role': 'user', 'content': content})
-                else:
-                    raise NotImplementedError(
-                        f'Unknown observation type: {obs.__class__}'
+        # if we're done, go back
+        latest_user_message = state.get_last_user_message()
+        if latest_user_message and latest_user_message.content.strip() == '/exit':
+            return AgentFinishAction()
+
+        # prepare what we want to send to the LLM
+        messages = self._get_messages(state)
+        params: dict = {
+            'messages': self.llm.format_messages_for_llm(messages),
+        }
+        params['tools'] = self.tools
+        if self.mock_function_calling:
+            params['mock_function_calling'] = True
+        response = self.llm.completion(**params)
+        actions = codeact_function_calling.response_to_actions(response)
+        for action in actions:
+            self.pending_actions.append(action)
+        return self.pending_actions.popleft()
+
+    def _get_messages(self, state: State) -> list[Message]:
+        """Constructs the message history for the LLM conversation.
+
+        This method builds a structured conversation history by processing events from the state
+        and formatting them into messages that the LLM can understand. It handles both regular
+        message flow and function-calling scenarios.
+
+        The method performs the following steps:
+        1. Initializes with system prompt and optional initial user message
+        2. Processes events (Actions and Observations) into messages
+        3. Holds back each tool-call message until all of its tool responses are available,
+           then emits the call followed by its responses
+        4. Merges consecutive messages of the same role (except 'tool') into one message
+        5. When prompt caching is active, marks the last content item of up to three of the
+           most recent user/tool messages for caching (needed for Anthropic)
+        6. Passes every user message through `prompt_manager.enhance_message`
+
+        Args:
+            state (State): The current state object containing conversation history and other metadata
+
+        Returns:
+            list[Message]: A list of formatted messages ready for LLM consumption, including:
+                - System message with prompt
+                - Initial user message (if configured)
+                - Action messages (from both user and assistant)
+                - Observation messages (including tool responses)
+
+        Raises:
+            Exception: If `self.prompt_manager` is not set.
+            ValueError: If `state.history` contains an event that is neither an Action nor an Observation.
+
+        Note:
+            - In function-calling mode, tool calls and their responses are carefully tracked
+              to maintain proper conversation flow
+            - Messages from the same role are combined to prevent consecutive same-role messages
+            - For Anthropic models, specific messages are cached according to their documentation
+            - No environment reminders are appended by the code in this method itself; what
+              `enhance_message` adds to user messages is not visible here
+        """
+        if not self.prompt_manager:
+            raise Exception('Prompt Manager not instantiated.')
+
+        messages: list[Message] = [
+            Message(
+                role='system',
+                content=[
+                    TextContent(
+                        text=self.prompt_manager.get_system_message(),
+                        cache_prompt=self.llm.is_caching_prompt_active(),
                    )
-        response = self.llm.completion(
-            messages=self.messages,
-            stop=['</execute>'],
-            temperature=0.0
-        )
-        action_str: str = parse_response(response)
-        state.num_of_chars += sum(len(message['content'])
-                                  for message in self.messages) + len(action_str)
-        self.messages.append({'role': 'assistant', 'content': action_str})
+                ],
+            )
+        ]
+        example_message = self.prompt_manager.get_example_user_message()
+        if example_message:
+            # NOTE(review): `cache_prompt` is passed to `Message` here, while the system
+            # message above sets it on `TextContent` -- confirm `Message` honors this field.
+            messages.append(
+                Message(
+                    role='user',
+                    content=[TextContent(text=example_message)],
+                    cache_prompt=self.llm.is_caching_prompt_active(),
+                )
+            )

-        command = re.search(r'<execute>(.*)</execute>', action_str, re.DOTALL)
-        if command is not None:
-            # a command was found
-            command_group = command.group(1)
-            if command_group.strip() == 'exit':
-                return AgentFinishAction()
-            return CmdRunAction(command=command_group)
-            # # execute the code
-            # # TODO: does exit_code get loaded into Message?
-            # exit_code, observation = self.env.execute(command_group)
-            # self._history.append(Message(Role.ASSISTANT, observation))
-        else:
-            # we could provide a error message for the model to continue similar to
-            # https://github.com/xingyaoww/mint-bench/blob/main/mint/envs/general_env.py#L18-L23
-            # observation = INVALID_INPUT_MESSAGE
-            # self._history.append(Message(Role.ASSISTANT, observation))
-            return AgentEchoAction(
-                content=INVALID_INPUT_MESSAGE
-            )  # warning message to itself
+        # Tool-call messages that are still waiting for results, keyed by response id.
+        # Presumably filled in by `get_action_message`, which receives this dict below.
+        pending_tool_call_action_messages: dict[str, Message] = {}
+        # Tool-result messages keyed by tool call id.
+        # Presumably filled in by `get_observation_message`, which receives this dict below.
+        tool_call_id_to_message: dict[str, Message] = {}
+        events = list(state.history)
+        for event in events:
+            # create a regular message from an event
+            if isinstance(event, Action):
+                messages_to_add = self.get_action_message(
+                    action=event,
+                    pending_tool_call_action_messages=pending_tool_call_action_messages,
+                )
+            elif isinstance(event, Observation):
+                messages_to_add = self.get_observation_message(
+                    obs=event,
+                    tool_call_id_to_message=tool_call_id_to_message,
+                )
+            else:
+                raise ValueError(f'Unknown event type: {type(event)}')

-    def search_memory(self, query: str) -> List[str]:
-        raise NotImplementedError('Implement this abstract method')
+            # Check pending tool call action messages and see if they are complete
+            # (removals are collected first because the dict is being iterated)
+            _response_ids_to_remove = []
+            for (
+                response_id,
+                pending_message,
+            ) in pending_tool_call_action_messages.items():
+                assert pending_message.tool_calls is not None, (
+                    'Tool calls should NOT be None when function calling is enabled & the message is considered pending tool call. '
+                    f'Pending message: {pending_message}'
+                )
+                if all(
+                    tool_call.id in tool_call_id_to_message
+                    for tool_call in pending_message.tool_calls
+                ):
+                    # If complete:
+                    # -- 1. Add the message that **initiated** the tool calls
+                    messages_to_add.append(pending_message)
+                    # -- 2. Add the tool calls **results***
+                    for tool_call in pending_message.tool_calls:
+                        messages_to_add.append(tool_call_id_to_message[tool_call.id])
+                        tool_call_id_to_message.pop(tool_call.id)
+                    _response_ids_to_remove.append(response_id)
+            # Cleanup the processed pending tool messages
+            for response_id in _response_ids_to_remove:
+                pending_tool_call_action_messages.pop(response_id)
+
+            for message in messages_to_add:
+                # falsy entries in `messages_to_add` are skipped
+                if message:
+                    if message.role == 'user':
+                        self.prompt_manager.enhance_message(message)
+                    # handle error if the message is the SAME role as the previous message
+                    # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
+                    # there shouldn't be two consecutive messages from the same role
+                    # NOTE: we shouldn't combine tool messages because each of them has a different tool_call_id
+                    if (
+                        messages
+                        and messages[-1].role == message.role
+                        and message.role != 'tool'
+                    ):
+                        messages[-1].content.extend(message.content)
+                    else:
+                        messages.append(message)
+
+        if self.llm.is_caching_prompt_active():
+            # NOTE: this is only needed for anthropic
+            # following logic here:
+            # https://github.com/anthropics/anthropic-quickstarts/blob/8f734fd08c425c6ec91ddd613af04ff87d70c5a0/computer-use-demo/computer_use_demo/loop.py#L241-L262
+            breakpoints_remaining = 3  # remaining 1 for system/tool
+            # Walk from newest to oldest; stop at the first user/tool message
+            # encountered after all three breakpoints have been used.
+            for message in reversed(messages):
+                if message.role == 'user' or message.role == 'tool':
+                    if breakpoints_remaining > 0:
+                        message.content[
+                            -1
+                        ].cache_prompt = True  # Last item inside the message content
+                        breakpoints_remaining -= 1
+                    else:
+                        break
+
+        return messages
@@ -0,0 +1,554 @@
+"""This file contains the function calling implementation for different actions.
+
+This is similar to the functionality of `CodeActResponseParser`.
+"""
+
+import json
+
+from browsergym.core.action.highlevel import HighLevelActionSet
+from litellm import (
+    ChatCompletionToolParam,
+    ChatCompletionToolParamFunctionChunk,
+    ModelResponse,
+)
+
+from openhands.core.exceptions import FunctionCallNotExistsError
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action import (
+    Action,
+    AgentDelegateAction,
+    AgentFinishAction,
+    BrowseInteractiveAction,
+    BrowseURLAction,
+    CmdRunAction,
+    FileEditAction,
+    IPythonRunCellAction,
+    MessageAction,
+)
+from openhands.events.tool import ToolCallMetadata
+
+_BASH_DESCRIPTION = """Execute a bash command in the terminal.
+* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
+* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.
+* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.
+"""
+
+# Tool schema for the `execute_bash` function. `response_to_actions` turns a
+# call to it into `CmdRunAction(**arguments)`, so the property names declared
+# here are passed through as keyword arguments.
+CmdRunTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='execute_bash',
+        description=_BASH_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'command': {
+                    'type': 'string',
+                    'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.',
+                },
+            },
+            'required': ['command'],
+        },
+    ),
+)
+
+_IPYTHON_DESCRIPTION = """Run a cell of Python code in an IPython environment.
+* The assistant should define variables and import packages before using them.
+* The variable defined in the IPython environment will not be available outside the IPython environment (e.g., in terminal).
+"""
+
+# Tool schema for the `execute_ipython_cell` function. `response_to_actions`
+# turns a call to it into `IPythonRunCellAction(**arguments)`, so the property
+# names declared here are passed through as keyword arguments.
+IPythonTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='execute_ipython_cell',
+        description=_IPYTHON_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'code': {
+                    'type': 'string',
+                    'description': 'The Python code to execute. Supports magic commands like %pip.',
+                },
+            },
+            'required': ['code'],
+        },
+    ),
+)
+
+_FILE_EDIT_DESCRIPTION = """Edit a file.
+* The assistant can edit files by specifying the file path and providing a draft of the new file content.
+* The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
+* IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
+* To append to a file, set both `start` and `end` to `-1`.
+* If the file doesn't exist, a new file will be created with the provided content.
+
+**Example 1: general edit for short files**
+For example, given an existing file `/path/to/file.py` that looks like this:
+(this is the end of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|        self.z = 3
+6|
+7|print(MyClass().z)
+8|print(MyClass().x)
+(this is the end of the file)
+
+The assistant wants to edit the file to look like this:
+(this is the end of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|
+6|print(MyClass().y)
+(this is the end of the file)
+
+The assistant may produce an edit action like this:
+path="/path/to/file.txt" start=1 end=-1
+content=```
+class MyClass:
+    def __init__(self):
+        # no changes before
+        self.y = 2
+        # self.z is removed
+
+# MyClass().z is removed
+print(MyClass().y)
+```
+
+**Example 2: append to file for short files**
+For example, given an existing file `/path/to/file.py` that looks like this:
+(this is the end of the file)
+1|class MyClass:
+2|    def __init__(self):
+3|        self.x = 1
+4|        self.y = 2
+5|        self.z = 3
+6|
+7|print(MyClass().z)
+8|print(MyClass().x)
+(this is the end of the file)
+
+To append the following lines to the file:
+```python
+print(MyClass().y)
+```
+
+The assistant may produce an edit action like this:
+path="/path/to/file.txt" start=-1 end=-1
+content=```
+print(MyClass().y)
+```
+
+**Example 3: edit for long files**
+
+Given an existing file `/path/to/file.py` that looks like this:
+(1000 more lines above)
+1001|class MyClass:
+1002|    def __init__(self):
+1003|        self.x = 1
+1004|        self.y = 2
+1005|        self.z = 3
+1006|
+1007|print(MyClass().z)
+1008|print(MyClass().x)
+(2000 more lines below)
+
+The assistant wants to edit the file to look like this:
+
+(1000 more lines above)
+1001|class MyClass:
+1002|    def __init__(self):
+1003|        self.x = 1
+1004|        self.y = 2
+1005|
+1006|print(MyClass().y)
+(2000 more lines below)
+
+The assistant may produce an edit action like this:
+path="/path/to/file.txt" start=1001 end=1008
+content=```
+class MyClass:
+    def __init__(self):
+        # no changes before
+        self.y = 2
+        # self.z is removed
+
+# MyClass().z is removed
+print(MyClass().y)
+```
+"""
+
+# Tool schema for the `edit_file` function. `response_to_actions` turns a call
+# to it into `FileEditAction(**arguments)`, so the property names declared here
+# are passed through as keyword arguments.
+LLMBasedFileEditTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='edit_file',
+        description=_FILE_EDIT_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'path': {
+                    'type': 'string',
+                    'description': 'The absolute path to the file to be edited.',
+                },
+                # Named `content` so that it matches the `required` list below and
+                # the `content=` field used by the examples in `_FILE_EDIT_DESCRIPTION`;
+                # a `required` entry that is not a declared property cannot be satisfied
+                # consistently by the model.
+                'content': {
+                    'type': 'string',
+                    'description': 'A draft of the new content for the file being edited. Note that the assistant may skip unchanged lines.',
+                },
+                'start': {
+                    'type': 'integer',
+                    'description': 'The starting line number for the edit (1-indexed, inclusive). Default is 1.',
+                },
+                'end': {
+                    'type': 'integer',
+                    'description': 'The ending line number for the edit (1-indexed, inclusive). Default is -1 (end of file).',
+                },
+            },
+            'required': ['path', 'content'],
+        },
+    ),
+)
+
+_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files
+* State is persistent across command calls and discussions with the user
+* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
+* The `create` command cannot be used if the specified `path` already exists as a file
+* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
+* The `undo_edit` command will revert the last edit made to the file at `path`
+
+Notes for using the `str_replace` command:
+* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
+* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
+* The `new_str` parameter should contain the edited lines that should replace the `old_str`
+"""
+
+# Tool schema for the `str_replace_editor` function. `response_to_actions` does
+# not map it to a dedicated action: it builds the code string
+# `print(file_editor(**arguments))` and runs it as an `IPythonRunCellAction`.
+# Only `command` and `path` are required by the schema; the per-command
+# requirements are stated in the property descriptions only.
+StrReplaceEditorTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='str_replace_editor',
+        description=_STR_REPLACE_EDITOR_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'command': {
+                    'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.',
+                    'enum': ['view', 'create', 'str_replace', 'insert', 'undo_edit'],
+                    'type': 'string',
+                },
+                'path': {
+                    'description': 'Absolute path to file or directory, e.g. `/workspace/file.py` or `/workspace`.',
+                    'type': 'string',
+                },
+                'file_text': {
+                    'description': 'Required parameter of `create` command, with the content of the file to be created.',
+                    'type': 'string',
+                },
+                'old_str': {
+                    'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.',
+                    'type': 'string',
+                },
+                'new_str': {
+                    'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.',
+                    'type': 'string',
+                },
+                'insert_line': {
+                    'description': 'Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.',
+                    'type': 'integer',
+                },
+                'view_range': {
+                    'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.',
+                    'items': {'type': 'integer'},
+                    'type': 'array',
+                },
+            },
+            'required': ['command', 'path'],
+        },
+    ),
+)
+
+
+# The tool name quoted in this description must match the `name` registered in
+# `WebReadTool` below (`web_read`): `response_to_actions` dispatches on that
+# exact name and rejects anything else as an unregistered tool.
+_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).
+
+You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
+"""
+
+# Tool schema for the `web_read` function; `response_to_actions` turns a call
+# to it into `BrowseURLAction(url=arguments['url'])`.
+WebReadTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='web_read',
+        description=_WEB_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'url': {
+                    'type': 'string',
+                    'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
+                }
+            },
+            'required': ['url'],
+        },
+    ),
+)
+
+# from browsergym/core/action/highlevel.py
+_browser_action_space = HighLevelActionSet(
+    subsets=['bid', 'nav'],
+    strict=False,  # less strict on the parsing of the actions
+    multiaction=True,  # enable to agent to take multiple actions at once
+)
+
+
+_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.
+
+See the description of "code" parameter for more details.
+
+Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
+More than 2-3 actions usually leads to failure or unexpected behavior. Example:
+fill('a12', 'example with "quotes"')
+click('a51')
+click('48', button='middle', modifiers=['Shift'])
+"""
+
+_BROWSER_TOOL_DESCRIPTION = """
+The following 15 functions are available. Nothing else is supported.
+
+goto(url: str)
+    Description: Navigate to a url.
+    Examples:
+        goto('http://www.example.com')
+
+go_back()
+    Description: Navigate to the previous page in history.
+    Examples:
+        go_back()
+
+go_forward()
+    Description: Navigate to the next page in history.
+    Examples:
+        go_forward()
+
+noop(wait_ms: float = 1000)
+    Description: Do nothing, and optionally wait for the given time (in milliseconds).
+    You can use this to get the current page content and/or wait for the page to load.
+    Examples:
+        noop()
+
+        noop(500)
+
+scroll(delta_x: float, delta_y: float)
+    Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
+    Examples:
+        scroll(0, 200)
+
+        scroll(-50.2, -100.5)
+
+fill(bid: str, value: str)
+    Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
+    Examples:
+        fill('237', 'example value')
+
+        fill('45', 'multi-line\nexample')
+
+        fill('a12', 'example with "quotes"')
+
+select_option(bid: str, options: str | list[str])
+    Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
+    Examples:
+        select_option('a48', 'blue')
+
+        select_option('c48', ['red', 'green', 'blue'])
+
+click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Click an element.
+    Examples:
+        click('a51')
+
+        click('b22', button='right')
+
+        click('48', button='middle', modifiers=['Shift'])
+
+dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
+    Description: Double click an element.
+    Examples:
+        dblclick('12')
+
+        dblclick('ca42', button='right')
+
+        dblclick('178', button='middle', modifiers=['Shift'])
+
+hover(bid: str)
+    Description: Hover over an element.
+    Examples:
+        hover('b8')
+
+press(bid: str, key_comb: str)
+    Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
+    Examples:
+        press('88', 'Backspace')
+
+        press('a26', 'ControlOrMeta+a')
+
+        press('a61', 'Meta+Shift+t')
+
+focus(bid: str)
+    Description: Focus the matching element.
+    Examples:
+        focus('b455')
+
+clear(bid: str)
+    Description: Clear the input field.
+    Examples:
+        clear('996')
+
+drag_and_drop(from_bid: str, to_bid: str)
+    Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
+    Examples:
+        drag_and_drop('56', '498')
+
+upload_file(bid: str, file: str | list[str])
+    Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
+    Examples:
+        upload_file('572', '/home/user/my_receipt.pdf')
+
+        upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
+"""
+
+
+# Import-time sanity check: every action signature and description reported by
+# the BrowserGym action set must appear verbatim in `_BROWSER_TOOL_DESCRIPTION`,
+# so a BrowserGym update that changes its action space fails loudly here.
+# NOTE(review): these are `assert` statements, so the check is skipped when
+# Python runs with `-O`.
+for _, action in _browser_action_space.action_set.items():
+    assert (
+        action.signature in _BROWSER_TOOL_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
+    assert (
+        action.description in _BROWSER_TOOL_DESCRIPTION
+    ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
+
+# Tool schema for the `browser` function; `response_to_actions` turns a call to
+# it into `BrowseInteractiveAction(browser_actions=arguments['code'])`.
+# The full list of supported browser functions is embedded in the description
+# of the `code` parameter rather than in the tool description itself.
+BrowserTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='browser',
+        description=_BROWSER_DESCRIPTION,
+        parameters={
+            'type': 'object',
+            'properties': {
+                'code': {
+                    'type': 'string',
+                    'description': (
+                        'The Python code that interacts with the browser.\n'
+                        + _BROWSER_TOOL_DESCRIPTION
+                    ),
+                }
+            },
+            'required': ['code'],
+        },
+    ),
+)
+
+_FINISH_DESCRIPTION = """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task."""
+
+# Tool schema for the `finish` function. No `parameters` schema is declared:
+# `response_to_actions` maps the call to `AgentFinishAction()` and ignores the
+# arguments, although the argument string is still JSON-decoded before dispatch.
+FinishTool = ChatCompletionToolParam(
+    type='function',
+    function=ChatCompletionToolParamFunctionChunk(
+        name='finish',
+        description=_FINISH_DESCRIPTION,
+    ),
+)
+
+
+def combine_thought(action: Action, thought: str) -> Action:
+    """Attach `thought` to `action` when the action supports it.
+
+    Args:
+        action: The action to annotate. It is modified in place.
+        thought: Text to store on the action; an empty string leaves the
+            action untouched.
+
+    Returns:
+        The same `action` object that was passed in. Its `thought` attribute is
+        overwritten only if the attribute already exists and `thought` is non-empty.
+    """
+    if hasattr(action, 'thought') and thought:
+        action.thought = thought
+    return action
+
+
+def _assistant_content_to_text(content) -> str:
+    """Flatten an assistant message's `content` into a plain string.
+
+    A string is returned unchanged, a list of content parts is reduced to the
+    concatenation of its `text` parts, and anything else (e.g. `None`) becomes ''.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return ''.join(part['text'] for part in content if part['type'] == 'text')
+    return ''
+
+
+def response_to_actions(response: ModelResponse) -> list[Action]:
+    """Convert a single-choice LLM response into OpenHands actions.
+
+    Each tool call in the assistant message becomes one action, in order. Any text
+    accompanying the tool calls is stored as the thought of the first action only.
+    A response without tool calls becomes a single `MessageAction` that waits for
+    a reply.
+
+    Args:
+        response: The model response; it must contain exactly one choice.
+
+    Returns:
+        A non-empty list of actions. Every action created from a tool call
+        carries a `ToolCallMetadata` in `tool_call_metadata`.
+
+    Raises:
+        RuntimeError: If a tool call's arguments are not valid JSON.
+        FunctionCallNotExistsError: If a tool call names an unknown tool.
+    """
+    actions: list[Action] = []
+    assert len(response.choices) == 1, 'Only one choice is supported for now'
+    assistant_msg = response.choices[0].message
+    # Text of the assistant message; '' when the content is None, which models
+    # may return for a turn without text.
+    text = _assistant_content_to_text(assistant_msg.content)
+    if assistant_msg.tool_calls:
+        # Any assistant text accompanying the tool calls becomes the thought
+        thought = text
+
+        # Process each tool call to OpenHands action
+        for i, tool_call in enumerate(assistant_msg.tool_calls):
+            action: Action
+            try:
+                arguments = json.loads(tool_call.function.arguments)
+            except json.decoder.JSONDecodeError as e:
+                raise RuntimeError(
+                    f'Failed to parse tool call arguments: {tool_call.function.arguments}'
+                ) from e
+            if tool_call.function.name == 'execute_bash':
+                action = CmdRunAction(**arguments)
+            elif tool_call.function.name == 'execute_ipython_cell':
+                action = IPythonRunCellAction(**arguments)
+            elif tool_call.function.name == 'delegate_to_browsing_agent':
+                action = AgentDelegateAction(
+                    agent='BrowsingAgent',
+                    inputs=arguments,
+                )
+            elif tool_call.function.name == 'finish':
+                action = AgentFinishAction()
+            elif tool_call.function.name == 'edit_file':
+                action = FileEditAction(**arguments)
+            elif tool_call.function.name == 'str_replace_editor':
+                # We implement this in agent_skills, which can be used via Jupyter
+                # convert tool_call.function.arguments to kwargs that can be passed to file_editor
+                code = f'print(file_editor(**{arguments}))'
+                logger.debug(
+                    f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
+                )
+                action = IPythonRunCellAction(code=code, include_extra=False)
+            elif tool_call.function.name == 'browser':
+                action = BrowseInteractiveAction(browser_actions=arguments['code'])
+            elif tool_call.function.name == 'web_read':
+                action = BrowseURLAction(url=arguments['url'])
+            else:
+                raise FunctionCallNotExistsError(
+                    f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
+                )
+
+            # We only add thought to the first action
+            if i == 0:
+                action = combine_thought(action, thought)
+            # Add metadata for tool calling
+            action.tool_call_metadata = ToolCallMetadata(
+                tool_call_id=tool_call.id,
+                function_name=tool_call.function.name,
+                model_response=response,
+                total_calls_in_response=len(assistant_msg.tool_calls),
+            )
+            actions.append(action)
+    else:
+        # Pass the flattened text, not the raw content, so the message content
+        # is always a string even when the model returned None or a list of parts.
+        actions.append(MessageAction(content=text, wait_for_response=True))
+
+    assert len(actions) >= 1
+    return actions
+
+
+def get_tools(
+    codeact_enable_browsing: bool = False,
+    codeact_enable_llm_editor: bool = False,
+    codeact_enable_jupyter: bool = False,
+) -> list[ChatCompletionToolParam]:
+    """Assemble the list of tool schemas to offer to the LLM.
+
+    The bash and finish tools are always included. The remaining tools are
+    appended in a fixed order: browsing tools, the IPython tool, then exactly
+    one file-editing tool.
+
+    Args:
+        codeact_enable_browsing: Include `WebReadTool` and `BrowserTool`.
+        codeact_enable_llm_editor: Use `LLMBasedFileEditTool` as the editor;
+            otherwise `StrReplaceEditorTool` is used.
+        codeact_enable_jupyter: Include `IPythonTool`.
+
+    Returns:
+        A new list of tool definitions.
+    """
+    selected = [CmdRunTool, FinishTool]
+    if codeact_enable_browsing:
+        selected.extend([WebReadTool, BrowserTool])
+    if codeact_enable_jupyter:
+        selected.append(IPythonTool)
+    editor_tool = (
+        LLMBasedFileEditTool if codeact_enable_llm_editor else StrReplaceEditorTool
+    )
+    selected.append(editor_tool)
+    return selected
@@ -0,0 +1,6 @@
+You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
+<IMPORTANT>
+* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
+* When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
+* The assistant MUST NOT include comments in the code unless they are necessary to describe non-obvious behavior.
+</IMPORTANT>
@@ -0,0 +1,4 @@
+from openhands.agenthub.delegator_agent.agent import DelegatorAgent
+from openhands.controller.agent import Agent
+
+Agent.register('DelegatorAgent', DelegatorAgent)
@@ -0,0 +1,87 @@
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.events.action import Action, AgentDelegateAction, AgentFinishAction
+from openhands.events.observation import AgentDelegateObservation, Observation
+from openhands.llm.llm import LLM
+
+
+class DelegatorAgent(Agent):
+    VERSION = '1.0'
+    """
+    The Delegator Agent is responsible for delegating tasks to other agents based on the current task.
+    """
+
+    current_delegate: str = ''
+
+    def __init__(self, llm: LLM, config: AgentConfig):
+        """Initialize the Delegator Agent with an LLM
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm, config)
+
+    def step(self, state: State) -> Action:
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
+        Otherwise, delegates the task to the next agent in the pipeline.
+
+        Parameters:
+        - state (State): The current state given the previous actions and observations
+
+        Returns:
+        - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
+        - AgentDelegateAction: The next agent to delegate the task to
+        """
+        if self.current_delegate == '':
+            self.current_delegate = 'study'
+            task, _ = state.get_current_user_intent()
+            return AgentDelegateAction(
+                agent='StudyRepoForTaskAgent', inputs={'task': task}
+            )
+
+        # last observation in history should be from the delegate
+        last_observation = None
+        for event in reversed(state.history):
+            if isinstance(event, Observation):
+                last_observation = event
+                break
+
+        if not isinstance(last_observation, AgentDelegateObservation):
+            raise Exception('Last observation is not an AgentDelegateObservation')
+
+        goal, _ = state.get_current_user_intent()
+        if self.current_delegate == 'study':
+            self.current_delegate = 'coder'
+            return AgentDelegateAction(
+                agent='CoderAgent',
+                inputs={
+                    'task': goal,
+                    'summary': last_observation.outputs['summary'],
+                },
+            )
+        elif self.current_delegate == 'coder':
+            self.current_delegate = 'verifier'
+            return AgentDelegateAction(
+                agent='VerifierAgent',
+                inputs={
+                    'task': goal,
+                },
+            )
+        elif self.current_delegate == 'verifier':
+            if (
+                'completed' in last_observation.outputs
+                and last_observation.outputs['completed']
+            ):
+                return AgentFinishAction()
+            else:
+                self.current_delegate = 'coder'
+                return AgentDelegateAction(
+                    agent='CoderAgent',
+                    inputs={
+                        'task': goal,
+                        'summary': last_observation.outputs['summary'],
+                    },
+                )
+        else:
+            raise Exception('Invalid delegate state')
@@ -0,0 +1,4 @@
+from openhands.agenthub.dummy_agent.agent import DummyAgent
+from openhands.controller.agent import Agent
+
+Agent.register('DummyAgent', DummyAgent)
@@ -0,0 +1,211 @@
+from typing import TypedDict, Union
+
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.schema import AgentState
+from openhands.events.action import (
+    Action,
+    AddTaskAction,
+    AgentFinishAction,
+    AgentRejectAction,
+    BrowseInteractiveAction,
+    BrowseURLAction,
+    CmdRunAction,
+    FileReadAction,
+    FileWriteAction,
+    MessageAction,
+    ModifyTaskAction,
+)
+from openhands.events.observation import (
+    AgentStateChangedObservation,
+    CmdOutputObservation,
+    FileReadObservation,
+    FileWriteObservation,
+    NullObservation,
+    Observation,
+)
+from openhands.events.serialization.event import event_to_dict
+from openhands.llm.llm import LLM
+
+"""
+FIXME: There are a few problems this surfaced
+* FileWrites seem to add an unintended newline at the end of the file
+* Browser not working
+"""
+
+ActionObs = TypedDict(
+    'ActionObs', {'action': Action, 'observations': list[Observation]}
+)
+
+
+class DummyAgent(Agent):
+    VERSION = '1.0'
+    """
+    The DummyAgent is used for e2e testing. It just sends the same set of actions deterministically,
+    without making any LLM calls.
+    """
+
+    def __init__(self, llm: LLM, config: AgentConfig):
+        super().__init__(llm, config)
+        self.steps: list[ActionObs] = [
+            {
+                'action': AddTaskAction(
+                    parent='None', goal='check the current directory'
+                ),
+                'observations': [],
+            },
+            {
+                'action': AddTaskAction(parent='0', goal='run ls'),
+                'observations': [],
+            },
+            {
+                'action': ModifyTaskAction(task_id='0', state='in_progress'),
+                'observations': [],
+            },
+            {
+                'action': MessageAction('Time to get started!'),
+                'observations': [],
+            },
+            {
+                'action': CmdRunAction(command='echo "foo"'),
+                'observations': [
+                    CmdOutputObservation(
+                        'foo', command_id=-1, command='echo "foo"', exit_code=0
+                    )
+                ],
+            },
+            {
+                'action': FileWriteAction(
+                    content='echo "Hello, World!"', path='hello.sh'
+                ),
+                'observations': [
+                    FileWriteObservation(
+                        content='echo "Hello, World!"', path='hello.sh'
+                    )
+                ],
+            },
+            {
+                'action': FileReadAction(path='hello.sh'),
+                'observations': [
+                    FileReadObservation('echo "Hello, World!"\n', path='hello.sh')
+                ],
+            },
+            {
+                'action': CmdRunAction(command='bash hello.sh'),
+                'observations': [
+                    CmdOutputObservation(
+                        'bash: hello.sh: No such file or directory',
+                        command_id=-1,
+                        command='bash workspace/hello.sh',
+                        exit_code=127,
+                    )
+                ],
+            },
+            {
+                'action': BrowseURLAction(url='https://google.com'),
+                'observations': [
+                    # BrowserOutputObservation('<html><body>Simulated Google page</body></html>',url='https://google.com',screenshot=''),
+                ],
+            },
+            {
+                'action': BrowseInteractiveAction(
+                    browser_actions='goto("https://google.com")'
+                ),
+                'observations': [
+                    # BrowserOutputObservation('<html><body>Simulated Google page after interaction</body></html>',url='https://google.com',screenshot=''),
+                ],
+            },
+            {
+                'action': AgentRejectAction(),
+                'observations': [NullObservation('')],
+            },
+            {
+                'action': AgentFinishAction(
+                    outputs={}, thought='Task completed', action='finish'
+                ),
+                'observations': [AgentStateChangedObservation('', AgentState.FINISHED)],
+            },
+        ]
+
+    def step(self, state: State) -> Action:
+        if state.iteration >= len(self.steps):
+            return AgentFinishAction()
+
+        current_step = self.steps[state.iteration]
+        action = current_step['action']
+
+        # If the action is AddTaskAction or ModifyTaskAction, update the parent ID or task_id
+        if isinstance(action, AddTaskAction):
+            if action.parent == 'None':
+                action.parent = ''  # Root task has no parent
+            elif action.parent == '0':
+                action.parent = state.root_task.id
+            elif action.parent.startswith('0.'):
+                action.parent = f'{state.root_task.id}{action.parent[1:]}'
+        elif isinstance(action, ModifyTaskAction):
+            if action.task_id == '0':
+                action.task_id = state.root_task.id
+            elif action.task_id.startswith('0.'):
+                action.task_id = f'{state.root_task.id}{action.task_id[1:]}'
+            # Ensure the task_id doesn't start with a dot
+            if action.task_id.startswith('.'):
+                action.task_id = action.task_id[1:]
+        elif isinstance(action, (BrowseURLAction, BrowseInteractiveAction)):
+            try:
+                return self.simulate_browser_action(action)
+            except (
+                Exception
+            ):  # This could be a specific exception for browser unavailability
+                return self.handle_browser_unavailable(action)
+
+        if state.iteration > 0:
+            prev_step = self.steps[state.iteration - 1]
+
+            if 'observations' in prev_step and prev_step['observations']:
+                expected_observations = prev_step['observations']
+                hist_events = state.history[-len(expected_observations) :]
+
+                if len(hist_events) < len(expected_observations):
+                    print(
+                        f'Warning: Expected {len(expected_observations)} observations, but got {len(hist_events)}'
+                    )
+
+                for i in range(min(len(expected_observations), len(hist_events))):
+                    hist_obs = event_to_dict(hist_events[i])
+                    expected_obs = event_to_dict(expected_observations[i])
+
+                    # Remove dynamic fields for comparison
+                    for obs in [hist_obs, expected_obs]:
+                        obs.pop('id', None)
+                        obs.pop('timestamp', None)
+                        obs.pop('cause', None)
+                        obs.pop('source', None)
+                        if 'extras' in obs:
+                            obs['extras'].pop('command_id', None)
+
+                    if hist_obs != expected_obs:
+                        print(
+                            f'Warning: Observation mismatch. Expected {expected_obs}, got {hist_obs}'
+                        )
+
+        return action
+
+    def simulate_browser_action(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        # Instead of simulating, we'll reject the browser action
+        return self.handle_browser_unavailable(action)
+
+    def handle_browser_unavailable(
+        self, action: Union[BrowseURLAction, BrowseInteractiveAction]
+    ) -> Action:
+        # Create a message action to inform that browsing is not available
+        message = 'Browser actions are not available in the DummyAgent environment.'
+        if isinstance(action, BrowseURLAction):
+            message += f' Unable to browse URL: {action.url}'
+        elif isinstance(action, BrowseInteractiveAction):
+            message += (
+                f' Unable to perform interactive browsing: {action.browser_actions}'
+            )
+        return MessageAction(content=message)
@@ -0,0 +1,82 @@
+from jinja2 import BaseLoader, Environment
+
+from openhands.agenthub.micro.instructions import instructions
+from openhands.agenthub.micro.registry import all_microagents
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.core.utils import json
+from openhands.events.action import Action
+from openhands.events.event import Event
+from openhands.events.serialization.action import action_from_dict
+from openhands.events.serialization.event import event_to_memory
+from openhands.llm.llm import LLM
+
+
+def parse_response(orig_response: str) -> Action:
+    # attempt to load the JSON dict from the response
+    action_dict = json.loads(orig_response)
+
+    # load the action from the dict
+    return action_from_dict(action_dict)
+
+
+def to_json(obj, **kwargs):
+    """Serialize an object to str format
+
+    Thin wrapper that forwards `obj` and all keyword arguments to `json.dumps`.
+    Note that `json` here is `openhands.core.utils.json`, not the standard library module.
+    """
+    return json.dumps(obj, **kwargs)
+
+
+class MicroAgent(Agent):
+    VERSION = '1.0'
+    prompt = ''
+    agent_definition: dict = {}
+
+    def history_to_json(self, history: list[Event], max_events: int = 20, **kwargs):
+        """
+        Serialize and simplify history to str format
+        """
+        processed_history = []
+        event_count = 0
+
+        for event in reversed(history):
+            if event_count >= max_events:
+                break
+            processed_history.append(
+                event_to_memory(event, self.llm.config.max_message_chars)
+            )
+            event_count += 1
+
+        # history is in reverse order, let's fix it
+        processed_history.reverse()
+
+        return json.dumps(processed_history, **kwargs)
+
+    def __init__(self, llm: LLM, config: AgentConfig):
+        super().__init__(llm, config)
+        if 'name' not in self.agent_definition:
+            raise ValueError('Agent definition must contain a name')
+        self.prompt_template = Environment(loader=BaseLoader).from_string(self.prompt)
+        self.delegates = all_microagents.copy()
+        del self.delegates[self.agent_definition['name']]
+
+    def step(self, state: State) -> Action:
+        last_user_message, last_image_urls = state.get_current_user_intent()
+        prompt = self.prompt_template.render(
+            state=state,
+            instructions=instructions,
+            to_json=to_json,
+            history_to_json=self.history_to_json,
+            delegates=self.delegates,
+            latest_user_message=last_user_message,
+        )
+        content = [TextContent(text=prompt)]
+        if self.llm.vision_is_active() and last_image_urls:
+            content.append(ImageContent(image_urls=last_image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(
+            messages=self.llm.format_messages_for_llm(message),
+        )
+        action_resp = resp['choices'][0]['message']['content']
+        action = parse_response(action_resp)
+        return action
@@ -0,0 +1,6 @@
+name: CoderAgent
+description: Given a particular task, and a detailed description of the codebase, accomplishes the task
+inputs:
+  task: string
+  summary: string
+outputs: {}
@@ -0,0 +1,6 @@
+name: CommitWriterAgent
+description: "Write a git commit message for files in the git staging area"
+inputs: {}
+outputs:
+  answer: string
+  reason: string
@@ -0,0 +1,22 @@
+import os
+
+instructions: dict = {}
+
+base_dir = os.path.dirname(os.path.abspath(__file__)) + '/_instructions'
+for root, dirs, files in os.walk(base_dir):
+    if len(files) == 0:
+        continue
+    if root == base_dir:
+        obj = instructions
+    else:
+        rel_base = os.path.relpath(root, base_dir)
+        keys = rel_base.split('/')
+        obj = instructions
+        for key in keys:
+            if key not in obj:
+                obj[key] = {}
+            obj = obj[key]
+    for file in files:
+        without_ext = os.path.splitext(file)[0]
+        with open(os.path.join(root, file), 'r') as f:
+            obj[without_ext] = f.read()
@@ -0,0 +1,8 @@
+name: ManagerAgent
+description: Delegates tasks to microagents based on their area of expertise
+generates: Action
+inputs:
+  task: string
+outputs:
+  summary: string # if finished
+  reason: string # if rejected
@@ -0,0 +1,24 @@
+name: MathAgent
+description: "Solves simple and complex math problems using python"
+container: python:3.12.3-bookworm
+inputs:
+  task: string
+outputs:
+  answer: string
+# Sample question/answer pairs; each entry uses the same `inputs`/`outputs`
+# keys as the schema above.
+examples:
+  - inputs:
+      task: "What is 2 + 2?"
+    outputs:
+      answer: "4"
+  - inputs:
+      task: "What is the area of a circle with radius 7.324 inches?"
+    outputs:
+      answer: "168.518 square inches"
+  - inputs:
+      task: "What day of the week is 2099-01-01?"
+    outputs:
+      answer: "Saturday"
+  - inputs:
+      task: "What is the integral of sin(x^2) evaluated from -1 to 1?"
+    outputs:
+      answer: "0.603848"
@@ -0,0 +1,5 @@
+name: PostgresAgent
+description: Writes and maintains PostgreSQL migrations
+inputs:
+  task: string
+outputs: {}
@@ -0,0 +1,27 @@
+import os
+
+import yaml
+
+all_microagents = {}
+
+# Get the list of directories and sort them to preserve determinism
+dirs = sorted(os.listdir(os.path.dirname(__file__)))
+
+for dir in dirs:
+    base = os.path.dirname(__file__) + '/' + dir
+    if os.path.isfile(base):
+        continue
+    if dir.startswith('_'):
+        continue
+    promptFile = base + '/prompt.md'
+    agentFile = base + '/agent.yaml'
+    if not os.path.isfile(promptFile) or not os.path.isfile(agentFile):
+        raise Exception(f'Missing prompt or agent file in {base}. Please create them.')
+    with open(promptFile, 'r') as f:
+        prompt = f.read()
+    with open(agentFile, 'r') as f:
+        agent = yaml.safe_load(f)
+    if 'name' not in agent:
+        raise Exception(f'Missing name in {agentFile}')
+    agent['prompt'] = prompt
+    all_microagents[agent['name']] = agent
@@ -0,0 +1,5 @@
+name: RepoExplorerAgent
+description: Generates a detailed summary of an existing codebase
+inputs: {}
+outputs:
+  summary: string
@@ -0,0 +1,6 @@
+name: StudyRepoForTaskAgent
+description: Given a particular task, finds and describes all relevant parts of the codebase
+inputs:
+  task: string
+outputs:
+  summary: string
@@ -0,0 +1,6 @@
+name: TypoFixerAgent
+description: Fixes typos in files in the current working directory
+inputs:
+  task: string
+outputs:
+  summary: string
@@ -0,0 +1,7 @@
+name: VerifierAgent
+description: Given a particular task, verifies that the task has been completed
+inputs:
+  task: string
+outputs:
+  completed: boolean
+  summary: string
@@ -1,2 +0,0 @@
-.envrc
-workspace
@@ -1,8 +0,0 @@
-# LLM control loop
-This is currently a standalone utility. It will need to be integrated into OpenDevin's backend.
-
-## Usage
-```bash
-# Run this in project root
-./agenthub/monologue_agent/build-and-run.sh "write a bash script that prints 'hello world'"
-```
@@ -1,8 +0,0 @@
-# TODO
-There's a lot of low-hanging fruit for this agent:
-
-* Strip `<script>`, `<style>`, and other non-text tags from the HTML before sending it to the LLM
-* Keep track of the working directory when the agent uses `cd`
-* Improve memory condensing--condense earlier memories more aggressively
-* Limit the time that `run` can wait (in case agent runs an interactive command and it's hanging)
-* Figure out how to run background processes, e.g. `node server.js` to start a server
@@ -1,4 +0,0 @@
-from opendevin.agent import Agent
-from .agent import MonologueAgent
-
-Agent.register('MonologueAgent', MonologueAgent)
@@ -1,235 +0,0 @@
-from typing import List
-from opendevin.agent import Agent
-from opendevin.state import State
-from opendevin.llm.llm import LLM
-from opendevin.schema import ActionType, ObservationType
-from opendevin.exceptions import AgentNoInstructionError
-
-from opendevin.action import (
-    Action,
-    NullAction,
-    CmdRunAction,
-    FileWriteAction,
-    FileReadAction,
-    AgentRecallAction,
-    BrowseURLAction,
-    AgentThinkAction,
-)
-
-from opendevin.observation import (
-    Observation,
-    NullObservation,
-    CmdOutputObservation,
-    FileReadObservation,
-    AgentRecallObservation,
-    BrowserOutputObservation,
-)
-
-import agenthub.monologue_agent.utils.prompts as prompts
-from agenthub.monologue_agent.utils.monologue import Monologue
-from agenthub.monologue_agent.utils.memory import LongTermMemory
-
-MAX_MONOLOGUE_LENGTH = 20000
-MAX_OUTPUT_LENGTH = 5000
-
-INITIAL_THOUGHTS = [
-    'I exist!',
-    'Hmm...looks like I can type in a command line prompt',
-    'Looks like I have a web browser too!',
-    "Here's what I want to do: $TASK",
-    'How am I going to get there though?',
-    'It seems like I have some kind of short term memory.',
-    'Each of my thoughts seems to be stored in a JSON array.',
-    'It seems whatever I say next will be added as an object to the list.',
-    'But no one has perfect short-term memory. My list of thoughts will be summarized and condensed over time, losing information in the process.',
-    'Fortunately I have long term memory!',
-    'I can just perform a recall action, followed by the thing I want to remember. And then related thoughts just spill out!',
-    "Sometimes they're random thoughts that don't really have to do with what I wanted to remember. But usually they're exactly what I need!",
-    "Let's try it out!",
-    'RECALL what it is I want to do',
-    "Here's what I want to do: $TASK",
-    'How am I going to get there though?',
-    "Neat! And it looks like it's easy for me to use the command line too! I just have to perform a run action and include the command I want to run in the command argument. The command output just jumps into my head!",
-    'RUN echo "hello world"',
-    'hello world',
-    'Cool! I bet I can write files too using the write action.',
-    "WRITE echo \"console.log('hello world')\" > test.js",
-    '',
-    "I just created test.js. I'll try and run it now.",
-    'RUN node test.js',
-    'hello world',
-    'It works!',
-    "I'm going to try reading it now using the read action.",
-    'READ test.js',
-    "console.log('hello world')",
-    'Nice! I can read files too!',
-    'And if I want to use the browser, I just need to use the browse action and include the url I want to visit in the url argument',
-    "Let's try that...",
-    'BROWSE google.com',
-    '<form><input type="text"></input><button type="submit"></button></form>',
-    'I can browse the web too!',
-    'And once I have completed my task, I can use the finish action to stop working.',
-    "But I should only use the finish action when I'm absolutely certain that I've completed my task and have tested my work.",
-    'Very cool. Now to accomplish my task.',
-    "I'll need a strategy. And as I make progress, I'll need to keep refining that strategy. I'll need to set goals, and break them into sub-goals.",
-    'In between actions, I must always take some time to think, strategize, and set new goals. I should never take two actions in a row.',
-    "OK so my task is to $TASK. I haven't made any progress yet. Where should I start?",
-    "It seems like there might be an existing project here. I should probably start by running `ls` to see what's here.",
-]
-
-
-class MonologueAgent(Agent):
-    """
-    The Monologue Agent utilizes long and short term memory to complete tasks.
-    Long term memory is stored as a LongTermMemory object and the model uses it to search for examples from the past.
-    Short term memory is stored as a Monologue object and the model can condense it as necessary.
-    """
-
-    _initialized = False
-
-    def __init__(self, llm: LLM):
-        """
-        Initializes the Monologue Agent with an llm, monologue, and memory.
-
-        Parameters:
-        - llm (LLM): The llm to be used by this agent
-        """
-        super().__init__(llm)
-        self.monologue = Monologue()
-        self.memory = LongTermMemory()
-
-    def _add_event(self, event: dict):
-        """
-        Adds a new event to the agent's monologue and memory.
-        Monologue automatically condenses when it gets too large.
-
-        Parameters:
-        - event (dict): The event that will be added to monologue and memory
-        """
-
-        if 'extras' in event and 'screenshot' in event['extras']:
-            del event['extras']['screenshot']
-        if (
-            'args' in event
-            and 'output' in event['args']
-            and len(event['args']['output']) > MAX_OUTPUT_LENGTH
-        ):
-            event['args']['output'] = (
-                event['args']['output'][:MAX_OUTPUT_LENGTH] + '...'
-            )
-
-        self.monologue.add_event(event)
-        self.memory.add_event(event)
-        if self.monologue.get_total_length() > MAX_MONOLOGUE_LENGTH:
-            self.monologue.condense(self.llm)
-
-    def _initialize(self, task: str):
-        """
-        Utilizes the INITIAL_THOUGHTS list to give the agent a context for it's capabilities and how to navigate the /workspace.
-        Short circuted to return when already initialized.
-
-        Parameters:
-        - task (str): The initial goal statement provided by the user
-
-        Raises:
-        - AgentNoInstructionError: If task is not provided
-        """
-
-        if self._initialized:
-            return
-
-        if task is None or task == '':
-            raise AgentNoInstructionError()
-        self.monologue = Monologue()
-        self.memory = LongTermMemory()
-
-        output_type = ''
-        for thought in INITIAL_THOUGHTS:
-            thought = thought.replace('$TASK', task)
-            if output_type != '':
-                observation: Observation = NullObservation(content='')
-                if output_type == ObservationType.RUN:
-                    observation = CmdOutputObservation(
-                        content=thought, command_id=0, command=''
-                    )
-                elif output_type == ObservationType.READ:
-                    observation = FileReadObservation(content=thought, path='')
-                elif output_type == ObservationType.RECALL:
-                    observation = AgentRecallObservation(
-                        content=thought, memories=[])
-                elif output_type == ObservationType.BROWSE:
-                    observation = BrowserOutputObservation(
-                        content=thought, url='', screenshot=''
-                    )
-                self._add_event(observation.to_dict())
-                output_type = ''
-            else:
-                action: Action = NullAction()
-                if thought.startswith('RUN'):
-                    command = thought.split('RUN ')[1]
-                    action = CmdRunAction(command)
-                    output_type = ActionType.RUN
-                elif thought.startswith('WRITE'):
-                    parts = thought.split('WRITE ')[1].split(' > ')
-                    path = parts[1]
-                    content = parts[0]
-                    action = FileWriteAction(path=path, content=content)
-                elif thought.startswith('READ'):
-                    path = thought.split('READ ')[1]
-                    action = FileReadAction(path=path)
-                    output_type = ActionType.READ
-                elif thought.startswith('RECALL'):
-                    query = thought.split('RECALL ')[1]
-                    action = AgentRecallAction(query=query)
-                    output_type = ActionType.RECALL
-                elif thought.startswith('BROWSE'):
-                    url = thought.split('BROWSE ')[1]
-                    action = BrowseURLAction(url=url)
-                    output_type = ActionType.BROWSE
-                else:
-                    action = AgentThinkAction(thought=thought)
-                self._add_event(action.to_dict())
-        self._initialized = True
-
-    def step(self, state: State) -> Action:
-        """
-        Modifies the current state by adding the most recent actions and observations, then prompts the model to think about it's next action to take using monologue, memory, and hint.
-
-        Parameters:
-        - state (State): The current state based on previous steps taken
-
-        Returns:
-        - Action: The next action to take based on LLM response
-        """
-        self._initialize(state.plan.main_goal)
-        for prev_action, obs in state.updated_info:
-            self._add_event(prev_action.to_dict())
-            self._add_event(obs.to_dict())
-
-        state.updated_info = []
-
-        prompt = prompts.get_request_action_prompt(
-            state.plan.main_goal,
-            self.monologue.get_thoughts(),
-            state.background_commands_obs,
-        )
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
-        action_resp = resp['choices'][0]['message']['content']
-        state.num_of_chars += len(prompt) + len(action_resp)
-        action = prompts.parse_action_response(action_resp)
-        self.latest_action = action
-        return action
-
-    def search_memory(self, query: str) -> List[str]:
-        """
-        Uses VectorIndexRetriever to find related memories within the long term memory.
-        Uses search to produce top 10 results.
-
-        Parameters:
-        - query (str): The query that we want to find related memories for
-
-        Returns:
-        - List[str]: A list of top 10 text results that matched the query
-        """
-        return self.memory.search(query)
@@ -1,37 +0,0 @@
-import json
-from json_repair import repair_json
-
-
-def my_encoder(obj):
-    """
-    Encodes objects as dictionaries
-
-    Parameters:
-    - obj (Object): An object that will be converted
-
-    Returns:
-    - dict: If the object can be converted it is returned in dict format
-    """
-    if hasattr(obj, 'to_dict'):
-        return obj.to_dict()
-
-
-def dumps(obj, **kwargs):
-    """
-    Serialize an object to str format
-    """
-
-    return json.dumps(obj, default=my_encoder, **kwargs)
-
-
-def loads(s, **kwargs):
-    """
-    Create a JSON object from str
-    """
-    json_start = s.find('{')
-    json_end = s.rfind('}') + 1
-    if json_start == -1 or json_end == -1:
-        raise ValueError('Invalid response: no JSON found')
-    s = s[json_start:json_end]
-    s = repair_json(s)
-    return json.loads(s, **kwargs)
@@ -1,104 +0,0 @@
-import chromadb
-from llama_index.core import Document
-from llama_index.core.retrievers import VectorIndexRetriever
-from llama_index.core import VectorStoreIndex
-from llama_index.vector_stores.chroma import ChromaVectorStore
-
-from opendevin import config
-from . import json
-
-embedding_strategy = config.get('LLM_EMBEDDING_MODEL')
-
-# TODO: More embeddings: https://docs.llamaindex.ai/en/stable/examples/embeddings/OpenAI/
-# There's probably a more programmatic way to do this.
-if embedding_strategy == 'llama2':
-    from llama_index.embeddings.ollama import OllamaEmbedding
-    embed_model = OllamaEmbedding(
-        model_name='llama2',
-        base_url=config.get('LLM_BASE_URL', required=True),
-        ollama_additional_kwargs={'mirostat': 0},
-    )
-elif embedding_strategy == 'openai':
-    from llama_index.embeddings.openai import OpenAIEmbedding
-    embed_model = OpenAIEmbedding(
-        model='text-embedding-ada-002',
-        api_key=config.get('LLM_API_KEY', required=True)
-    )
-elif embedding_strategy == 'azureopenai':
-    # Need to instruct to set these env variables in documentation
-    from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
-    embed_model = AzureOpenAIEmbedding(
-        model='text-embedding-ada-002',
-        deployment_name=config.get('LLM_DEPLOYMENT_NAME', required=True),
-        api_key=config.get('LLM_API_KEY', required=True),
-        azure_endpoint=config.get('LLM_BASE_URL', required=True),
-        api_version=config.get('LLM_API_VERSION', required=True),
-    )
-else:
-    from llama_index.embeddings.huggingface import HuggingFaceEmbedding
-    embed_model = HuggingFaceEmbedding(
-        model_name='BAAI/bge-small-en-v1.5'
-    )
-
-
-class LongTermMemory:
-    """
-    Responsible for storing information that the agent can call on later for better insights and context.
-    Uses chromadb to store and search through memories.
-    """
-
-    def __init__(self):
-        """
-        Initialize the chromadb and set up ChromaVectorStore for later use.
-        """
-        db = chromadb.Client()
-        self.collection = db.get_or_create_collection(name='memories')
-        vector_store = ChromaVectorStore(chroma_collection=self.collection)
-        self.index = VectorStoreIndex.from_vector_store(
-            vector_store, embed_model=embed_model)
-        self.thought_idx = 0
-
-    def add_event(self, event: dict):
-        """
-        Adds a new event to the long term memory with a unique id.
-
-        Parameters:
-        - event (dict): The new event to be added to memory
-        """
-        id = ''
-        t = ''
-        if 'action' in event:
-            t = 'action'
-            id = event['action']
-        elif 'observation' in event:
-            t = 'observation'
-            id = event['observation']
-        doc = Document(
-            text=json.dumps(event),
-            doc_id=str(self.thought_idx),
-            extra_info={
-                'type': t,
-                'id': id,
-                'idx': self.thought_idx,
-            },
-        )
-        self.thought_idx += 1
-        self.index.insert(doc)
-
-    def search(self, query: str, k: int = 10):
-        """
-        Searches through the current memory using VectorIndexRetriever
-
-        Parameters:
-        - query (str): A query to match search results to
-        - k (int): Number of top results to return
-
-        Returns:
-        - List[str]: List of top k results found in current memory
-        """
-        retriever = VectorIndexRetriever(
-            index=self.index,
-            similarity_top_k=k,
-        )
-        results = retriever.retrieve(query)
-        return [r.get_text() for r in results]
@@ -1,78 +0,0 @@
-import traceback
-
-from opendevin.llm.llm import LLM
-from opendevin.exceptions import AgentEventTypeError
-import agenthub.monologue_agent.utils.json as json
-import agenthub.monologue_agent.utils.prompts as prompts
-
-
-class Monologue:
-    """
-    The monologue is a representation for the agent's internal monologue where it can think.
-    The agent has the capability of using this monologue for whatever it wants.
-    """
-
-    def __init__(self):
-        """
-        Initialize the empty list of thoughts
-        """
-        self.thoughts = []
-
-    def add_event(self, t: dict):
-        """
-        Adds an event to memory if it is a valid event.
-
-        Parameters:
-        - t (dict): The thought that we want to add to memory
-
-        Raises:
-        - AgentEventTypeError: If t is not a dict
-        """
-        if not isinstance(t, dict):
-            raise AgentEventTypeError()
-        self.thoughts.append(t)
-
-    def get_thoughts(self):
-        """
-        Get the current thoughts of the agent.
-
-        Returns:
-        - List: The list of thoughts that the agent has.
-        """
-        return self.thoughts
-
-    def get_total_length(self):
-        """
-        Gives the total number of characters in all thoughts
-
-        Returns:
-        - Int: Total number of chars in thoughts.
-        """
-        total_length = 0
-        for t in self.thoughts:
-            try:
-                total_length += len(json.dumps(t))
-            except TypeError as e:
-                print(f'Error serializing thought: {e}')
-        return total_length
-
-    def condense(self, llm: LLM):
-        """
-        Attempts to condense the monologue by using the llm
-
-        Parameters:
-        - llm (LLM): llm to be used for summarization
-
-        Raises:
-        - RunTimeError: When the condensing process fails for any reason
-        """
-
-        try:
-            prompt = prompts.get_summarize_monologue_prompt(self.thoughts)
-            messages = [{'content': prompt, 'role': 'user'}]
-            resp = llm.completion(messages=messages)
-            summary_resp = resp['choices'][0]['message']['content']
-            self.thoughts = prompts.parse_summary_response(summary_resp)
-        except Exception as e:
-            traceback.print_exc()
-            raise RuntimeError(f'Error condensing thoughts: {e}')
@@ -1,194 +0,0 @@
-from typing import List
-
-from . import json
-from json import JSONDecodeError
-
-import re
-
-from opendevin.action import (
-    action_from_dict,
-    Action,
-)
-from opendevin.observation import (
-    CmdOutputObservation,
-)
-from opendevin.exceptions import LLMOutputError
-
-ACTION_PROMPT = """
-You're a thoughtful robot. Your main task is this:
-%(task)s
-
-Don't expand the scope of your task--just complete it as written.
-
-This is your internal monologue, in JSON format:
-
-%(monologue)s
-
-
-Your most recent thought is at the bottom of that monologue. Continue your train of thought.
-What is your next thought or action? Your response must be in JSON format.
-It must be an object, and it must contain two fields:
-* `action`, which is one of the actions below
-* `args`, which is a map of key-value pairs, specifying the arguments for that action
-
-Here are the possible actions:
-* `read` - reads the content of a file. Arguments:
-  * `path` - the path of the file to read
-* `write` - writes the content to a file. Arguments:
-  * `path` - the path of the file to write
-  * `content` - the content to write to the file
-* `run` - runs a command. Arguments:
-  * `command` - the command to run
-  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
-* `kill` - kills a background command
-  * `id` - the ID of the background command to kill
-* `browse` - opens a web page. Arguments:
-  * `url` - the URL to open
-* `recall` - recalls a past memory. Arguments:
-  * `query` - the query to search for
-* `think` - make a plan, set a goal, or record your thoughts. Arguments:
-  * `thought` - the thought to record
-* `finish` - if you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.
-
-%(background_commands)s
-
-You MUST take time to think in between read, write, run, browse, and recall actions.
-You should never act twice in a row without thinking. But if your last several
-actions are all "think" actions, you should consider taking a different action.
-
-Notes:
-* your environment is Debian Linux. You can install software with `apt`
-* your working directory will not change, even if you run `cd`. All commands will be run in the `/workspace` directory.
-* don't run interactive commands, or commands that don't return (e.g. `node server.js`). You may run commands in the background (e.g. `node server.js &`)
-
-What is your next thought or action? Again, you must reply with JSON, and only with JSON.
-
-%(hint)s
-"""
-
-MONOLOGUE_SUMMARY_PROMPT = """
-Below is the internal monologue of an automated LLM agent. Each
-thought is an item in a JSON array. The thoughts may be memories,
-actions taken by the agent, or outputs from those actions.
-Please return a new, smaller JSON array, which summarizes the
-internal monologue. You can summarize individual thoughts, and
-you can condense related thoughts together with a description
-of their content.
-
-%(monologue)s
-
-Make the summaries as pithy and informative as possible.
-Be specific about what happened and what was learned. The summary
-will be used as keywords for searching for the original memory.
-Be sure to preserve any key words or important information.
-
-Your response must be in JSON format. It must be an object with the
-key `new_monologue`, which is a JSON array containing the summarized monologue.
-Each entry in the array must have an `action` key, and an `args` key.
-The action key may be `summarize`, and `args.summary` should contain the summary.
-You can also use the same action and args from the source monologue.
-"""
-
-
-def get_summarize_monologue_prompt(thoughts: List[dict]):
-    """
-    Gets the prompt for summarizing the monologue
-
-    Returns:
-    - str: A formatted string with the current monologue within the prompt
-    """
-    return MONOLOGUE_SUMMARY_PROMPT % {
-        'monologue': json.dumps({'old_monologue': thoughts}, indent=2),
-    }
-
-
-def get_request_action_prompt(
-    task: str,
-    thoughts: List[dict],
-    background_commands_obs: List[CmdOutputObservation] = [],
-):
-    """
-    Gets the action prompt formatted with appropriate values.
-
-    Parameters:
-    - task (str): The current task the agent is trying to accomplish
-    - thoughts (List[dict]): The agent's current thoughts
-    - background_commands_obs (List[CmdOutputObservation]): List of all observed background commands running
-
-    Returns:
-    - str: Formatted prompt string with hint, task, monologue, and background included
-    """
-
-    hint = ''
-    if len(thoughts) > 0:
-        latest_thought = thoughts[-1]
-        if 'action' in latest_thought:
-            if latest_thought['action'] == 'think':
-                if latest_thought['args']['thought'].startswith('OK so my task is'):
-                    hint = "You're just getting started! What should you do first?"
-                else:
-                    hint = "You've been thinking a lot lately. Maybe it's time to take action?"
-            elif latest_thought['action'] == 'error':
-                hint = 'Looks like that last command failed. Maybe you need to fix it, or try something else.'
-
-    bg_commands_message = ''
-    if len(background_commands_obs) > 0:
-        bg_commands_message = 'The following commands are running in the background:'
-        for command_obs in background_commands_obs:
-            bg_commands_message += (
-                f'\n`{command_obs.command_id}`: {command_obs.command}'
-            )
-        bg_commands_message += '\nYou can end any process by sending a `kill` action with the numerical `id` above.'
-
-    return ACTION_PROMPT % {
-        'task': task,
-        'monologue': json.dumps(thoughts, indent=2),
-        'background_commands': bg_commands_message,
-        'hint': hint,
-    }
-
-
-def parse_action_response(response: str) -> Action:
-    """
-    Parses a string to find an action within it
-
-    Parameters:
-    - response (str): The string to be parsed
-
-    Returns:
-    - Action: The action that was found in the response string
-    """
-    try:
-        action_dict = json.loads(response)
-    except JSONDecodeError:
-        # Find response-looking json in the output and use the more promising one. Helps with weak llms
-        response_json_matches = re.finditer(
-            r"""{\s*\"action\":\s?\"(\w+)\"(?:,?|,\s*\"args\":\s?{((?:.|\s)*?)})\s*}""",
-            response)  # Find all response-looking strings
-
-        def rank(match):
-            return len(match[2]) if match[1] == 'think' else 130  # Crudely rank multiple responses by length
-        try:
-            action_dict = json.loads(max(response_json_matches, key=rank)[0])  # Use the highest ranked response
-        except ValueError as e:
-            raise LLMOutputError(
-                "Output from the LLM isn't properly formatted. The model may be misconfigured."
-            ) from e
-    if 'content' in action_dict:
-        # The LLM gets confused here. Might as well be robust
-        action_dict['contents'] = action_dict.pop('content')
-    return action_from_dict(action_dict)
-
-
-def parse_summary_response(response: str) -> List[dict]:
-    """
-    Parses a summary of the monologue
-
-    Parameters:
-    - response (str): The response string to be parsed
-
-    Returns:
-    - List[dict]: The list of summaries output by the model
-    """
-    parsed = json.loads(response)
-    return parsed['new_monologue']
@@ -1,4 +1,4 @@
-from opendevin.agent import Agent
-from .agent import PlannerAgent
+from openhands.agenthub.planner_agent.agent import PlannerAgent
+from openhands.controller.agent import Agent

 Agent.register('PlannerAgent', PlannerAgent)
@@ -1,31 +1,31 @@
-from typing import List
-from .prompt import get_prompt, parse_response
-
-from opendevin.agent import Agent
-from opendevin.action import AgentFinishAction
-from opendevin.llm.llm import LLM
-from opendevin.state import State
-from opendevin.action import Action
+from openhands.agenthub.planner_agent.prompt import get_prompt_and_images
+from openhands.agenthub.planner_agent.response_parser import PlannerResponseParser
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.message import ImageContent, Message, TextContent
+from openhands.events.action import Action, AgentFinishAction
+from openhands.llm.llm import LLM


 class PlannerAgent(Agent):
+    VERSION = '1.0'
    """
    The planner agent utilizes a special prompting strategy to create long term plans for solving problems.
    The agent is given its previous action-observation pairs, current task, and hint based on last action taken at every step.
    """
+    response_parser = PlannerResponseParser()

-    def __init__(self, llm: LLM):
-        """
-        Initialize the Planner Agent with an LLM
+    def __init__(self, llm: LLM, config: AgentConfig):
+        """Initialize the Planner Agent with an LLM

        Parameters:
        - llm (LLM): The llm to be used by this agent
        """
-        super().__init__(llm)
+        super().__init__(llm, config)

    def step(self, state: State) -> Action:
-        """
-        Checks to see if current step is completed, returns AgentFinishAction if True.
+        """Checks to see if current step is completed, returns AgentFinishAction if True.
        Otherwise, creates a plan prompt and sends to model for inference, returning the result as the next action.

        Parameters:
@@ -35,16 +35,19 @@ class PlannerAgent(Agent):
        - AgentFinishAction: If the last state was 'completed', 'verified', or 'abandoned'
        - Action: The next action to take based on llm response
        """
-
-        if state.plan.task.state in ['completed', 'verified', 'abandoned']:
+        if state.root_task.state in [
+            'completed',
+            'verified',
+            'abandoned',
+        ]:
            return AgentFinishAction()
-        prompt = get_prompt(state.plan, state.history)
-        messages = [{'content': prompt, 'role': 'user'}]
-        resp = self.llm.completion(messages=messages)
-        action_resp = resp['choices'][0]['message']['content']
-        state.num_of_chars += len(prompt) + len(action_resp)
-        action = parse_response(action_resp)
-        return action

-    def search_memory(self, query: str) -> List[str]:
-        return []
+        prompt, image_urls = get_prompt_and_images(
+            state, self.llm.config.max_message_chars
+        )
+        content = [TextContent(text=prompt)]
+        if self.llm.vision_is_active() and image_urls:
+            content.append(ImageContent(image_urls=image_urls))
+        message = Message(role='user', content=content)
+        resp = self.llm.completion(messages=self.llm.format_messages_for_llm(message))
+        return self.response_parser.parse(resp)
@@ -1,46 +1,15 @@
-import json
-from typing import List, Tuple, Dict, Type
-
-from opendevin.controller.agent_controller import print_with_color
-from opendevin.plan import Plan
-from opendevin.action import Action, action_from_dict
-from opendevin.observation import Observation
-from opendevin.schema import ActionType
-
-from opendevin.action import (
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema import ActionType
+from openhands.core.utils import json
+from openhands.events.action import (
+    Action,
    NullAction,
-    CmdRunAction,
-    CmdKillAction,
-    BrowseURLAction,
-    FileReadAction,
-    FileWriteAction,
-    AgentRecallAction,
-    AgentThinkAction,
-    AgentFinishAction,
-    AgentSummarizeAction,
-    AddTaskAction,
-    ModifyTaskAction,
 )
+from openhands.events.serialization.action import action_from_dict
+from openhands.events.serialization.event import event_to_memory

-from opendevin.observation import (
-    NullObservation,
-)
-
-ACTION_TYPE_TO_CLASS: Dict[str, Type[Action]] = {
-    ActionType.RUN: CmdRunAction,
-    ActionType.KILL: CmdKillAction,
-    ActionType.BROWSE: BrowseURLAction,
-    ActionType.READ: FileReadAction,
-    ActionType.WRITE: FileWriteAction,
-    ActionType.RECALL: AgentRecallAction,
-    ActionType.THINK: AgentThinkAction,
-    ActionType.SUMMARIZE: AgentSummarizeAction,
-    ActionType.FINISH: AgentFinishAction,
-    ActionType.ADD_TASK: AddTaskAction,
-    ActionType.MODIFY_TASK: ModifyTaskAction,
-}
-
-HISTORY_SIZE = 10
+HISTORY_SIZE = 20

 prompt = """
 # Task
@@ -105,25 +74,23 @@ It must be an object, and it must contain two fields:
  * `content` - the content to write to the file
 * `run` - runs a command on the command line in a Linux shell. Arguments:
  * `command` - the command to run
-  * `background` - if true, run the command in the background, so that other commands can be run concurrently. Useful for e.g. starting a server. You won't be able to see the logs. You don't need to end the command with `&`, just set this to true.
-* `kill` - kills a background command
-  * `id` - the ID of the background command to kill
 * `browse` - opens a web page. Arguments:
  * `url` - the URL to open
-* `think` - make a plan, set a goal, or record your thoughts. Arguments:
-  * `thought` - the thought to record
+* `message` - make a plan, set a goal, record your thoughts, or ask for more input from the user. Arguments:
+  * `content` - the message to record
+  * `wait_for_response` - set to `true` to wait for the user to respond before proceeding
 * `add_task` - add a task to your plan. Arguments:
-  * `parent` - the ID of the parent task
+  * `parent` - the ID of the parent task (leave empty if it should go at the top level)
  * `goal` - the goal of the task
  * `subtasks` - a list of subtasks, each of which is a map with a `goal` key.
 * `modify_task` - close a task. Arguments:
-  * `id` - the ID of the task to close
+  * `task_id` - the ID of the task to close
  * `state` - set to 'in_progress' to start the task, 'completed' to finish it, 'verified' to assert that it was successful, 'abandoned' to give up on it permanently, or `open` to stop working on it for now.
 * `finish` - if ALL of your tasks and subtasks have been verified or abandoned, and you're absolutely certain that you've completed your task and have tested your work, use the finish action to stop working.

-You MUST take time to think in between read, write, run, browse, and recall actions.
+You MUST take time to think in between read, write, run, and browse actions--do this with the `message` action.
 You should never act twice in a row without thinking. But if your last several
-actions are all `think` actions, you should consider taking a different action.
+actions are all `message` actions, you should consider taking a different action.

 What is your next thought or action? Again, you must reply with JSON, and only with JSON.

@@ -131,96 +98,91 @@ What is your next thought or action? Again, you must reply with JSON, and only w
 """


-def get_prompt(plan: Plan, history: List[Tuple[Action, Observation]]) -> str:
-    """
-    Gets the prompt for the planner agent.
+def get_hint(latest_action_id: str) -> str:
+    """Returns action type hint based on given action_id"""
+    hints = {
+        '': "You haven't taken any actions yet. Start by using `ls` to check out what files you're working with.",
+        ActionType.RUN: 'You should think about the command you just ran, what output it gave, and how that affects your plan.',
+        ActionType.READ: 'You should think about the file you just read, what you learned from it, and how that affects your plan.',
+        ActionType.WRITE: 'You just changed a file. You should think about how it affects your plan.',
+        ActionType.BROWSE: 'You should think about the page you just visited, and what you learned from it.',
+        ActionType.MESSAGE: "Look at your last thought in the history above. What does it suggest? Don't think anymore--take action.",
+        ActionType.ADD_TASK: 'You should think about the next action to take.',
+        ActionType.MODIFY_TASK: 'You should think about the next action to take.',
+        ActionType.SUMMARIZE: '',
+        ActionType.FINISH: '',
+    }
+    return hints.get(latest_action_id, '')
+
+
+def get_prompt_and_images(
+    state: State, max_message_chars: int
+) -> tuple[str, list[str] | None]:
+    """Gets the prompt for the planner agent.
+
    Formatted with the most recent action-observation pairs, current task, and hint based on last action

    Parameters:
-    - plan (Plan): The original plan outlined by the user with LLM defined tasks
-    - history (List[Tuple[Action, Observation]]): List of corresponding action-observation pairs
+    - state (State): The state of the current agent

    Returns:
    - str: The formatted string prompt with historical values
    """
+    # the plan
+    plan_str = json.dumps(state.root_task.to_dict(), indent=2)

-    plan_str = json.dumps(plan.task.to_dict(), indent=2)
-    sub_history = history[-HISTORY_SIZE:]
+    # the history
    history_dicts = []
    latest_action: Action = NullAction()
-    for action, observation in sub_history:
-        if not isinstance(action, NullAction):
-            history_dicts.append(action.to_dict())
-            latest_action = action
-        if not isinstance(observation, NullObservation):
-            observation_dict = observation.to_dict()
-            if (
-                'extras' in observation_dict
-                and 'screenshot' in observation_dict['extras']
-            ):
-                del observation_dict['extras']['screenshot']
-            history_dicts.append(observation_dict)
+
+    # retrieve the latest HISTORY_SIZE events
+    for event_count, event in enumerate(reversed(state.history)):
+        if event_count >= HISTORY_SIZE:
+            break
+        if latest_action == NullAction() and isinstance(event, Action):
+            latest_action = event
+        history_dicts.append(event_to_memory(event, max_message_chars))
+
+    # history_dicts is in reverse order, lets fix it
+    history_dicts.reverse()
+
+    # and get it as a JSON string
    history_str = json.dumps(history_dicts, indent=2)

-    hint = ''
-    current_task = plan.get_current_task()
+    # the plan status
+    current_task = state.root_task.get_current_task()
    if current_task is not None:
        plan_status = f"You're currently working on this task:\n{current_task.goal}."
        if len(current_task.subtasks) == 0:
            plan_status += "\nIf it's not achievable AND verifiable with a SINGLE action, you MUST break it down into subtasks NOW."
    else:
        plan_status = "You're not currently working on any tasks. Your next action MUST be to mark a task as in_progress."
-        hint = plan_status

-    latest_action_id = latest_action.to_dict()['action']
+    # the hint, based on the last action
+    hint = get_hint(event_to_memory(latest_action, max_message_chars).get('action', ''))
+    logger.debug('HINT:\n' + hint, extra={'msg_type': 'DETAIL'})

-    if current_task is not None:
-        if latest_action_id == '':
-            hint = "You haven't taken any actions yet. Start by using `ls` to check out what files you're working with."
-        elif latest_action_id == ActionType.RUN:
-            hint = 'You should think about the command you just ran, what output it gave, and how that affects your plan.'
-        elif latest_action_id == ActionType.READ:
-            hint = 'You should think about the file you just read, what you learned from it, and how that affects your plan.'
-        elif latest_action_id == ActionType.WRITE:
-            hint = 'You just changed a file. You should think about how it affects your plan.'
-        elif latest_action_id == ActionType.BROWSE:
-            hint = 'You should think about the page you just visited, and what you learned from it.'
-        elif latest_action_id == ActionType.THINK:
-            hint = "Look at your last thought in the history above. What does it suggest? Don't think anymore--take action."
-        elif latest_action_id == ActionType.RECALL:
-            hint = 'You should think about the information you just recalled, and how it should affect your plan.'
-        elif latest_action_id == ActionType.ADD_TASK:
-            hint = 'You should think about the next action to take.'
-        elif latest_action_id == ActionType.MODIFY_TASK:
-            hint = 'You should think about the next action to take.'
-        elif latest_action_id == ActionType.SUMMARIZE:
-            hint = ''
-        elif latest_action_id == ActionType.FINISH:
-            hint = ''
+    # the last relevant user message (the task)
+    message, image_urls = state.get_current_user_intent()

-    print_with_color('HINT:\n' + hint, 'INFO')
+    # finally, fill in the prompt
    return prompt % {
-        'task': plan.main_goal,
+        'task': message,
        'plan': plan_str,
        'history': history_str,
        'hint': hint,
        'plan_status': plan_status,
-    }
+    }, image_urls


 def parse_response(response: str) -> Action:
-    """
-    Parses the model output to find a valid action to take
-
+    """Parses the model output to find a valid action to take
    Parameters:
    - response (str): A response from the model that potentially contains an Action.

    Returns:
    - Action: A valid next action to perform from model output
    """
-    json_start = response.find('{')
-    json_end = response.rfind('}') + 1
-    response = response[json_start:json_end]
    action_dict = json.loads(response)
    if 'contents' in action_dict:
        # The LLM gets confused here. Might as well be robust
@@ -0,0 +1,37 @@
+from openhands.controller.action_parser import ResponseParser
+from openhands.core.utils import json
+from openhands.events.action import (
+    Action,
+)
+from openhands.events.serialization.action import action_from_dict
+
+
+class PlannerResponseParser(ResponseParser):
+    def __init__(self):
+        super().__init__()
+
+    def parse(self, response: str) -> Action:
+        action_str = self.parse_response(response)
+        return self.parse_action(action_str)
+
+    def parse_response(self, response) -> str:
+        # get the next action from the response
+        return response['choices'][0]['message']['content']
+
+    def parse_action(self, action_str: str) -> Action:
+        """Parses a string to find an action within it
+
+        Parameters:
+        - response (str): The string to be parsed
+
+        Returns:
+        - Action: The action that was found in the response string
+        """
+        # attempt to load the JSON dict from the response
+        action_dict = json.loads(action_str)
+
+        if 'content' in action_dict:
+            # The LLM gets confused here. Might as well be robust
+            action_dict['contents'] = action_dict.pop('content')
+
+        return action_from_dict(action_dict)
@@ -1,54 +0,0 @@
-FROM node:21.7.2-bookworm-slim as frontend-builder
-
-WORKDIR /app
-
-COPY ./frontend/package.json frontend/package-lock.json ./
-RUN npm install
-
-COPY ./frontend ./
-RUN npm run make-i18n && npm run build
-
-FROM python:3.12-slim as backend-builder
-
-WORKDIR /app
-ENV PYTHONPATH '/app'
-
-ENV POETRY_NO_INTERACTION=1 \
-    POETRY_VIRTUALENVS_IN_PROJECT=1 \
-    POETRY_VIRTUALENVS_CREATE=1 \
-    POETRY_CACHE_DIR=/tmp/poetry_cache
-
-RUN apt-get update -y \
-    && apt-get install -y curl make git build-essential \
-    && python3 -m pip install poetry==1.8.2  --break-system-packages
-
-COPY ./pyproject.toml ./poetry.lock ./
-RUN touch README.md
-RUN poetry install --without evaluation --no-root && rm -rf $POETRY_CACHE_DIR
-
-FROM python:3.12-slim as runtime
-
-WORKDIR /app
-
-ENV RUN_AS_DEVIN=false
-ENV USE_HOST_NETWORK=false
-ENV SSH_HOSTNAME=host.docker.internal
-ENV WORKSPACE_BASE=/opt/workspace_base
-RUN mkdir -p $WORKSPACE_BASE
-
-RUN apt-get update -y \
-    && apt-get install -y curl ssh
-
-ENV VIRTUAL_ENV=/app/.venv \
-    PATH="/app/.venv/bin:$PATH" \
-    PYTHONPATH='/app'
-
-COPY --from=backend-builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
-
-COPY ./opendevin ./opendevin
-COPY ./agenthub ./agenthub
-RUN python opendevin/download.py # No-op to download assets
-
-COPY --from=frontend-builder /app/dist ./frontend/dist
-
-CMD ["uvicorn", "opendevin.server.listen:app", "--host", "0.0.0.0", "--port", "3000"]
@@ -1,2 +0,0 @@
-DOCKER_REPOSITORY=ghcr.io/opendevin/opendevin
-DOCKER_BASE_DIR="."
@@ -1,48 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-image_name=$1
-push=0
-if [[ $2 == "--push" ]]; then
-  push=1
-fi
-
-echo -e "Building: $image_name"
-tags=(latest)
-if [[ -n $GITHUB_REF_NAME ]]; then
-  # check if ref name is a version number
-  if [[ $GITHUB_REF_NAME =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
-    major_version=$(echo $GITHUB_REF_NAME | cut -d. -f1)
-    minor_version=$(echo $GITHUB_REF_NAME | cut -d. -f1,2)
-    tags+=($major_version $minor_version)
-  fi
-  sanitized=$(echo $GITHUB_REF_NAME | sed 's/[^a-zA-Z0-9.-]\+/-/g')
-  tags+=($sanitized)
-fi
-echo "Tags: ${tags[@]}"
-
-dir=./containers/$image_name
-if [ ! -f $dir/Dockerfile ]; then
-  echo "No Dockerfile found"
-  exit 1
-fi
-if [ ! -f $dir/config.sh ]; then
-  echo "No config.sh found for Dockerfile"
-  exit 1
-fi
-source $dir/config.sh
-echo "Repo: $DOCKER_REPOSITORY"
-echo "Base dir: $DOCKER_BASE_DIR"
-#docker pull $DOCKER_REPOSITORY:main || true # try to get any cached layers
-args=""
-for tag in ${tags[@]}; do
-  args+=" -t $DOCKER_REPOSITORY:$tag"
-done
-if [[ $push -eq 1 ]]; then
-  args+=" --push"
-fi
-
-docker buildx build \
-  $args \
-  --platform linux/amd64,linux/arm64 \
-  -f $dir/Dockerfile $DOCKER_BASE_DIR
@@ -1,41 +0,0 @@
-FROM ubuntu:20.04
-
-# https://github.com/princeton-nlp/SWE-bench/issues/15#issuecomment-1815392192
-RUN apt-get update && \
-    apt-get install -y bash gcc git jq wget && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN git config --global user.email "swebench@pnlp.org"
-RUN git config --global user.name "swebench"
-
-RUN apt update && apt install -y build-essential
-
-# Create new user
-RUN useradd -ms /bin/bash swe-bench
-USER swe-bench
-WORKDIR /home/swe-bench
-
-# Setup Conda
-ENV PATH="/home/swe-bench/miniconda3/bin:${PATH}"
-ARG PATH="/home/swe-bench/miniconda3/bin:${PATH}"
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-`uname -m`.sh -O miniconda.sh \
-    && mkdir ~/.conda \
-    && bash miniconda.sh -b \
-    && rm -f miniconda.sh
-RUN conda --version
-
-# Setup SWE-Bench Env
-COPY environment.yml .
-RUN conda env create -f environment.yml
-
-# Add commands
-COPY ./commands.sh .
-RUN . ./commands.sh
-
-# Some missing packages
-RUN pip install datasets python-dotenv gitpython
-
-RUN conda init bash
-
-CMD ["/bin/bash"]
@@ -1,2 +0,0 @@
-DOCKER_REPOSITORY=ghcr.io/opendevin/eval-swe-bench
-DOCKER_BASE_DIR=evaluation/SWE-bench
@@ -1,21 +0,0 @@
-FROM ubuntu:22.04
-
-# install basic packages
-RUN apt-get update && apt-get install -y \
-    curl \
-    wget \
-    git \
-    vim \
-    nano \
-    unzip \
-    zip \
-    python3 \
-    python3-pip \
-    python3-venv \
-    python3-dev \
-    build-essential \
-    openssh-server \
-    sudo \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN service ssh start
@@ -1,2 +0,0 @@
-DOCKER_REPOSITORY=ghcr.io/opendevin/sandbox
-DOCKER_BASE_DIR="."
@@ -0,0 +1,5 @@
+from openhands.controller.agent_controller import AgentController
+
+__all__ = [
+    'AgentController',
+]
@@ -0,0 +1,77 @@
+from abc import ABC, abstractmethod
+
+from openhands.events.action import Action
+
+
+class ActionParseError(Exception):
+    """Exception raised when the response from the LLM cannot be parsed into an action."""
+
+    def __init__(self, error: str):
+        self.error = error
+
+    def __str__(self):
+        return self.error
+
+
+class ResponseParser(ABC):
+    """This abstract base class is a general interface for an response parser dedicated to
+    parsing the action from the response from the LLM.
+    """
+
+    def __init__(
+        self,
+    ):
+        # Need pay attention to the item order in self.action_parsers
+        self.action_parsers = []
+
+    @abstractmethod
+    def parse(self, response: str) -> Action:
+        """Parses the action from the response from the LLM.
+
+        Parameters:
+        - response (str): The response from the LLM.
+
+        Returns:
+        - action (Action): The action parsed from the response.
+        """
+        pass
+
+    @abstractmethod
+    def parse_response(self, response) -> str:
+        """Parses the action from the response from the LLM.
+
+        Parameters:
+        - response (str): The response from the LLM.
+
+        Returns:
+        - action_str (str): The action str parsed from the response.
+        """
+        pass
+
+    @abstractmethod
+    def parse_action(self, action_str: str) -> Action:
+        """Parses the action from the response from the LLM.
+
+        Parameters:
+        - action_str (str): The response from the LLM.
+
+        Returns:
+        - action (Action): The action parsed from the response.
+        """
+        pass
+
+
+class ActionParser(ABC):
+    """This abstract base class is a general interface for an action parser dedicated to
+    parsing the action from the action str from the LLM.
+    """
+
+    @abstractmethod
+    def check_condition(self, action_str: str) -> bool:
+        """Check if the action string can be parsed by this parser."""
+        pass
+
+    @abstractmethod
+    def parse(self, action_str: str) -> Action:
+        """Parses the action from the action string from the LLM response."""
+        pass
@@ -1,14 +1,21 @@
 from abc import ABC, abstractmethod
-from typing import List, Dict, Type, TYPE_CHECKING
+from typing import TYPE_CHECKING, Type

 if TYPE_CHECKING:
-    from opendevin.action import Action
-    from opendevin.state import State
-from opendevin.llm.llm import LLM
-from opendevin.exceptions import AgentAlreadyRegisteredError, AgentNotRegisteredError
+    from openhands.controller.state.state import State
+    from openhands.core.config import AgentConfig
+    from openhands.events.action import Action
+from openhands.core.exceptions import (
+    AgentAlreadyRegisteredError,
+    AgentNotRegisteredError,
+)
+from openhands.llm.llm import LLM
+from openhands.runtime.plugins import PluginRequirement
+from openhands.utils.prompt import PromptManager


 class Agent(ABC):
+    DEPRECATED = False
    """
    This abstract base class is an general interface for an agent dedicated to
    executing a specific instruction and allowing human interaction with the
@@ -16,19 +23,22 @@ class Agent(ABC):
    It tracks the execution status and maintains a history of interactions.
    """

-    _registry: Dict[str, Type['Agent']] = {}
+    _registry: dict[str, Type['Agent']] = {}
+    sandbox_plugins: list[PluginRequirement] = []

    def __init__(
-            self,
-            llm: LLM,
+        self,
+        llm: LLM,
+        config: 'AgentConfig',
    ):
        self.llm = llm
+        self.config = config
        self._complete = False
+        self.prompt_manager: PromptManager | None = None

    @property
    def complete(self) -> bool:
-        """
-        Indicates whether the current instruction execution is complete.
+        """Indicates whether the current instruction execution is complete.

        Returns:
        - complete (bool): True if execution is complete; False otherwise.
@@ -37,37 +47,29 @@ class Agent(ABC):

    @abstractmethod
    def step(self, state: 'State') -> 'Action':
-        """
-        Starts the execution of the assigned instruction. This method should
+        """Starts the execution of the assigned instruction. This method should
        be implemented by subclasses to define the specific execution logic.
        """
        pass

-    @abstractmethod
-    def search_memory(self, query: str) -> List[str]:
-        """
-        Searches the agent's memory for information relevant to the given query.
-
-        Parameters:
-        - query (str): The query to search for in the agent's memory.
-
-        Returns:
-        - response (str): The response to the query.
-        """
-        pass
-
    def reset(self) -> None:
-        """
-        Resets the agent's execution status and clears the history. This method can be used
+        """Resets the agent's execution status and clears the history. This method can be used
        to prepare the agent for restarting the instruction or cleaning up before destruction.

        """
+        # TODO clear history
        self._complete = False

+        if self.llm:
+            self.llm.reset()
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
    @classmethod
    def register(cls, name: str, agent_cls: Type['Agent']):
-        """
-        Registers an agent class in the registry.
+        """Registers an agent class in the registry.

        Parameters:
        - name (str): The name to register the class under.
@@ -82,8 +84,7 @@ class Agent(ABC):

    @classmethod
    def get_cls(cls, name: str) -> Type['Agent']:
-        """
-        Retrieves an agent class from the registry.
+        """Retrieves an agent class from the registry.

        Parameters:
        - name (str): The name of the class to retrieve
@@ -100,8 +101,7 @@ class Agent(ABC):

    @classmethod
    def list_agents(cls) -> list[str]:
-        """
-        Retrieves the list of all agent names from the registry.
+        """Retrieves the list of all agent names from the registry.

        Raises:
        - AgentNotRegisteredError: If no agent is registered
@@ -0,0 +1,955 @@
+import asyncio
+import copy
+import os
+import traceback
+from typing import Callable, ClassVar, Type
+
+import litellm
+from litellm.exceptions import BadRequestError, ContextWindowExceededError
+
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State, TrafficControlState
+from openhands.controller.stuck import StuckDetector
+from openhands.core.config import AgentConfig, LLMConfig
+from openhands.core.exceptions import (
+    FunctionCallNotExistsError,
+    FunctionCallValidationError,
+    LLMMalformedActionError,
+    LLMNoActionError,
+    LLMResponseError,
+)
+from openhands.core.logger import LOG_ALL_EVENTS
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema import AgentState
+from openhands.events import EventSource, EventStream, EventStreamSubscriber
+from openhands.events.action import (
+    Action,
+    ActionConfirmationStatus,
+    AddTaskAction,
+    AgentDelegateAction,
+    AgentFinishAction,
+    AgentRejectAction,
+    ChangeAgentStateAction,
+    CmdRunAction,
+    IPythonRunCellAction,
+    MessageAction,
+    ModifyTaskAction,
+    NullAction,
+)
+from openhands.events.event import Event
+from openhands.events.observation import (
+    AgentDelegateObservation,
+    AgentStateChangedObservation,
+    ErrorObservation,
+    NullObservation,
+    Observation,
+)
+from openhands.events.serialization.event import truncate_content
+from openhands.llm.llm import LLM
+from openhands.utils.shutdown_listener import should_continue
+
+# note: RESUME is only available on web GUI
+TRAFFIC_CONTROL_REMINDER = (
+    "Please click on resume button if you'd like to continue, or start a new task."
+)
+
+
+class AgentController:
+    """Runs an `Agent` against an `EventStream`.
+
+    The controller subscribes to the event stream, reacts to the actions and
+    observations it receives, and steps the agent (or its delegate) in a loop.
+    """
+
+    # session id; also passed as the subscription id to the event stream
+    id: str
+    agent: Agent
+    max_iterations: int
+    event_stream: EventStream
+    state: State
+    confirmation_mode: bool
+    # per-agent-name overrides that are looked up when delegating to another agent
+    agent_to_llm_config: dict[str, LLMConfig]
+    agent_configs: dict[str, AgentConfig]
+    agent_task: asyncio.Future | None = None
+    parent: 'AgentController | None' = None
+    # controller of the delegate agent while a subtask is delegated, else None
+    delegate: 'AgentController | None' = None
+    # runnable action emitted by the agent that is still waiting for its observation
+    _pending_action: Action | None = None
+    # set by close(); the step loop stops once it is True
+    _closed: bool = False
+    # event types that are kept out of state.history
+    filter_out: ClassVar[tuple[type[Event], ...]] = (
+        NullAction,
+        NullObservation,
+        ChangeAgentStateAction,
+        AgentStateChangedObservation,
+    )
+
+    def __init__(
+        self,
+        agent: Agent,
+        event_stream: EventStream,
+        max_iterations: int,
+        max_budget_per_task: float | None = None,
+        agent_to_llm_config: dict[str, LLMConfig] | None = None,
+        agent_configs: dict[str, AgentConfig] | None = None,
+        sid: str = 'default',
+        confirmation_mode: bool = False,
+        initial_state: State | None = None,
+        is_delegate: bool = False,
+        headless_mode: bool = True,
+        status_callback: Callable | None = None,
+    ):
+        """Initializes a new instance of the AgentController class.
+
+        Args:
+            agent: The agent instance to control.
+            event_stream: The event stream to publish events to.
+            max_iterations: The maximum number of iterations the agent can run.
+            max_budget_per_task: The maximum budget (in USD) allowed per task, beyond which the agent will stop.
+            agent_to_llm_config: A dictionary mapping agent names to LLM configurations in the case that
+                we delegate to a different agent.
+            agent_configs: A dictionary mapping agent names to agent configurations in the case that
+                we delegate to a different agent.
+            sid: The session ID of the agent.
+            confirmation_mode: Whether to enable confirmation mode for agent actions.
+            initial_state: The initial state of the controller.
+            is_delegate: Whether this controller is a delegate.
+            headless_mode: Whether the agent is run in headless mode.
+            status_callback: Optional callback function to handle status updates.
+        """
+        self._step_lock = asyncio.Lock()
+        self.id = sid
+        self.agent = agent
+        self.headless_mode = headless_mode
+
+        # subscribe to the event stream
+        self.event_stream = event_stream
+        self.event_stream.subscribe(
+            EventStreamSubscriber.AGENT_CONTROLLER, self.on_event, self.id
+        )
+
+        # state from the previous session, state from a parent agent, or a fresh state
+        self.set_initial_state(
+            state=initial_state,
+            max_iterations=max_iterations,
+            confirmation_mode=confirmation_mode,
+        )
+        self.max_budget_per_task = max_budget_per_task
+        self.agent_to_llm_config = agent_to_llm_config if agent_to_llm_config else {}
+        self.agent_configs = agent_configs if agent_configs else {}
+        self._initial_max_iterations = max_iterations
+        self._initial_max_budget_per_task = max_budget_per_task
+
+        # stuck helper
+        self._stuck_detector = StuckDetector(self.state)
+        self.status_callback = status_callback
+
+    async def close(self) -> None:
+        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream.
+
+        Note that it's fairly important that this closes properly, otherwise the state is incomplete.
+        """
+        await self.set_agent_state_to(AgentState.STOPPED)
+
+        # we made history, now is the time to rewrite it!
+        # the final state.history will be used by external scripts like evals, tests, etc.
+        # history will need to be complete WITH delegates events
+        # like the regular agent history, it does not include:
+        # - 'hidden' events, events with hidden=True
+        # - backend events (the default 'filtered out' types, types in self.filter_out)
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
+        self.state.history = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+
+        # unsubscribe from the event stream
+        self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER, self.id)
+        self._closed = True
+
+    def log(self, level: str, message: str, extra: dict | None = None) -> None:
+        """Logs a message to the agent controller's logger.
+
+        Args:
+            level (str): The logging level to use (e.g., 'info', 'debug', 'error').
+            message (str): The message to log.
+            extra (dict | None, optional): Additional fields to include in the log. Defaults to None.
+        """
+        message = f'[Agent Controller {self.id}] {message}'
+        getattr(logger, level)(message, extra=extra, stacklevel=2)
+
+    def update_state_before_step(self):
+        self.state.iteration += 1
+        self.state.local_iteration += 1
+
+    async def update_state_after_step(self):
+        # update metrics especially for cost. Use deepcopy to avoid it being modified by agent._reset()
+        self.state.local_metrics = copy.deepcopy(self.agent.llm.metrics)
+
+    async def _react_to_exception(
+        self,
+        e: Exception,
+    ):
+        await self.set_agent_state_to(AgentState.ERROR)
+        if self.status_callback is not None:
+            err_id = ''
+            if isinstance(e, litellm.AuthenticationError):
+                err_id = 'STATUS$ERROR_LLM_AUTHENTICATION'
+            self.status_callback('error', err_id, str(e))
+
+    async def start_step_loop(self):
+        """The main loop for the agent's step-by-step execution."""
+        self.log('info', 'Starting step loop...')
+        while True:
+            if not self._is_awaiting_observation() and not should_continue():
+                break
+            if self._closed:
+                break
+            try:
+                await self._step()
+            except asyncio.CancelledError:
+                self.log('debug', 'AgentController task was cancelled')
+                break
+            except Exception as e:
+                traceback.print_exc()
+                self.log('error', f'Error while running the agent: {e}')
+                await self._react_to_exception(e)
+
+            await asyncio.sleep(0.1)
+
+    async def on_event(self, event: Event) -> None:
+        """Callback from the event stream. Notifies the controller of incoming events.
+
+        Args:
+            event (Event): The incoming event to process.
+        """
+        if hasattr(event, 'hidden') and event.hidden:
+            return
+
+        # if the event is not filtered out, add it to the history
+        if not any(isinstance(event, filter_type) for filter_type in self.filter_out):
+            self.state.history.append(event)
+
+        if isinstance(event, Action):
+            await self._handle_action(event)
+        elif isinstance(event, Observation):
+            await self._handle_observation(event)
+
+    async def _handle_action(self, action: Action) -> None:
+        """Handles actions from the event stream.
+
+        Args:
+            action (Action): The action to handle.
+        """
+        if isinstance(action, ChangeAgentStateAction):
+            await self.set_agent_state_to(action.agent_state)  # type: ignore
+        elif isinstance(action, MessageAction):
+            await self._handle_message_action(action)
+        elif isinstance(action, AgentDelegateAction):
+            await self.start_delegate(action)
+        elif isinstance(action, AddTaskAction):
+            self.state.root_task.add_subtask(
+                action.parent, action.goal, action.subtasks
+            )
+        elif isinstance(action, ModifyTaskAction):
+            self.state.root_task.set_subtask_state(action.task_id, action.state)
+        elif isinstance(action, AgentFinishAction):
+            self.state.outputs = action.outputs
+            self.state.metrics.merge(self.state.local_metrics)
+            await self.set_agent_state_to(AgentState.FINISHED)
+        elif isinstance(action, AgentRejectAction):
+            self.state.outputs = action.outputs
+            self.state.metrics.merge(self.state.local_metrics)
+            await self.set_agent_state_to(AgentState.REJECTED)
+
+    async def _handle_observation(self, observation: Observation) -> None:
+        """Handles observation from the event stream.
+
+        Args:
+            observation (observation): The observation to handle.
+        """
+        observation_to_print = copy.deepcopy(observation)
+        if len(observation_to_print.content) > self.agent.llm.config.max_message_chars:
+            observation_to_print.content = truncate_content(
+                observation_to_print.content, self.agent.llm.config.max_message_chars
+            )
+        # Use info level if LOG_ALL_EVENTS is set
+        log_level = 'info' if os.getenv('LOG_ALL_EVENTS') in ('true', '1') else 'debug'
+        self.log(
+            log_level, str(observation_to_print), extra={'msg_type': 'OBSERVATION'}
+        )
+
+        if observation.llm_metrics is not None:
+            self.agent.llm.metrics.merge(observation.llm_metrics)
+
+        if self._pending_action and self._pending_action.id == observation.cause:
+            if self.state.agent_state == AgentState.AWAITING_USER_CONFIRMATION:
+                return
+            self._pending_action = None
+            if self.state.agent_state == AgentState.USER_CONFIRMED:
+                await self.set_agent_state_to(AgentState.RUNNING)
+            if self.state.agent_state == AgentState.USER_REJECTED:
+                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+            return
+        elif isinstance(observation, ErrorObservation):
+            if self.state.agent_state == AgentState.ERROR:
+                self.state.metrics.merge(self.state.local_metrics)
+
+    async def _handle_message_action(self, action: MessageAction) -> None:
+        """Handles message actions from the event stream.
+
+        Args:
+            action (MessageAction): The message action to handle.
+        """
+        if action.source == EventSource.USER:
+            # Use info level if LOG_ALL_EVENTS is set
+            log_level = (
+                'info' if os.getenv('LOG_ALL_EVENTS') in ('true', '1') else 'debug'
+            )
+            self.log(
+                log_level,
+                str(action),
+                extra={'msg_type': 'ACTION', 'event_source': EventSource.USER},
+            )
+            # Extend max iterations when the user sends a message (only in non-headless mode)
+            if self._initial_max_iterations is not None and not self.headless_mode:
+                self.state.max_iterations = (
+                    self.state.iteration + self._initial_max_iterations
+                )
+                if (
+                    self.state.traffic_control_state == TrafficControlState.THROTTLING
+                    or self.state.traffic_control_state == TrafficControlState.PAUSED
+                ):
+                    self.state.traffic_control_state = TrafficControlState.NORMAL
+                self.log(
+                    'debug',
+                    f'Extended max iterations to {self.state.max_iterations} after user message',
+                )
+            if self.get_agent_state() != AgentState.RUNNING:
+                await self.set_agent_state_to(AgentState.RUNNING)
+        elif action.source == EventSource.AGENT and action.wait_for_response:
+            await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+
+    def _reset(self) -> None:
+        """Resets the agent controller"""
+
+        self._pending_action = None
+        self.agent.reset()
+
+    async def set_agent_state_to(self, new_state: AgentState) -> None:
+        """Updates the agent's state and handles side effects. Can emit events to the event stream.
+
+        Nothing is changed or emitted when `new_state` equals the current state.
+        Otherwise the transition-specific side effects run first, then the state
+        is stored and an `AgentStateChangedObservation` is added to the event
+        stream.
+
+        Args:
+            new_state (AgentState): The new state to set for the agent.
+        """
+        # NOTE: the transition is logged even when it turns out to be a no-op below
+        self.log(
+            'info',
+            f'Setting agent({self.agent.name}) state from {self.state.agent_state} to {new_state}',
+        )
+
+        if new_state == self.state.agent_state:
+            return
+
+        if new_state in (AgentState.STOPPED, AgentState.ERROR):
+            # stopping or failing drops the pending action and resets the agent
+            self._reset()
+        elif (
+            new_state == AgentState.RUNNING
+            and self.state.agent_state == AgentState.PAUSED
+            # TODO: do we really need both THROTTLING and PAUSED states, or can we clean up one of them completely?
+            and self.state.traffic_control_state == TrafficControlState.THROTTLING
+        ):
+            # user intends to interrupt traffic control and let the task resume temporarily
+            self.state.traffic_control_state = TrafficControlState.PAUSED
+            # User has chosen to deliberately continue - extend the iteration limit
+            # by the initial limit if it has been reached (non-headless mode only)
+            if (
+                self.state.iteration is not None
+                and self.state.max_iterations is not None
+                and self._initial_max_iterations is not None
+                and not self.headless_mode
+            ):
+                if self.state.iteration >= self.state.max_iterations:
+                    self.state.max_iterations += self._initial_max_iterations
+
+            # likewise extend the budget by the initial budget if it has been used up
+            if (
+                self.state.metrics.accumulated_cost is not None
+                and self.max_budget_per_task is not None
+                and self._initial_max_budget_per_task is not None
+            ):
+                if self.state.metrics.accumulated_cost >= self.max_budget_per_task:
+                    self.max_budget_per_task += self._initial_max_budget_per_task
+        elif self._pending_action is not None and (
+            new_state in (AgentState.USER_CONFIRMED, AgentState.USER_REJECTED)
+        ):
+            # the user decided on the pending action: re-emit it with the decision attached
+            if hasattr(self._pending_action, 'thought'):
+                self._pending_action.thought = ''  # type: ignore[union-attr]
+            if new_state == AgentState.USER_CONFIRMED:
+                confirmation_state = ActionConfirmationStatus.CONFIRMED
+            else:
+                confirmation_state = ActionConfirmationStatus.REJECTED
+            self._pending_action.confirmation_state = confirmation_state  # type: ignore[attr-defined]
+            # presumably cleared so that the event stream assigns a fresh id on re-add — TODO confirm
+            self._pending_action._id = None  # type: ignore[attr-defined]
+            self.event_stream.add_event(self._pending_action, EventSource.AGENT)
+
+        # store the new state and announce it on the event stream
+        self.state.agent_state = new_state
+        self.event_stream.add_event(
+            AgentStateChangedObservation('', self.state.agent_state),
+            EventSource.ENVIRONMENT,
+        )
+
+        # after entering INIT, continue straight into the state saved in resume_state (if any)
+        if new_state == AgentState.INIT and self.state.resume_state:
+            await self.set_agent_state_to(self.state.resume_state)
+            self.state.resume_state = None
+
+    def get_agent_state(self) -> AgentState:
+        """Returns the current state of the agent.
+
+        Returns:
+            AgentState: The current state of the agent.
+        """
+        return self.state.agent_state
+
+    async def start_delegate(self, action: AgentDelegateAction) -> None:
+        """Start a delegate agent to handle a subtask.
+
+        OpenHands is a multi-agentic system. A `task` is a conversation between
+        OpenHands (the whole system) and the user, which might involve one or more inputs
+        from the user. It starts with an initial input (typically a task statement) from
+        the user, and ends with either an `AgentFinishAction` initiated by the agent, a
+        stop initiated by the user, or an error.
+
+        A `subtask` is a conversation between an agent and the user, or another agent. If a `task`
+        is conducted by a single agent, then it's also a `subtask`. Otherwise, a `task` consists of
+        multiple `subtasks`, each executed by one agent.
+
+        Args:
+            action (AgentDelegateAction): The action containing information about the delegate agent to start.
+        """
+        agent_cls: Type[Agent] = Agent.get_cls(action.agent)
+        agent_config = self.agent_configs.get(action.agent, self.agent.config)
+        llm_config = self.agent_to_llm_config.get(action.agent, self.agent.llm.config)
+        llm = LLM(config=llm_config)
+        delegate_agent = agent_cls(llm=llm, config=agent_config)
+        state = State(
+            inputs=action.inputs or {},
+            local_iteration=0,
+            iteration=self.state.iteration,
+            max_iterations=self.state.max_iterations,
+            delegate_level=self.state.delegate_level + 1,
+            # global metrics should be shared between parent and child
+            metrics=self.state.metrics,
+            # start on top of the stream
+            start_id=self.event_stream.get_latest_event_id() + 1,
+        )
+        self.log(
+            'debug',
+            f'start delegate, creating agent {delegate_agent.name} using LLM {llm}',
+        )
+
+        self.event_stream.unsubscribe(EventStreamSubscriber.AGENT_CONTROLLER, self.id)
+        self.delegate = AgentController(
+            sid=self.id + '-delegate',
+            agent=delegate_agent,
+            event_stream=self.event_stream,
+            max_iterations=self.state.max_iterations,
+            max_budget_per_task=self.max_budget_per_task,
+            agent_to_llm_config=self.agent_to_llm_config,
+            agent_configs=self.agent_configs,
+            initial_state=state,
+            is_delegate=True,
+            headless_mode=self.headless_mode,
+        )
+        await self.delegate.set_agent_state_to(AgentState.RUNNING)
+
+    async def _step(self) -> None:
+        """Executes a single step of the parent or delegate agent. Detects stuck agents and limits on the number of iterations and the task budget.
+
+        The step is skipped (after a one second sleep) while the agent is not
+        RUNNING or while a pending action still waits for its observation. When a
+        delegate exists, the delegate is stepped instead of this agent.
+        """
+        # only a RUNNING agent takes steps
+        if self.get_agent_state() != AgentState.RUNNING:
+            await asyncio.sleep(1)
+            return
+
+        # wait until the observation for the previous runnable action has arrived
+        if self._pending_action:
+            await asyncio.sleep(1)
+            return
+
+        # while a delegate exists, it is stepped in place of this agent
+        if self.delegate is not None:
+            assert self.delegate != self
+            if self.delegate.get_agent_state() == AgentState.PAUSED:
+                # no need to check too often
+                await asyncio.sleep(1)
+            else:
+                await self._delegate_step()
+            return
+
+        self.log(
+            'info',
+            f'LEVEL {self.state.delegate_level} LOCAL STEP {self.state.local_iteration} GLOBAL STEP {self.state.iteration}',
+            extra={'msg_type': 'STEP'},
+        )
+
+        # check if agent hit the resources limit
+        stop_step = False
+        if self.state.iteration >= self.state.max_iterations:
+            stop_step = await self._handle_traffic_control(
+                'iteration', self.state.iteration, self.state.max_iterations
+            )
+        if self.max_budget_per_task is not None:
+            current_cost = self.state.metrics.accumulated_cost
+            # NOTE(review): when both limits are hit, this result replaces the
+            # stop_step value returned for the iteration limit above
+            if current_cost > self.max_budget_per_task:
+                stop_step = await self._handle_traffic_control(
+                    'budget', current_cost, self.max_budget_per_task
+                )
+        if stop_step:
+            return
+
+        # a stuck agent is treated like an error
+        if self._is_stuck():
+            await self._react_to_exception(RuntimeError('Agent got stuck in a loop'))
+            return
+
+        # the iteration counters are advanced before the agent is asked for an action
+        self.update_state_before_step()
+        action: Action = NullAction()
+        try:
+            action = self.agent.step(self.state)
+            if action is None:
+                raise LLMNoActionError('No action was returned')
+        except (
+            LLMMalformedActionError,
+            LLMNoActionError,
+            LLMResponseError,
+            FunctionCallValidationError,
+            FunctionCallNotExistsError,
+        ) as e:
+            # these errors are published as an ErrorObservation and end this step
+            self.event_stream.add_event(
+                ErrorObservation(
+                    content=str(e),
+                ),
+                EventSource.AGENT,
+            )
+            return
+        except (ContextWindowExceededError, BadRequestError) as e:
+            # FIXME: this is a hack until a litellm fix is confirmed
+            # Check if this is a nested context window error
+            error_str = str(e).lower()
+            if (
+                'contextwindowexceedederror' in error_str
+                or 'prompt is too long' in error_str
+                or isinstance(e, ContextWindowExceededError)
+            ):
+                # When context window is exceeded, keep roughly half of agent interactions
+                self.state.history = self._apply_conversation_window(self.state.history)
+
+                # Save the ID of the first event in our truncated history for future reloading
+                if self.state.history:
+                    self.state.start_id = self.state.history[0].id
+                # Don't add error event - let the agent retry with reduced context
+                return
+            # any other BadRequestError propagates to the caller
+            raise
+
+        if action.runnable:
+            # in confirmation mode, exactly these two action types (not their
+            # subclasses) are marked as awaiting confirmation
+            if self.state.confirmation_mode and (
+                type(action) is CmdRunAction or type(action) is IPythonRunCellAction
+            ):
+                action.confirmation_state = (
+                    ActionConfirmationStatus.AWAITING_CONFIRMATION
+                )
+            # remember the runnable action until its observation arrives
+            self._pending_action = action
+
+        if not isinstance(action, NullAction):
+            if (
+                hasattr(action, 'confirmation_state')
+                and action.confirmation_state
+                == ActionConfirmationStatus.AWAITING_CONFIRMATION
+            ):
+                await self.set_agent_state_to(AgentState.AWAITING_USER_CONFIRMATION)
+            self.event_stream.add_event(action, EventSource.AGENT)
+
+        await self.update_state_after_step()
+
+        # NOTE(review): this uses the imported LOG_ALL_EVENTS constant, whereas
+        # _handle_observation and _handle_message_action read the environment
+        # variable directly — confirm both are meant to agree
+        log_level = 'info' if LOG_ALL_EVENTS else 'debug'
+        self.log(log_level, str(action), extra={'msg_type': 'ACTION'})
+
+    async def _delegate_step(self) -> None:
+        """Runs one step of the delegate agent and cleans up once it terminates.
+
+        After the delegate's step, the delegate's agent state decides what happens:
+        - ERROR: the shared iteration counter is copied back to this controller, an
+          AgentDelegateObservation reporting the error is emitted, the delegate is
+          closed, and this controller resubscribes to the event stream.
+        - FINISHED or REJECTED: the iteration counter is copied back, the delegate
+          is closed, this controller resubscribes, and an AgentDelegateObservation
+          carrying the delegate's outputs is emitted.
+        - Any other state: nothing else is done and the delegate stays attached.
+        """
+        # Check for a delegate *before* dereferencing it, so a missing delegate
+        # fails with a clear assertion instead of an AttributeError on None.
+        assert self.delegate is not None
+        await self.delegate._step()
+        # Re-check after the await: the original code asserted at this point
+        # (presumably in case the delegate is cleared while the step is
+        # awaited -- TODO confirm).
+        assert self.delegate is not None
+        delegate_state = self.delegate.get_agent_state()
+        self.log('debug', f'Delegate state: {delegate_state}')
+        if delegate_state == AgentState.ERROR:
+            # update iteration that shall be shared across agents
+            self.state.iteration = self.delegate.state.iteration
+
+            # emit AgentDelegateObservation to mark delegate termination due to error
+            delegate_outputs = (
+                self.delegate.state.outputs if self.delegate.state else {}
+            )
+            content = (
+                f'{self.delegate.agent.name} encountered an error during execution.'
+            )
+            obs = AgentDelegateObservation(outputs=delegate_outputs, content=content)
+            # NOTE(review): here the observation is emitted *before* the delegate
+            # is closed, while the FINISHED/REJECTED branch closes first -- confirm
+            # the ordering difference is intentional.
+            self.event_stream.add_event(obs, EventSource.AGENT)
+
+            # close the delegate upon error
+            await self.delegate.close()
+
+            # resubscribe parent when delegate is finished
+            self.event_stream.subscribe(
+                EventStreamSubscriber.AGENT_CONTROLLER, self.on_event, self.id
+            )
+            self.delegate = None
+            self.delegateAction = None
+
+        elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
+            self.log('debug', 'Delegate agent has finished execution')
+            # retrieve delegate result
+            outputs = self.delegate.state.outputs if self.delegate.state else {}
+
+            # update iteration that shall be shared across agents
+            self.state.iteration = self.delegate.state.iteration
+
+            # close delegate controller: we must close the delegate controller before adding new events
+            await self.delegate.close()
+
+            # resubscribe parent when delegate is finished
+            self.event_stream.subscribe(
+                EventStreamSubscriber.AGENT_CONTROLLER, self.on_event, self.id
+            )
+
+            # update delegate result observation
+            # TODO: replace this with AI-generated summary (#2395)
+            formatted_output = ', '.join(
+                f'{key}: {value}' for key, value in outputs.items()
+            )
+            content = (
+                f'{self.delegate.agent.name} finishes task with {formatted_output}'
+            )
+            obs = AgentDelegateObservation(outputs=outputs, content=content)
+
+            # clean up delegate status
+            self.delegate = None
+            self.delegateAction = None
+            self.event_stream.add_event(obs, EventSource.AGENT)
+
+
+    async def _handle_traffic_control(
+        self, limit_type: str, current_value: float, max_value: float
+    ) -> bool:
+        """Reacts to a traffic-control limit (e.g. iterations or budget) being hit.
+
+        If traffic control was PAUSED, it is switched back to NORMAL and the step
+        is allowed to continue. Otherwise the state becomes THROTTLING and a
+        RuntimeError describing the limit is passed to `_react_to_exception`.
+
+        Args:
+            limit_type (str): The type of limit that was hit.
+            current_value (float): The current value of the limit.
+            max_value (float): The maximum value of the limit.
+
+        Returns:
+            bool: True if the current step should stop, False if it may proceed.
+        """
+        # Traffic control was paused on request: lift the pause and keep going.
+        if self.state.traffic_control_state == TrafficControlState.PAUSED:
+            self.log(
+                'debug', 'Hitting traffic control, temporarily resume upon user request'
+            )
+            self.state.traffic_control_state = TrafficControlState.NORMAL
+            return False
+
+        self.state.traffic_control_state = TrafficControlState.THROTTLING
+
+        # Iteration counts are shown as integers, everything else with two decimals
+        whole_numbers = limit_type == 'iteration'
+        current_str = (
+            str(int(current_value)) if whole_numbers else f'{current_value:.2f}'
+        )
+        max_str = str(int(max_value)) if whole_numbers else f'{max_value:.2f}'
+
+        if self.headless_mode:
+            message = (
+                f'Agent reached maximum {limit_type} in headless mode. '
+                f'Current {limit_type}: {current_str}, max {limit_type}: {max_str}'
+            )
+        else:
+            message = (
+                f'Agent reached maximum {limit_type}. '
+                f'Current {limit_type}: {current_str}, max {limit_type}: {max_str}. '
+            )
+
+        # FIXME: this isn't really an exception--we should have a different path
+        await self._react_to_exception(RuntimeError(message))
+        return True
+
+    def get_state(self) -> State:
+        """Gives access to the state object the controller is operating on.
+
+        Returns:
+            State: The controller's current state (the same instance, not a copy).
+        """
+        current_state = self.state
+        return current_state
+
+    def set_initial_state(
+        self,
+        state: State | None,
+        max_iterations: int,
+        confirmation_mode: bool = False,
+    ) -> None:
+        """Installs the controller's starting state.
+
+        A provided state (restored from a previous session, or handed down by a
+        parent agent) is adopted and its history is rebuilt from the event stream.
+        When no state is provided, a fresh one is created.
+
+        Args:
+            state: The state to initialize with, or None to create a new state.
+            max_iterations: The maximum number of iterations allowed for the task.
+                Only used when a new state is created.
+            confirmation_mode: Whether to enable confirmation mode.
+                Only used when a new state is created.
+        """
+        if state is not None:
+            # Existing state: either restored from a previous session (the event
+            # stream has history for it) or passed by a parent agent (no history yet).
+            self.state = state
+
+            # a negative start_id means "unset"; begin from the first event
+            if self.state.start_id <= -1:
+                self.state.start_id = 0
+
+            self.log(
+                'debug',
+                f'AgentController {self.id} initializing history from event {self.state.start_id}',
+            )
+
+            self._init_history()
+            return
+
+        # No state given: start from scratch with an empty history.
+        self.state = State(
+            inputs={},
+            max_iterations=max_iterations,
+            confirmation_mode=confirmation_mode,
+        )
+
+    def _init_history(self) -> None:
+        """Initializes the agent's history from the event stream.
+
+        The history is a list of events that:
+        - Excludes events of types listed in self.filter_out
+        - Excludes events with hidden=True attribute
+        - For delegate events (between AgentDelegateAction and AgentDelegateObservation):
+            - Excludes all events between the action and observation
+            - Includes the delegate action and observation themselves
+
+        The history is loaded in two parts if truncation_id is set:
+        1. First user message from start_id onwards
+        2. Rest of history from truncation_id to the end
+
+        Otherwise loads normally from start_id.
+
+        Side effects:
+            Sets self.state.history. Unless the sanity check fails, also sets
+            self.state.start_id to the id the bulk of the history was fetched from.
+        """
+        # define range of events to fetch
+        # delegates start with a start_id and initially won't find any events
+        # otherwise we're restoring a previous session
+        start_id = self.state.start_id if self.state.start_id >= 0 else 0
+        end_id = (
+            self.state.end_id
+            if self.state.end_id >= 0
+            else self.event_stream.get_latest_event_id()
+        )
+
+        # sanity check
+        if start_id > end_id + 1:
+            self.log(
+                'warning',
+                f'start_id {start_id} is greater than end_id + 1 ({end_id + 1}). History will be empty.',
+            )
+            self.state.history = []
+            return
+
+        events: list[Event] = []
+
+        # If we have a truncation point, get first user message and then rest of history
+        if hasattr(self.state, 'truncation_id') and self.state.truncation_id > 0:
+            # Find first user message from stream
+            first_user_msg = next(
+                (
+                    e
+                    for e in self.event_stream.get_events(
+                        start_id=start_id,
+                        end_id=end_id,
+                        reverse=False,
+                        filter_out_type=self.filter_out,
+                        filter_hidden=True,
+                    )
+                    if isinstance(e, MessageAction) and e.source == EventSource.USER
+                ),
+                None,
+            )
+            if first_user_msg:
+                events.append(first_user_msg)
+
+            # the rest of the events are from the truncation point
+            start_id = self.state.truncation_id
+
+        # Get rest of history
+        events_to_add = list(
+            self.event_stream.get_events(
+                start_id=start_id,
+                end_id=end_id,
+                reverse=False,
+                filter_out_type=self.filter_out,
+                filter_hidden=True,
+            )
+        )
+        events.extend(events_to_add)
+
+        # Find all delegate action/observation pairs
+        delegate_ranges: list[tuple[int, int]] = []
+        delegate_action_ids: list[int] = []  # stack of unmatched delegate action IDs
+
+        for event in events:
+            if isinstance(event, AgentDelegateAction):
+                delegate_action_ids.append(event.id)
+                # Note: we can get agent=event.agent and task=event.inputs.get('task','')
+                # if we need to track these in the future
+
+            elif isinstance(event, AgentDelegateObservation):
+                # Match with most recent unmatched delegate action
+                if not delegate_action_ids:
+                    self.log(
+                        'warning',
+                        f'Found AgentDelegateObservation without matching action at id={event.id}',
+                    )
+                    continue
+
+                action_id = delegate_action_ids.pop()
+                delegate_ranges.append((action_id, event.id))
+
+        # Filter out events between delegate action/observation pairs
+        if delegate_ranges:
+            filtered_events: list[Event] = []
+            current_idx = 0
+
+            # The loop variables are deliberately NOT named start_id/end_id:
+            # reusing those names would clobber the fetch range, and the final
+            # assignment to self.state.start_id below would then store the id of
+            # the last delegate action instead of the start of the history.
+            for range_start, range_end in sorted(delegate_ranges):
+                # Add events before delegate range
+                filtered_events.extend(
+                    event for event in events[current_idx:] if event.id < range_start
+                )
+
+                # Add delegate action and observation
+                filtered_events.extend(
+                    event for event in events if event.id in (range_start, range_end)
+                )
+
+                # Update index to after delegate range
+                current_idx = next(
+                    (i for i, e in enumerate(events) if e.id > range_end), len(events)
+                )
+
+            # Add any remaining events after last delegate range
+            filtered_events.extend(events[current_idx:])
+
+            self.state.history = filtered_events
+        else:
+            self.state.history = events
+
+        # make sure history is in sync
+        # NOTE(review): when truncation_id is set, start_id here is the truncation
+        # point rather than the original start -- confirm that is what later
+        # reloads expect.
+        self.state.start_id = start_id
+
+    def _apply_conversation_window(self, events: list[Event]) -> list[Event]:
+        """Cuts history roughly in half when context window is exceeded, preserving action-observation pairs
+        and ensuring the first user message is always included.
+
+        The algorithm:
+        1. Cut history in half
+        2. Check first event in new history:
+           - If Observation: find and include its Action
+           - If MessageAction: ensure its related Action-Observation pair isn't split
+        3. Always include the first user message
+
+        Besides returning the truncated list, this method updates
+        self.state.truncation_id (id of the first kept event, before the first
+        user message is re-added) and self.state.start_id (id of the first user
+        message, when one exists).
+
+        Args:
+            events: List of events to filter
+
+        Returns:
+            Filtered list of events keeping newest half while preserving pairs
+        """
+        # nothing to truncate; state is left untouched
+        if not events:
+            return events
+
+        # Find first user message - we'll need to ensure it's included
+        first_user_msg = next(
+            (
+                e
+                for e in events
+                if isinstance(e, MessageAction) and e.source == EventSource.USER
+            ),
+            None,
+        )
+
+        # cut in half
+        # max(1, ...) guarantees at least one event is dropped, so a single-event
+        # list yields an empty kept_events
+        mid_point = max(1, len(events) // 2)
+        kept_events = events[mid_point:]
+
+        # Handle first event in truncated history
+        if kept_events:
+            # Walk past leading messages / user actions until we reach either an
+            # observation (which needs its action) or any other event
+            i = 0
+            while i < len(kept_events):
+                first_event = kept_events[i]
+                if isinstance(first_event, Observation) and first_event.cause:
+                    # Find its action and include it
+                    # (searched only in the dropped half, newest first)
+                    matching_action = next(
+                        (
+                            e
+                            for e in reversed(events[:mid_point])
+                            if isinstance(e, Action) and e.id == first_event.cause
+                        ),
+                        None,
+                    )
+                    if matching_action:
+                        # the action goes to the very front of the kept list
+                        kept_events = [matching_action] + kept_events
+                    else:
+                        self.log(
+                            'warning',
+                            f'Found Observation without matching Action at id={first_event.id}',
+                        )
+                        # drop this observation
+                        # NOTE(review): this removes kept_events[0], which is the
+                        # orphaned observation only when i == 0 -- confirm intent
+                        kept_events = kept_events[1:]
+                    break
+
+                elif isinstance(first_event, MessageAction) or (
+                    isinstance(first_event, Action)
+                    and first_event.source == EventSource.USER
+                ):
+                    # if it's a message action or a user action, keep it and continue to find the next event
+                    i += 1
+                    continue
+
+                else:
+                    # if it's an action with source == EventSource.AGENT, we're good
+                    break
+
+        # Save where to continue from in next reload
+        if kept_events:
+            self.state.truncation_id = kept_events[0].id
+
+        # Ensure first user message is included
+        if first_user_msg and first_user_msg not in kept_events:
+            kept_events = [first_user_msg] + kept_events
+
+        # start_id points to first user message
+        if first_user_msg:
+            self.state.start_id = first_user_msg.id
+
+        return kept_events
+
+    def _is_stuck(self) -> bool:
+        """Reports whether this agent, or its active delegate, is stuck in a loop.
+
+        The delegate (if any) is consulted first; this controller's own stuck
+        detector is only queried when the delegate is not stuck.
+
+        Returns:
+            bool: True if the agent is stuck, False otherwise.
+        """
+        if not (self.delegate and self.delegate._is_stuck()):
+            # no delegate, or the delegate is fine: check our own history
+            return self._stuck_detector.is_stuck(self.headless_mode)
+
+        return True
+
+    def __repr__(self):
+        """Returns a debugging representation listing the controller's key attributes."""
+        # id is rendered with str(); every other attribute with repr()
+        fields = [
+            f'id={self.id}',
+            f'agent={self.agent!r}',
+            f'event_stream={self.event_stream!r}',
+            f'state={self.state!r}',
+            f'agent_task={self.agent_task!r}',
+            f'delegate={self.delegate!r}',
+            f'_pending_action={self._pending_action!r}',
+        ]
+        return 'AgentController(' + ', '.join(fields) + ')'
+
+    def _is_awaiting_observation(self):
+        """Checks whether the most recent agent state change put the agent in RUNNING.
+
+        Scans the event stream from newest to oldest and looks only at the first
+        AgentStateChangedObservation found.
+
+        Returns:
+            bool: True if that observation's agent_state is RUNNING; False if it is
+                any other state or no such observation exists.
+        """
+        latest_state_change = next(
+            (
+                candidate
+                for candidate in self.event_stream.get_events(reverse=True)
+                if isinstance(candidate, AgentStateChangedObservation)
+            ),
+            None,
+        )
+        if latest_state_change is None:
+            return False
+        return latest_state_change.agent_state == AgentState.RUNNING
@@ -0,0 +1,171 @@
+import base64
+import pickle
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+from openhands.controller.state.task import RootTask
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.schema import AgentState
+from openhands.events.action import (
+    MessageAction,
+)
+from openhands.events.action.agent import AgentFinishAction
+from openhands.events.event import Event, EventSource
+from openhands.llm.metrics import Metrics
+from openhands.storage.files import FileStore
+
+
+class TrafficControlState(str, Enum):
+    """Traffic-control (rate limiting) status of an agent.
+
+    Members are also `str` instances, so they compare equal to their string values.
+    """
+
+    # default state, no rate limiting
+    NORMAL = 'normal'
+
+    # task paused due to traffic control
+    THROTTLING = 'throttling'
+
+    # traffic control is temporarily paused
+    PAUSED = 'paused'
+
+
+# Agent states that are remembered as `resume_state` when a State is restored
+# from a session (see State.restore_from_session); any other saved state
+# results in resume_state = None.
+RESUMABLE_STATES = [
+    AgentState.RUNNING,
+    AgentState.PAUSED,
+    AgentState.AWAITING_USER_INPUT,
+    AgentState.FINISHED,
+]
+
+
+@dataclass
+class State:
+    """
+    Represents the running state of an agent in the OpenHands system, saving data of its operation and memory.
+
+    - Multi-agent/delegate state:
+      - store the task (conversation between the agent and the user)
+      - the subtask (conversation between an agent and the user or another agent)
+      - global and local iterations
+      - delegate levels for multi-agent interactions
+      - almost stuck state
+
+    - Running state of an agent:
+      - current agent state (e.g., LOADING, RUNNING, PAUSED)
+      - traffic control state for rate limiting
+      - confirmation mode
+      - the last error encountered
+
+    - Data for saving and restoring the agent:
+      - save to and restore from a session
+      - serialize with pickle and base64
+
+    - Save / restore data about message history
+      - start and end IDs for events in agent's history
+      - summaries and delegate summaries
+
+    - Metrics:
+      - global metrics for the current task
+      - local metrics for the current subtask
+
+    - Extra data:
+      - additional task-specific data
+    """
+
+    root_task: RootTask = field(default_factory=RootTask)
+    # global iteration for the current task
+    iteration: int = 0
+    # local iteration for the current subtask
+    local_iteration: int = 0
+    # max number of iterations for the current task
+    max_iterations: int = 100
+    confirmation_mode: bool = False
+    history: list[Event] = field(default_factory=list)
+    inputs: dict = field(default_factory=dict)
+    outputs: dict = field(default_factory=dict)
+    agent_state: AgentState = AgentState.LOADING
+    resume_state: AgentState | None = None
+    traffic_control_state: TrafficControlState = TrafficControlState.NORMAL
+    # global metrics for the current task
+    metrics: Metrics = field(default_factory=Metrics)
+    # local metrics for the current subtask
+    local_metrics: Metrics = field(default_factory=Metrics)
+    # root agent has level 0, and every delegate increases the level by one
+    delegate_level: int = 0
+    # start_id and end_id track the range of events in history
+    start_id: int = -1
+    end_id: int = -1
+    # truncation_id tracks where to load history after context window truncation
+    truncation_id: int = -1
+
+    delegates: dict[tuple[int, int], tuple[str, str]] = field(default_factory=dict)
+    # NOTE: This will never be used by the controller, but it can be used by different
+    # evaluation tasks to store extra data needed to track the progress/state of the task.
+    extra_data: dict[str, Any] = field(default_factory=dict)
+    last_error: str = ''
+
+    def save_to_session(self, sid: str, file_store: FileStore) -> None:
+        """Pickles this state, base64-encodes it and writes it to the file store.
+
+        The history is not part of the pickle (see `__getstate__`).
+
+        Args:
+            sid: Session id; the state is written to `sessions/{sid}/agent_state.pkl`.
+            file_store: The store to write to.
+
+        Raises:
+            Exception: Any error raised by the file store is logged and re-raised.
+        """
+        pickled = pickle.dumps(self)
+        logger.debug(f'Saving state to session {sid}:{self.agent_state}')
+        encoded = base64.b64encode(pickled).decode('utf-8')
+        try:
+            file_store.write(f'sessions/{sid}/agent_state.pkl', encoded)
+        except Exception as e:
+            logger.error(f'Failed to save state to session: {e}')
+            # bare raise keeps the original traceback intact
+            raise
+
+    @staticmethod
+    def restore_from_session(sid: str, file_store: FileStore) -> 'State':
+        """Loads the state previously saved for a session.
+
+        The restored state always starts in AgentState.LOADING; the state it was
+        saved in is kept in `resume_state` if it is one of RESUMABLE_STATES,
+        otherwise `resume_state` is None.
+
+        Args:
+            sid: Session id; the state is read from `sessions/{sid}/agent_state.pkl`.
+            file_store: The store to read from.
+
+        Returns:
+            The restored State (with an empty history).
+
+        Raises:
+            Exception: Any error while reading or decoding is logged at debug
+                level and re-raised.
+        """
+        try:
+            encoded = file_store.read(f'sessions/{sid}/agent_state.pkl')
+            pickled = base64.b64decode(encoded)
+            # SECURITY NOTE(review): pickle.loads can execute arbitrary code.
+            # This is only safe while the file store contents are fully trusted.
+            state = pickle.loads(pickled)
+        except Exception as e:
+            logger.debug(f'Could not restore state from session: {e}')
+            # bare raise keeps the original traceback intact
+            raise
+
+        # update state
+        if state.agent_state in RESUMABLE_STATES:
+            state.resume_state = state.agent_state
+        else:
+            state.resume_state = None
+
+        # first state after restore
+        state.agent_state = AgentState.LOADING
+        return state
+
+    def __getstate__(self):
+        """Returns the attributes to pickle, with the history emptied."""
+        # don't pickle history, it will be restored from the event stream
+        state = self.__dict__.copy()
+        state['history'] = []
+        return state
+
+    def __setstate__(self, state):
+        """Restores attributes from a pickle, guaranteeing `history` exists."""
+        self.__dict__.update(state)
+
+        # make sure we always have the attribute history
+        if not hasattr(self, 'history'):
+            self.history = []
+
+    def get_current_user_intent(self) -> tuple[str | None, list[str] | None]:
+        """Returns the latest user message and image(if provided) that appears after a FinishAction, or the first (the task) if nothing was finished yet.
+
+        Returns:
+            A tuple of (message content, image URLs of that message). Both
+            describe the same user message; the content is None when the history
+            holds no user message.
+        """
+        last_user_message = None
+        last_user_message_image_urls: list[str] | None = []
+        # Walk from newest to oldest: each user message seen overwrites the
+        # previous candidate, so on reaching a finish action the candidate is the
+        # earliest user message that follows it.
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                last_user_message = event.content
+                last_user_message_image_urls = event.image_urls
+            elif isinstance(event, AgentFinishAction):
+                if last_user_message is not None:
+                    # return the images that belong to this message instead of
+                    # dropping them
+                    return last_user_message, last_user_message_image_urls
+
+        return last_user_message, last_user_message_image_urls
+
+    def get_last_agent_message(self) -> MessageAction | None:
+        """Returns the most recent MessageAction sent by the agent, or None."""
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+                return event
+        return None
+
+    def get_last_user_message(self) -> MessageAction | None:
+        """Returns the most recent MessageAction sent by the user, or None."""
+        for event in reversed(self.history):
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                return event
+        return None
@@ -1,24 +1,36 @@
-from typing import List
-
-from opendevin.logger import opendevin_logger as logger
-from opendevin.exceptions import PlanInvalidStateError
+from openhands.core.exceptions import (
+    LLMMalformedActionError,
+    TaskInvalidStateError,
+)
+from openhands.core.logger import openhands_logger as logger

 OPEN_STATE = 'open'
 COMPLETED_STATE = 'completed'
 ABANDONED_STATE = 'abandoned'
 IN_PROGRESS_STATE = 'in_progress'
 VERIFIED_STATE = 'verified'
-STATES = [OPEN_STATE, COMPLETED_STATE,
-          ABANDONED_STATE, IN_PROGRESS_STATE, VERIFIED_STATE]
+STATES = [
+    OPEN_STATE,
+    COMPLETED_STATE,
+    ABANDONED_STATE,
+    IN_PROGRESS_STATE,
+    VERIFIED_STATE,
+]


 class Task:
    id: str
    goal: str
    parent: 'Task | None'
-    subtasks: List['Task']
+    subtasks: list['Task']

-    def __init__(self, parent: 'Task | None', goal: str, state: str = OPEN_STATE, subtasks: List = []):
+    def __init__(
+        self,
+        parent: 'Task',
+        goal: str,
+        state: str = OPEN_STATE,
+        subtasks=None,  # noqa: B006
+    ):
        """Initializes a new instance of the Task class.

        Args:
@@ -27,20 +39,24 @@ class Task:
            state: The initial state of the task.
            subtasks: A list of subtasks associated with this task.
        """
-        if parent is None:
-            self.id = '0'
-        else:
+        if subtasks is None:
+            subtasks = []
+        if parent.id:
            self.id = parent.id + '.' + str(len(parent.subtasks))
+        else:
+            self.id = str(len(parent.subtasks))
        self.parent = parent
        self.goal = goal
+        logger.debug(f'Creating task {self.id} with parent={parent.id}, goal={goal}')
        self.subtasks = []
-        for subtask in (subtasks or []):
+        for subtask in subtasks or []:
            if isinstance(subtask, Task):
                self.subtasks.append(subtask)
            else:
                goal = subtask.get('goal')
                state = subtask.get('state')
                subtasks = subtask.get('subtasks')
+                logger.debug(f'Reading: {goal}, {state}, {subtasks}')
                self.subtasks.append(Task(self, goal, state, subtasks))

        self.state = OPEN_STATE
@@ -80,7 +96,7 @@ class Task:
            'id': self.id,
            'goal': self.goal,
            'state': self.state,
-            'subtasks': [t.to_dict() for t in self.subtasks]
+            'subtasks': [t.to_dict() for t in self.subtasks],
        }

    def set_state(self, state):
@@ -89,13 +105,17 @@ class Task:
        Args:            state: The new state of the task.

        Raises:
-            PlanInvalidStateError: If the provided state is invalid.
+            TaskInvalidStateError: If the provided state is invalid.
        """
        if state not in STATES:
            logger.error('Invalid state: %s', state)
-            raise PlanInvalidStateError(state)
+            raise TaskInvalidStateError(state)
        self.state = state
-        if state == COMPLETED_STATE or state == ABANDONED_STATE or state == VERIFIED_STATE:
+        if (
+            state == COMPLETED_STATE
+            or state == ABANDONED_STATE
+            or state == VERIFIED_STATE
+        ):
            for subtask in self.subtasks:
                if subtask.state != ABANDONED_STATE:
                    subtask.set_state(state)
@@ -117,32 +137,35 @@ class Task:
        return None


-class Plan:
-    """Represents a plan consisting of tasks.
+class RootTask(Task):
+    """Serves as the root node in a tree of tasks.
+    Because we want the top-level of the root_task to be a list of tasks (1, 2, 3, etc.),
+    the "root node" of the data structure is kind of invisible--it just
+    holds references to the top-level tasks.

    Attributes:
-        main_goal: The main goal of the plan.
-        task: The root task of the plan.
+        id: Kept blank for root_task
+        goal: Kept blank for root_task
+        parent: None for root_task
+        subtasks: The top-level list of tasks associated with the root_task.
+        state: The state of the root_task.
    """
-    main_goal: str
-    task: Task

-    def __init__(self, task: str):
-        """Initializes a new instance of the Plan class.
+    id: str = ''
+    goal: str = ''
+    parent: None = None

-        Args:
-            task: The main goal of the plan.
-        """
-        self.main_goal = task
-        self.task = Task(parent=None, goal=task, subtasks=[])
+    def __init__(self):
+        self.subtasks = []
+        self.state = OPEN_STATE

    def __str__(self):
-        """Returns a string representation of the plan.
+        """Returns a string representation of the root_task.

        Returns:
-            A string representation of the plan.
+            A string representation of the root_task.
        """
-        return self.task.to_string()
+        return self.to_string()

    def get_task_by_id(self, id: str) -> Task:
        """Retrieves a task by its ID.
@@ -154,23 +177,24 @@ class Plan:
            The task with the specified ID.

        Raises:
-            ValueError: If the provided task ID is invalid or does not exist.
+            LLMMalformedActionError: If the provided task ID is invalid or does not exist.
        """
+        if id == '':
+            return self
+        if len(self.subtasks) == 0:
+            raise LLMMalformedActionError('Task does not exist:' + id)
        try:
            parts = [int(p) for p in id.split('.')]
        except ValueError:
-            raise ValueError('Invalid task id, non-integer:' + id)
-        if parts[0] != 0:
-            raise ValueError('Invalid task id, must start with 0:' + id)
-        parts = parts[1:]
-        task = self.task
+            raise LLMMalformedActionError('Invalid task id:' + id)
+        task: Task = self
        for part in parts:
            if part >= len(task.subtasks):
-                raise ValueError('Task does not exist:' + id)
+                raise LLMMalformedActionError('Task does not exist:' + id)
            task = task.subtasks[part]
        return task

-    def add_subtask(self, parent_id: str, goal: str, subtasks: List = []):
+    def add_subtask(self, parent_id: str, goal: str, subtasks: list | None = None):
        """Adds a subtask to a parent task.

        Args:
@@ -178,6 +202,7 @@ class Plan:
            goal: The goal of the subtask.
            subtasks: A list of subtasks associated with the new subtask.
        """
+        subtasks = subtasks or []
        parent = self.get_task_by_id(parent_id)
        child = Task(parent=parent, goal=goal, subtasks=subtasks)
        parent.subtasks.append(child)
@@ -190,12 +215,12 @@ class Plan:
            state: The new state of the subtask.
        """
        task = self.get_task_by_id(id)
+        logger.debug(f'Setting task {task.id} from state {task.state} to {state}')
        task.set_state(state)
-
-    def get_current_task(self):
-        """Retrieves the current task in progress.
-
-        Returns:
-            The current task in progress, or None if no task is in progress.
-        """
-        return self.task.get_current_task()
+        unfinished_tasks = [
+            t
+            for t in self.subtasks
+            if t.state not in [COMPLETED_STATE, VERIFIED_STATE, ABANDONED_STATE]
+        ]
+        if len(unfinished_tasks) == 0:
+            self.set_state(COMPLETED_STATE)
@@ -0,0 +1,335 @@
+from openhands.controller.state.state import State
+from openhands.core.logger import openhands_logger as logger
+from openhands.events.action.action import Action
+from openhands.events.action.commands import IPythonRunCellAction
+from openhands.events.action.empty import NullAction
+from openhands.events.action.message import MessageAction
+from openhands.events.event import Event, EventSource
+from openhands.events.observation.commands import (
+    CmdOutputObservation,
+    IPythonRunCellObservation,
+)
+from openhands.events.observation.empty import NullObservation
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.observation import Observation
+
+
+class StuckDetector:
+    """Detects when an agent keeps repeating itself instead of making progress.
+
+    The detector reads the event history of the given State and looks for four
+    patterns: identical action/observation pairs, the same action repeatedly
+    ending in an error, an agent monologue, and an alternating two-step
+    action/observation cycle.
+    """
+
+    # IPython syntax-error messages that scenario 2 treats as a repeated failure
+    SYNTAX_ERROR_MESSAGES = [
+        'SyntaxError: unterminated string literal (detected at line',
+        'SyntaxError: invalid syntax. Perhaps you forgot a comma?',
+        'SyntaxError: incomplete input',
+    ]
+
+    def __init__(self, state: State):
+        self.state = state
+
+    def is_stuck(self, headless_mode: bool = True) -> bool:
+        """Checks if the agent is stuck in a loop.
+
+        Args:
+            headless_mode: Matches AgentController's headless_mode.
+                          If True: Consider all history (automated/testing)
+                          If False: Consider only history after last user message (interactive)
+
+        Returns:
+            bool: True if the agent is stuck in a loop, False otherwise.
+        """
+        if not headless_mode:
+            # In interactive mode, only look at history after the last user message
+            last_user_msg_idx = -1
+            for i, event in enumerate(reversed(self.state.history)):
+                if (
+                    isinstance(event, MessageAction)
+                    and event.source == EventSource.USER
+                ):
+                    last_user_msg_idx = len(self.state.history) - i - 1
+                    break
+
+            # with no user message at all the index stays -1, so the whole history is kept
+            history_to_check = self.state.history[last_user_msg_idx + 1 :]
+        else:
+            # In headless mode, look at all history
+            history_to_check = self.state.history
+
+        # Filter out user messages and null events
+        filtered_history = [
+            event
+            for event in history_to_check
+            if not (
+                # Filter works elegantly in both modes:
+                # - In headless: actively filters out user messages from full history
+                # - In non-headless: no-op since we already sliced after last user message
+                (isinstance(event, MessageAction) and event.source == EventSource.USER)
+                # there might be some NullAction or NullObservation in the history at least for now
+                or isinstance(event, (NullAction, NullObservation))
+            )
+        ]
+
+        # it takes 3 actions minimum to detect a loop, otherwise nothing to do here
+        if len(filtered_history) < 3:
+            return False
+
+        # the first few scenarios detect 3 or 4 repeated steps
+        # prepare the last 4 actions and observations, to check them out
+        last_actions: list[Event] = []
+        last_observations: list[Event] = []
+
+        # retrieve the last four actions and observations starting from the end of history, wherever they are
+        # (both lists therefore hold the most recent event first)
+        for event in reversed(filtered_history):
+            if isinstance(event, Action) and len(last_actions) < 4:
+                last_actions.append(event)
+            elif isinstance(event, Observation) and len(last_observations) < 4:
+                last_observations.append(event)
+
+            if len(last_actions) == 4 and len(last_observations) == 4:
+                break
+
+        # scenario 1: same action, same observation
+        if self._is_stuck_repeating_action_observation(last_actions, last_observations):
+            return True
+
+        # scenario 2: same action, errors
+        if self._is_stuck_repeating_action_error(last_actions, last_observations):
+            return True
+
+        # scenario 3: monologue
+        if self._is_stuck_monologue(filtered_history):
+            return True
+
+        # scenario 4: action, observation pattern on the last six steps
+        if len(filtered_history) < 6:
+            return False
+        if self._is_stuck_action_observation_pattern(filtered_history):
+            return True
+
+        return False
+
+    def _is_stuck_repeating_action_observation(
+        self, last_actions, last_observations
+    ) -> bool:
+        """Scenario 1: the last four actions and the last four observations are all equal.
+
+        Args:
+            last_actions: Up to four most recent actions, most recent first.
+            last_observations: Up to four most recent observations, most recent first.
+
+        Returns:
+            True when exactly four of each are available and they all compare equal
+            under `_eq_no_pid`.
+        """
+        # scenario 1: same action, same observation
+        # it takes 4 actions and 4 observations to detect a loop
+        # assert len(last_actions) == 4 and len(last_observations) == 4
+
+        # Check for a loop of 4 identical action-observation pairs
+        if len(last_actions) == 4 and len(last_observations) == 4:
+            actions_equal = all(
+                self._eq_no_pid(last_actions[0], action) for action in last_actions
+            )
+            observations_equal = all(
+                self._eq_no_pid(last_observations[0], observation)
+                for observation in last_observations
+            )
+
+            if actions_equal and observations_equal:
+                logger.warning('Action, Observation loop detected')
+                return True
+
+        return False
+
+    def _is_stuck_repeating_action_error(self, last_actions, last_observations) -> bool:
+        """Scenario 2: the same action was issued three times and failed every time.
+
+        A failure is either an ErrorObservation, or an IPythonRunCellObservation
+        whose output ends with one of SYNTAX_ERROR_MESSAGES consistently across
+        the three observations.
+
+        Args:
+            last_actions: Most recent actions, most recent first.
+            last_observations: Most recent observations, most recent first.
+
+        Returns:
+            True if the three most recent actions are equal and their three most
+            recent observations are all failures of the same kind.
+        """
+        # scenario 2: same action, errors
+        # it takes 3 actions and 3 observations to detect a loop
+        # check if the last three actions are the same and result in errors
+
+        # only the three most recent entries are inspected below, so three of
+        # each is enough to run the check
+        if len(last_actions) < 3 or len(last_observations) < 3:
+            return False
+
+        # are the last three actions the "same"?
+        if all(self._eq_no_pid(last_actions[0], action) for action in last_actions[:3]):
+            # and the last three observations are all errors?
+            if all(isinstance(obs, ErrorObservation) for obs in last_observations[:3]):
+                logger.warning('Action, ErrorObservation loop detected')
+                return True
+            # or, are the last three observations all IPythonRunCellObservation with SyntaxError?
+            elif all(
+                isinstance(obs, IPythonRunCellObservation)
+                for obs in last_observations[:3]
+            ):
+                warning = 'Action, IPythonRunCellObservation loop detected'
+                for error_message in self.SYNTAX_ERROR_MESSAGES:
+                    # the unterminated-string message carries a line number, so it
+                    # gets the line-based consistency check; the others are matched
+                    # on the overall traceback layout
+                    if error_message.startswith(
+                        'SyntaxError: unterminated string literal (detected at line'
+                    ):
+                        if self._check_for_consistent_line_error(
+                            last_observations[:3], error_message
+                        ):
+                            logger.warning(warning)
+                            return True
+                    elif error_message in (
+                        'SyntaxError: invalid syntax. Perhaps you forgot a comma?',
+                        'SyntaxError: incomplete input',
+                    ) and self._check_for_consistent_invalid_syntax(
+                        last_observations[:3], error_message
+                    ):
+                        logger.warning(warning)
+                        return True
+        return False
+
+    def _check_for_consistent_invalid_syntax(self, observations, error_message) -> bool:
+        """Check that three observations show the same syntax-error traceback.
+
+        Args:
+            observations: The observations to compare; their `content` is inspected.
+            error_message: The syntax-error text expected on the third-to-last line.
+
+        Returns:
+            True if every observation has at least six lines, starts with
+            'Cell In[1], line', ends with the two Jupyter footer lines preceded
+            by `error_message`, and both the first line and the error line are
+            identical across all three.
+        """
+        first_lines = []
+        valid_observations = []
+
+        for obs in observations:
+            content = obs.content
+            lines = content.strip().split('\n')
+
+            if len(lines) < 6:  # 6 because a real syntax error has at least 6 lines
+                return False
+
+            line1 = lines[0].strip()
+            if not line1.startswith('Cell In[1], line'):
+                return False
+
+            first_lines.append(line1)  # Store the first line of each observation
+
+            # Check last three lines
+            if (
+                lines[-1].startswith('[Jupyter Python interpreter:')
+                and lines[-2].startswith('[Jupyter current working directory:')
+                and error_message in lines[-3]
+            ):
+                valid_observations.append(obs)
+
+        # Check if:
+        # 1. All first lines are identical
+        # 2. We have exactly 3 valid observations
+        # 3. The error message line is identical in all valid observations
+        return (
+            len(set(first_lines)) == 1
+            and len(valid_observations) == 3
+            and len(
+                set(
+                    obs.content.strip().split('\n')[:-2][-1]
+                    for obs in valid_observations
+                )
+            )
+            == 1
+        )
+
+    def _check_for_consistent_line_error(self, observations, error_message) -> bool:
+        """Check that three observations end with the same error line.
+
+        Args:
+            observations: The observations to compare; their `content` is inspected.
+            error_message: The error text expected on the third-to-last line.
+
+        Returns:
+            True if every observation ends with the two Jupyter footer lines and
+            the line before them contains `error_message` and is identical in
+            all three observations.
+        """
+        error_lines = []
+
+        for obs in observations:
+            content = obs.content
+            lines = content.strip().split('\n')
+
+            if len(lines) < 3:
+                return False
+
+            last_lines = lines[-3:]
+
+            # Check if the last two lines are our own
+            if not (
+                last_lines[-2].startswith('[Jupyter current working directory:')
+                and last_lines[-1].startswith('[Jupyter Python interpreter:')
+            ):
+                return False
+
+            # Check for the error message in the 3rd-to-last line
+            if error_message in last_lines[-3]:
+                error_lines.append(last_lines[-3])
+
+        # Check if we found the error message in all 3 observations
+        # and the 3rd-to-last line is identical across all occurrences
+        return len(error_lines) == 3 and len(set(error_lines)) == 1
+
+    def _is_stuck_monologue(self, filtered_history) -> bool:
+        """Scenario 3: the agent sent the same message three times in a row.
+
+        Args:
+            filtered_history: The history with user messages and null events removed.
+
+        Returns:
+            True if the last three agent MessageActions are equal and no
+            Observation occurred between the first and the last of them.
+        """
+        # scenario 3: monologue
+        # check for repeated MessageActions with source=AGENT
+        # see if the agent is engaged in a good old monologue, telling itself the same thing over and over
+        agent_message_actions = [
+            (i, event)
+            for i, event in enumerate(filtered_history)
+            if isinstance(event, MessageAction) and event.source == EventSource.AGENT
+        ]
+
+        # last three message actions will do for this check
+        if len(agent_message_actions) >= 3:
+            last_agent_message_actions = agent_message_actions[-3:]
+
+            if all(
+                (last_agent_message_actions[0][1] == action[1])
+                for action in last_agent_message_actions
+            ):
+                # check if there are any observations between the repeated MessageActions
+                # then it's not yet a loop, maybe it can recover
+                start_index = last_agent_message_actions[0][0]
+                end_index = last_agent_message_actions[-1][0]
+
+                has_observation_between = False
+                for event in filtered_history[start_index + 1 : end_index]:
+                    if isinstance(event, Observation):
+                        has_observation_between = True
+                        break
+
+                if not has_observation_between:
+                    logger.warning('Repeated MessageAction with source=AGENT detected')
+                    return True
+        return False
+
+    def _is_stuck_action_observation_pattern(self, filtered_history) -> bool:
+        """Scenario 4: the agent alternates between the same two steps.
+
+        Args:
+            filtered_history: The history with user messages and null events removed.
+
+        Returns:
+            True if, among the last six actions and the last six observations,
+            every other action is equal and every other observation is equal.
+        """
+        # scenario 4: action, observation pattern on the last six steps
+        # check if the agent repeats the same (Action, Observation)
+        # every other step in the last six steps
+        last_six_actions: list[Event] = []
+        last_six_observations: list[Event] = []
+
+        # the end of history is most interesting
+        for event in reversed(filtered_history):
+            if isinstance(event, Action) and len(last_six_actions) < 6:
+                last_six_actions.append(event)
+            elif isinstance(event, Observation) and len(last_six_observations) < 6:
+                last_six_observations.append(event)
+
+            if len(last_six_actions) == 6 and len(last_six_observations) == 6:
+                break
+
+        # this pattern is every other step, like:
+        # (action_1, obs_1), (action_2, obs_2), (action_1, obs_1), (action_2, obs_2),...
+        if len(last_six_actions) == 6 and len(last_six_observations) == 6:
+            actions_equal = (
+                # action_0 == action_2 == action_4
+                self._eq_no_pid(last_six_actions[0], last_six_actions[2])
+                and self._eq_no_pid(last_six_actions[0], last_six_actions[4])
+                # action_1 == action_3 == action_5
+                and self._eq_no_pid(last_six_actions[1], last_six_actions[3])
+                and self._eq_no_pid(last_six_actions[1], last_six_actions[5])
+            )
+            observations_equal = (
+                # obs_0 == obs_2 == obs_4
+                self._eq_no_pid(last_six_observations[0], last_six_observations[2])
+                and self._eq_no_pid(last_six_observations[0], last_six_observations[4])
+                # obs_1 == obs_3 == obs_5
+                and self._eq_no_pid(last_six_observations[1], last_six_observations[3])
+                and self._eq_no_pid(last_six_observations[1], last_six_observations[5])
+            )
+
+            if actions_equal and observations_equal:
+                logger.warning('Action, Observation pattern detected')
+                return True
+        return False
+
+    def _eq_no_pid(self, obj1, obj2) -> bool:
+        """Compare two events for loop detection, ignoring details that vary between repeats.
+
+        Two IPythonRunCellActions that both call `edit_file_by_replace(` are
+        equal when their code has at least three lines and the first three
+        lines match. Two CmdOutputObservations are equal when command and exit
+        code match. Everything else uses the regular `==` comparison.
+        """
+        if isinstance(obj1, IPythonRunCellAction) and isinstance(
+            obj2, IPythonRunCellAction
+        ):
+            # for loop detection on edit actions, ignore the thought, compare some code
+            # the code should have at least 3 lines, to avoid simple one-liners
+            if (
+                'edit_file_by_replace(' in obj1.code
+                and 'edit_file_by_replace(' in obj2.code
+            ):
+                return (
+                    len(obj1.code.split('\n')) > 2
+                    and obj1.code.split('\n')[:3] == obj2.code.split('\n')[:3]
+                )
+            else:
+                # default comparison
+                return obj1 == obj2
+        elif isinstance(obj1, CmdOutputObservation) and isinstance(
+            obj2, CmdOutputObservation
+        ):
+            # for loop detection, ignore command_id, which is the pid
+            return obj1.command == obj2.command and obj1.exit_code == obj2.exit_code
+        else:
+            # this is the default comparison
+            return obj1 == obj2
@@ -0,0 +1,231 @@
+import asyncio
+import logging
+import sys
+from typing import Type
+from uuid import uuid4
+
+from termcolor import colored
+
+import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
+from openhands import __version__
+from openhands.controller import AgentController
+from openhands.controller.agent import Agent
+from openhands.core.config import (
+    AppConfig,
+    get_parser,
+    load_app_config,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.loop import run_agent_until_done
+from openhands.core.schema import AgentState
+from openhands.events import EventSource, EventStream, EventStreamSubscriber
+from openhands.events.action import (
+    Action,
+    ActionConfirmationStatus,
+    ChangeAgentStateAction,
+    CmdRunAction,
+    FileEditAction,
+    MessageAction,
+)
+from openhands.events.event import Event
+from openhands.events.observation import (
+    AgentStateChangedObservation,
+    CmdOutputObservation,
+    FileEditObservation,
+    NullObservation,
+)
+from openhands.llm.llm import LLM
+from openhands.runtime import get_runtime_cls
+from openhands.runtime.base import Runtime
+from openhands.security import SecurityAnalyzer, options
+from openhands.storage import get_file_store
+
+
+def display_message(message: str):
+    """Print an agent message in yellow, prefixed with a robot emoji and followed by a blank line."""
+    text = '🤖 ' + message + '\n'
+    print(colored(text, 'yellow'))
+
+
+def display_command(command: str):
+    """Print a shell command in green after an uncoloured prompt marker."""
+    highlighted = colored(command + '\n', 'green')
+    print('❯ ' + highlighted)
+
+
+def display_confirmation(confirmation_state: ActionConfirmationStatus):
+    """Print the confirmation status with an icon and colour that match its value.
+
+    Confirmed is shown in green, rejected in red, and any other status in yellow.
+    """
+    if confirmation_state == ActionConfirmationStatus.CONFIRMED:
+        icon, colour = '✅ ', 'green'
+    elif confirmation_state == ActionConfirmationStatus.REJECTED:
+        icon, colour = '❌ ', 'red'
+    else:
+        icon, colour = '⏳ ', 'yellow'
+    print(colored(icon + confirmation_state + '\n', colour))
+
+
+def display_command_output(output: str):
+    """Print command output line by line in blue, hiding interpreter and prompt lines."""
+    # TODO: clean this up once we clean up terminal output
+    hidden_prefixes = ('[Python Interpreter', 'openhands@')
+    for row in output.split('\n'):
+        if not row.startswith(hidden_prefixes):
+            print(colored(row, 'blue'))
+    print('\n')
+
+
+def display_file_edit(event: FileEditAction | FileEditObservation):
+    """Print the string form of a file-edit action or observation in green."""
+    rendered = str(event)
+    print(colored(rendered, 'green'))
+
+
+def display_event(event: Event, config: AppConfig):
+    """Render one event to the terminal with the display helper that fits its type.
+
+    Every check runs independently, so a single event may produce several
+    outputs (for example an action's thought followed by its command).
+    """
+    # any action that carries a thought shows it first
+    if isinstance(event, Action) and hasattr(event, 'thought'):
+        display_message(event.thought)
+    # only messages written by the agent are echoed
+    if isinstance(event, MessageAction) and event.source == EventSource.AGENT:
+        display_message(event.content)
+    if isinstance(event, CmdRunAction):
+        display_command(event.command)
+    if isinstance(event, CmdOutputObservation):
+        display_command_output(event.content)
+    # file edits, both the request and its result, share one renderer
+    for edit_type in (FileEditAction, FileEditObservation):
+        if isinstance(event, edit_type):
+            display_file_edit(event)
+    # the confirmation status is shown only when confirmation mode is on
+    show_confirmation = (
+        hasattr(event, 'confirmation_state') and config.security.confirmation_mode
+    )
+    if show_confirmation:
+        display_confirmation(event.confirmation_state)
+
+
+async def main():
+    """Runs the agent in CLI mode
+
+    Parses the command line, builds the agent, event stream, runtime and
+    controller from the loaded config, subscribes a terminal renderer to the
+    event stream, and then runs the agent until it stops or errors.
+    """
+
+    parser = get_parser()
+    # Add the version argument
+    parser.add_argument(
+        '-v',
+        '--version',
+        action='version',
+        version=f'{__version__}',
+        help='Show the version number and exit',
+        default=None,
+    )
+    args = parser.parse_args()
+
+    # NOTE(review): argparse's 'version' action prints and exits inside
+    # parse_args(), so this branch looks like a fallback only - confirm
+    if args.version:
+        print(f'OpenHands version: {__version__}')
+        return
+
+    logger.setLevel(logging.WARNING)
+    config = load_app_config(config_file=args.config_file)
+    sid = 'cli'
+
+    agent_cls: Type[Agent] = Agent.get_cls(config.default_agent)
+    agent_config = config.get_agent_config(config.default_agent)
+    llm_config = config.get_llm_config_from_agent(config.default_agent)
+    agent = agent_cls(
+        llm=LLM(config=llm_config),
+        config=agent_config,
+    )
+
+    file_store = get_file_store(config.file_store, config.file_store_path)
+    event_stream = EventStream(sid, file_store)
+
+    runtime_cls = get_runtime_cls(config.runtime)
+    runtime: Runtime = runtime_cls(  # noqa: F841
+        config=config,
+        event_stream=event_stream,
+        sid=sid,
+        plugins=agent_cls.sandbox_plugins,
+        headless_mode=True,
+    )
+
+    if config.security.security_analyzer:
+        # instantiate the configured analyzer on the event stream; the instance
+        # is not kept here
+        options.SecurityAnalyzers.get(
+            config.security.security_analyzer, SecurityAnalyzer
+        )(event_stream)
+
+    controller = AgentController(
+        agent=agent,
+        max_iterations=config.max_iterations,
+        max_budget_per_task=config.max_budget_per_task,
+        agent_to_llm_config=config.get_agent_to_llm_config_map(),
+        event_stream=event_stream,
+        confirmation_mode=config.security.confirmation_mode,
+    )
+
+    async def prompt_for_next_task():
+        """Read the next task from stdin and publish it on the event stream.
+
+        Blank input is ignored and the prompt is shown again. The literal
+        input 'exit' stops the agent instead of being sent as a message.
+        """
+        # Run input() in a thread pool to avoid blocking the event loop
+        loop = asyncio.get_event_loop()
+        next_message = ''
+        # keep asking until something non-blank is typed, so that an empty
+        # message is never forwarded to the agent
+        while not next_message.strip():
+            next_message = await loop.run_in_executor(
+                None, lambda: input('How can I help? >> ')
+            )
+        if next_message == 'exit':
+            event_stream.add_event(
+                ChangeAgentStateAction(AgentState.STOPPED), EventSource.ENVIRONMENT
+            )
+            return
+        action = MessageAction(content=next_message)
+        event_stream.add_event(action, EventSource.USER)
+
+    async def prompt_for_user_confirmation():
+        """Ask the user to approve the pending action; only 'y' (any case) approves."""
+        loop = asyncio.get_event_loop()
+        user_confirmation = await loop.run_in_executor(
+            None, lambda: input('Confirm action (possible security risk)? (y/n) >> ')
+        )
+        return user_confirmation.lower() == 'y'
+
+    async def on_event(event: Event):
+        """Render each event and prompt the user when the agent waits for input."""
+        display_event(event, config)
+        if isinstance(event, AgentStateChangedObservation):
+            if event.agent_state in [
+                AgentState.AWAITING_USER_INPUT,
+                AgentState.FINISHED,
+            ]:
+                await prompt_for_next_task()
+        if (
+            isinstance(event, NullObservation)
+            and controller.state.agent_state == AgentState.AWAITING_USER_CONFIRMATION
+        ):
+            user_confirmed = await prompt_for_user_confirmation()
+            if user_confirmed:
+                event_stream.add_event(
+                    ChangeAgentStateAction(AgentState.USER_CONFIRMED), EventSource.USER
+                )
+            else:
+                event_stream.add_event(
+                    ChangeAgentStateAction(AgentState.USER_REJECTED), EventSource.USER
+                )
+
+    event_stream.subscribe(EventStreamSubscriber.MAIN, on_event, str(uuid4()))
+
+    await runtime.connect()
+
+    # ask for the first task in the background while the agent loop starts
+    asyncio.create_task(prompt_for_next_task())
+
+    await run_agent_until_done(
+        controller, runtime, [AgentState.STOPPED, AgentState.ERROR]
+    )
+
+
+# Script entry point: run main() on a dedicated event loop and always clean the
+# loop up afterwards, whatever the outcome.
+if __name__ == '__main__':
+    loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(loop)
+    try:
+        loop.run_until_complete(main())
+    except KeyboardInterrupt:
+        # Ctrl-C is a normal way to leave the CLI, so no error exit code here
+        print('Received keyboard interrupt, shutting down...')
+    except ConnectionRefusedError as e:
+        print(f'Connection refused: {e}')
+        sys.exit(1)
+    except Exception as e:
+        print(f'An error occurred: {e}')
+        sys.exit(1)
+    finally:
+        try:
+            # Cancel all running tasks
+            pending = asyncio.all_tasks(loop)
+            for task in pending:
+                task.cancel()
+            # Wait for the cancelled tasks to finish (no timeout is applied);
+            # return_exceptions=True keeps their CancelledError from propagating
+            loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
+            loop.close()
+        except Exception as e:
+            print(f'Error during cleanup: {e}')
+            sys.exit(1)
@@ -0,0 +1,37 @@
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+from openhands.core.config.utils import (
+    finalize_config,
+    get_llm_config_arg,
+    get_parser,
+    load_app_config,
+    load_from_env,
+    load_from_toml,
+    parse_arguments,
+)
+
+# Public API of the config package: every name imported above is re-exported.
+__all__ = [
+    'OH_DEFAULT_AGENT',
+    'OH_MAX_ITERATIONS',
+    'AgentConfig',
+    'AppConfig',
+    'LLMConfig',
+    'SandboxConfig',
+    'SecurityConfig',
+    'load_app_config',
+    'load_from_env',
+    'load_from_toml',
+    'finalize_config',
+    'get_llm_config_arg',
+    'get_field_info',
+    'get_parser',
+    'parse_arguments',
+]
@@ -0,0 +1,38 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class AgentConfig:
+    """Settings that apply to a single agent.
+
+    Attributes:
+        codeact_enable_browsing: Whether browsing delegate is enabled in the action space. Default is True. Only works with function calling.
+        codeact_enable_llm_editor: Whether LLM editor is enabled in the action space. Default is False. Only works with function calling.
+        codeact_enable_jupyter: Whether Jupyter is enabled in the action space. Default is True.
+        micro_agent_name: The name of the micro agent to use for this agent. Default is None.
+        memory_enabled: Whether long-term memory (embeddings) is enabled. Default is False.
+        memory_max_threads: The maximum number of threads indexing at the same time for embeddings. Default is 3.
+        llm_config: The name of the llm config to use. If specified, this will override global llm config.
+        use_microagents: Whether to use microagents at all. Default is True.
+        disabled_microagents: A list of microagents to disable. Default is None.
+    """
+
+    codeact_enable_browsing: bool = True
+    codeact_enable_llm_editor: bool = False
+    codeact_enable_jupyter: bool = True
+    micro_agent_name: str | None = None
+    memory_enabled: bool = False
+    memory_max_threads: int = 3
+    llm_config: str | None = None
+    use_microagents: bool = True
+    disabled_microagents: list[str] | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Describe every field for the frontend.
+
+        Returns:
+            A dict keyed by field name whose values come from `get_field_info`
+            (type name, whether the field is optional, and its default).
+        """
+        return {spec.name: get_field_info(spec) for spec in fields(self)}
@@ -0,0 +1,156 @@
+from dataclasses import dataclass, field, fields, is_dataclass
+from typing import ClassVar
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+    get_field_info,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+
+
+@dataclass
+class AppConfig:
+    """Configuration for the app.
+
+    Attributes:
+        llms: Dictionary mapping LLM names to their configurations.
+            The default configuration is stored under the 'llm' key.
+        agents: Dictionary mapping agent names to their configurations.
+            The default configuration is stored under the 'agent' key.
+        default_agent: Name of the default agent to use.
+        sandbox: Sandbox configuration settings.
+        security: Security configuration settings.
+        runtime: Runtime environment identifier.
+        file_store: Type of file store to use.
+        file_store_path: Path to the file store.
+        trajectories_path: Folder path to store trajectories.
+        workspace_base: Base path for the workspace. Defaults to `./workspace` as absolute path.
+        workspace_mount_path: Path to mount the workspace. Defaults to `workspace_base`.
+        workspace_mount_path_in_sandbox: Path to mount the workspace in sandbox. Defaults to `/workspace`.
+        workspace_mount_rewrite: Path to rewrite the workspace mount path.
+        cache_dir: Path to cache directory. Defaults to `/tmp/cache`.
+        run_as_openhands: Whether to run as openhands.
+        max_iterations: Maximum number of iterations allowed.
+        max_budget_per_task: Maximum budget per task, agent stops if exceeded.
+        e2b_api_key: E2B API key.
+        modal_api_token_id: Modal API token id.
+        modal_api_token_secret: Modal API token secret.
+        disable_color: Whether to disable terminal colors. For terminals that don't support color.
+        jwt_secret: JWT secret.
+        settings_store_class: Dotted path of the settings store class.
+        debug: Whether to enable debugging mode.
+        file_uploads_max_file_size_mb: Maximum file upload size in MB. `0` means unlimited.
+        file_uploads_restrict_file_types: Whether to restrict upload file types.
+        file_uploads_allowed_extensions: Allowed file extensions. `['.*']` allows all.
+        runloop_api_key: Runloop API key.
+    """
+
+    llms: dict[str, LLMConfig] = field(default_factory=dict)
+    agents: dict = field(default_factory=dict)
+    default_agent: str = OH_DEFAULT_AGENT
+    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
+    security: SecurityConfig = field(default_factory=SecurityConfig)
+    runtime: str = 'eventstream'
+    file_store: str = 'memory'
+    file_store_path: str = '/tmp/file_store'
+    trajectories_path: str | None = None
+    workspace_base: str | None = None
+    workspace_mount_path: str | None = None
+    workspace_mount_path_in_sandbox: str = '/workspace'
+    workspace_mount_rewrite: str | None = None
+    cache_dir: str = '/tmp/cache'
+    run_as_openhands: bool = True
+    max_iterations: int = OH_MAX_ITERATIONS
+    max_budget_per_task: float | None = None
+    e2b_api_key: str = ''
+    modal_api_token_id: str = ''
+    modal_api_token_secret: str = ''
+    disable_color: bool = False
+    jwt_secret: str = ''
+    settings_store_class: str = (
+        'openhands.storage.file_settings_store.FileSettingsStore'
+    )
+    debug: bool = False
+    file_uploads_max_file_size_mb: int = 0
+    file_uploads_restrict_file_types: bool = False
+    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
+    runloop_api_key: str | None = None
+
+    # class-level snapshot of the field descriptions, refreshed in __post_init__
+    defaults_dict: ClassVar[dict] = {}
+
+    def get_llm_config(self, name='llm') -> LLMConfig:
+        """'llm' is the name for default config (for backward compatibility prior to 0.8).
+
+        Returns the config stored under `name`. If there is none, falls back to
+        the default 'llm' config (creating it on first use) and logs a warning
+        when a specific, non-default name was requested.
+        """
+        if name in self.llms:
+            return self.llms[name]
+        if name is not None and name != 'llm':
+            logger.openhands_logger.warning(
+                f'llm config group {name} not found, using default config'
+            )
+        if 'llm' not in self.llms:
+            self.llms['llm'] = LLMConfig()
+        return self.llms['llm']
+
+    def set_llm_config(self, value: LLMConfig, name='llm') -> None:
+        """Store `value` as the LLM config named `name` (the default one if omitted)."""
+        self.llms[name] = value
+
+    def get_agent_config(self, name='agent') -> AgentConfig:
+        """'agent' is the name for default config (for backward compatibility prior to 0.8).
+
+        Returns the config stored under `name`, or the default 'agent' config
+        (creating it on first use) when `name` is unknown.
+        """
+        if name in self.agents:
+            return self.agents[name]
+        if 'agent' not in self.agents:
+            self.agents['agent'] = AgentConfig()
+        return self.agents['agent']
+
+    def set_agent_config(self, value: AgentConfig, name='agent') -> None:
+        """Store `value` as the agent config named `name` (the default one if omitted)."""
+        self.agents[name] = value
+
+    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
+        """Get a map of agent names to llm configs."""
+        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
+
+    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
+        """Return the LLM config that the agent named `name` refers to.
+
+        When the agent config names no llm config (None), `get_llm_config`
+        falls back to the default 'llm' config.
+        """
+        agent_config: AgentConfig = self.get_agent_config(name)
+        llm_config_name = agent_config.llm_config
+        return self.get_llm_config(llm_config_name)
+
+    def get_agent_configs(self) -> dict[str, AgentConfig]:
+        """Return the mapping of agent names to their configs (not a copy)."""
+        return self.agents
+
+    def __post_init__(self):
+        """Post-initialization hook, run every time an instance is created.
+
+        Refreshes the class-level `defaults_dict` from this instance.
+        """
+        AppConfig.defaults_dict = self.defaults_to_dict()
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            field_value = getattr(self, f.name)
+
+            # dataclasses compute their defaults themselves
+            if is_dataclass(type(field_value)):
+                result[f.name] = field_value.defaults_to_dict()
+            else:
+                result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        """Return a readable `AppConfig(name=value, ...)` string with secrets masked."""
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            # never print secrets: show a mask when set, None when empty
+            if attr_name in [
+                'e2b_api_key',
+                'github_token',
+                'jwt_secret',
+                'modal_api_token_id',
+                'modal_api_token_secret',
+                'runloop_api_key',
+            ]:
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        # close the parenthesis opened after the class name
+        return f"AppConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        """Same masked representation as `__str__`."""
+        return self.__str__()
@@ -0,0 +1,39 @@
+from types import UnionType
+from typing import get_args, get_origin
+
+OH_DEFAULT_AGENT = 'CodeActAgent'
+OH_MAX_ITERATIONS = 500
+
+
+def get_field_info(f):
+    """Extract information about a dataclass field: type, optional, and default.
+
+    Args:
+        f: The field to extract information from.
+
+    Returns: A dict with the field's type, whether it's optional, and its default value.
+        For a field declared with `default_factory`, the default is the value
+        produced by calling that factory. A field with neither a default nor a
+        factory keeps the `dataclasses.MISSING` sentinel as its default.
+    """
+    # local import so the function does not depend on the module's import block
+    from dataclasses import MISSING
+
+    field_type = f.type
+    optional = False
+
+    # for types like str | None, find the non-None type and set optional to True
+    # this is useful for the frontend to know if a field is optional
+    # and to show the correct type in the UI
+    # Note: this only works for UnionTypes with None as one of the types
+    if get_origin(field_type) is UnionType:
+        types = get_args(field_type)
+        non_none_arg = next((t for t in types if t is not type(None)), None)
+        if non_none_arg is not None:
+            field_type = non_none_arg
+            optional = True
+
+    # type name in a pretty format
+    type_name = (
+        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
+    )
+
+    # f.default is the MISSING sentinel for fields declared with default_factory;
+    # report the factory's value instead of leaking the sentinel to the frontend
+    default = f.default
+    if default is MISSING and f.default_factory is not MISSING:
+        default = f.default_factory()
+
+    # return a schema with the useful info for frontend
+    return {'type': type_name.lower(), 'optional': optional, 'default': default}
@@ -0,0 +1,143 @@
+import os
+from dataclasses import dataclass, fields
+from typing import Optional
+
+from openhands.core.config.config_utils import get_field_info
+from openhands.core.logger import LOG_DIR
+
+LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for the LLM model.
+
+    Attributes:
+        model: The model to use.
+        api_key: The API key to use.
+        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
+        api_version: The version of the API.
+        embedding_model: The embedding model to use.
+        embedding_base_url: The base URL for the embedding API.
+        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
+        aws_access_key_id: The AWS access key ID.
+        aws_secret_access_key: The AWS secret access key.
+        aws_region_name: The AWS region name.
+        openrouter_site_url: Exported to the OR_SITE_URL environment variable on init when non-empty.
+        openrouter_app_name: Exported to the OR_APP_NAME environment variable on init when non-empty.
+        num_retries: The number of retries to attempt.
+        retry_multiplier: The multiplier for the exponential backoff.
+        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
+        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
+        timeout: The timeout for the API.
+        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
+        temperature: The temperature for the API.
+        top_p: The top p for the API.
+        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
+        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
+        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
+        input_cost_per_token: The cost per input token. This will be available in logs for the user to check.
+        output_cost_per_token: The cost per output token. This will be available in logs for the user to check.
+        ollama_base_url: The base URL for the OLLAMA API.
+        drop_params: Drop any unmapped (unsupported) params without causing an exception.
+        modify_params: Modify params allows litellm to do transformations like adding a default message, when a message is empty.
+        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
+        caching_prompt: Use the prompt caching feature if provided by the LLM and supported by the provider.
+        log_completions: Whether to log LLM completions to the state.
+        log_completions_folder: The folder to log LLM completions to. Required if log_completions is True.
+        draft_editor: A more efficient LLM to use for file editing. Introduced in [PR 3985](https://github.com/All-Hands-AI/OpenHands/pull/3985).
+        custom_tokenizer: A custom tokenizer to use for token counting.
+    """
+
+    model: str = 'claude-3-5-sonnet-20241022'
+    api_key: str | None = None
+    base_url: str | None = None
+    api_version: str | None = None
+    embedding_model: str = 'local'
+    embedding_base_url: str | None = None
+    embedding_deployment_name: str | None = None
+    aws_access_key_id: str | None = None
+    aws_secret_access_key: str | None = None
+    aws_region_name: str | None = None
+    openrouter_site_url: str = 'https://docs.all-hands.dev/'
+    openrouter_app_name: str = 'OpenHands'
+    num_retries: int = 8
+    retry_multiplier: float = 2
+    retry_min_wait: int = 15
+    retry_max_wait: int = 120
+    timeout: int | None = None
+    max_message_chars: int = 30_000  # maximum number of characters in an observation's content when sent to the llm
+    temperature: float = 0.0
+    top_p: float = 1.0
+    custom_llm_provider: str | None = None
+    max_input_tokens: int | None = None
+    max_output_tokens: int | None = None
+    input_cost_per_token: float | None = None
+    output_cost_per_token: float | None = None
+    ollama_base_url: str | None = None
+    # This setting can be sent in each call to litellm
+    drop_params: bool = True
+    # Note: this setting is actually global, unlike drop_params
+    modify_params: bool = True
+    disable_vision: bool | None = None
+    caching_prompt: bool = True
+    log_completions: bool = False
+    log_completions_folder: str = os.path.join(LOG_DIR, 'completions')
+    draft_editor: Optional['LLMConfig'] = None
+    custom_tokenizer: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Describe every field for the frontend: type hint, default, and whether it's optional."""
+        return {f.name: get_field_info(f) for f in fields(self)}
+
+    def __post_init__(self):
+        """
+        Post-initialization hook that copies the OpenRouter settings into environment variables.
+        This ensures that these values are accessible to litellm at runtime.
+        """
+        openrouter_env = (
+            ('OR_SITE_URL', self.openrouter_site_url),
+            ('OR_APP_NAME', self.openrouter_app_name),
+        )
+        for env_name, env_value in openrouter_env:
+            # empty values leave the environment untouched
+            if env_value:
+                os.environ[env_name] = env_value
+
+    def __str__(self):
+        """Render as `LLMConfig(name=value, ...)` with the sensitive fields masked."""
+        rendered = []
+        for f in fields(self):
+            value = getattr(self, f.name)
+            if f.name in LLM_SENSITIVE_FIELDS:
+                value = '******' if value else None
+            rendered.append(f'{f.name}={repr(value)}')
+
+        return f"LLMConfig({', '.join(rendered)})"
+
+    def __repr__(self):
+        """Return the same masked rendering as `__str__`."""
+        return self.__str__()
+
+    def to_safe_dict(self):
+        """Return a dict with the sensitive fields replaced with ******."""
+        safe = {}
+        for key, value in self.__dict__.items():
+            if key in LLM_SENSITIVE_FIELDS:
+                safe[key] = '******' if value else None
+            elif isinstance(value, LLMConfig):
+                # nested configs (e.g. draft_editor) are masked recursively
+                safe[key] = value.to_safe_dict()
+            else:
+                safe[key] = value
+        return safe
+
+    @classmethod
+    def from_dict(cls, llm_config_dict: dict) -> 'LLMConfig':
+        """Create an LLMConfig object from a dictionary.
+
+        Dict-valued entries are dropped from the constructor arguments, with the
+        exception of the 'draft_editor' key, which is turned into a nested LLMConfig object.
+        """
+        args = {}
+        for key, value in llm_config_dict.items():
+            if not isinstance(value, dict):
+                args[key] = value
+        if 'draft_editor' in llm_config_dict:
+            args['draft_editor'] = LLMConfig(**llm_config_dict['draft_editor'])
+        return cls(**args)
@@ -0,0 +1,78 @@
+import os
+from dataclasses import dataclass, field, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SandboxConfig:
+    """Configuration for the sandbox.
+
+    Attributes:
+        remote_runtime_api_url: The hostname for the Remote Runtime API.
+        local_runtime_url: The default hostname for the local runtime. You may want to change to http://host.docker.internal for DIND environments
+        base_container_image: The base container image from which to build the runtime image.
+        runtime_container_image: The runtime container image to use.
+        user_id: The user ID for the sandbox.
+        timeout: The timeout for the default sandbox action execution.
+        remote_runtime_init_timeout: The timeout for the remote runtime to start.
+        enable_auto_lint: Whether to enable auto-lint.
+        use_host_network: Whether to use the host network.
+        initialize_plugins: Whether to initialize plugins.
+        force_rebuild_runtime: Whether to force rebuild the runtime image.
+        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
+            This will be rendered into the end of the Dockerfile that builds the runtime image.
+            It can contain any valid shell commands (e.g., pip install numpy).
+            The path to the interpreter is available as $OH_INTERPRETER_PATH,
+            which can be used to install dependencies for the OH-specific Python interpreter.
+        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
+            This is a dictionary of key-value pairs.
+            This is useful for setting environment variables that are needed by the runtime.
+            For example, for specifying the base url of website for browsergym evaluation.
+        browsergym_eval_env: The BrowserGym environment to use for evaluation.
+            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
+        platform: The platform on which the image should be built. Default is None.
+    """
+
+    remote_runtime_api_url: str = 'http://localhost:8000'
+    local_runtime_url: str = 'http://localhost'
+    keep_runtime_alive: bool = False
+    rm_all_containers: bool = False
+    api_key: str | None = None
+    base_container_image: str = 'nikolaik/python-nodejs:python3.12-nodejs22'  # default to nikolaik/python-nodejs:python3.12-nodejs22 for eventstream runtime
+    runtime_container_image: str | None = None
+    # evaluated once, when the class body is executed; 1000 where os.getuid is unavailable
+    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
+    timeout: int = 120
+    remote_runtime_init_timeout: int = 180
+    enable_auto_lint: bool = (
+        False  # once enabled, OpenHands would lint files after editing
+    )
+    use_host_network: bool = False
+    runtime_extra_build_args: list[str] | None = None
+    initialize_plugins: bool = True
+    force_rebuild_runtime: bool = False
+    runtime_extra_deps: str | None = None
+    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
+    browsergym_eval_env: str | None = None
+    platform: str | None = None
+    close_delay: int = 15
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        # named `result` so the builtin `dict` is not shadowed
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        """Render as `SandboxConfig(name=value, ...)` with `api_key` masked.
+
+        A set `api_key` is shown as '******' (None when empty) so the secret does
+        not appear in the rendered string.
+        """
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            # never render the API key in clear text
+            if attr_name == 'api_key':
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SandboxConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        """Return the same masked rendering as `__str__`."""
+        return self.__str__()
@@ -0,0 +1,40 @@
+from dataclasses import dataclass, fields
+
+from openhands.core.config.config_utils import get_field_info
+
+
+@dataclass
+class SecurityConfig:
+    """Configuration for security related functionalities.
+
+    Attributes:
+        confirmation_mode: Whether to enable confirmation mode.
+        security_analyzer: The security analyzer to use.
+    """
+
+    confirmation_mode: bool = False
+    security_analyzer: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Describe every field for the frontend: type hint, default, and whether it's optional."""
+        return {f.name: get_field_info(f) for f in fields(self)}
+
+    def __str__(self):
+        """Render as `SecurityConfig(name=value, ...)` using the repr of each field value."""
+        rendered = ', '.join(
+            f'{f.name}={repr(getattr(self, f.name))}' for f in fields(self)
+        )
+        return f'SecurityConfig({rendered})'
+
+    @classmethod
+    def from_dict(cls, security_config_dict: dict) -> 'SecurityConfig':
+        """Build a SecurityConfig by passing the dict's entries as keyword arguments."""
+        return cls(**security_config_dict)
+
+    def __repr__(self):
+        """Return the same rendering as `__str__`."""
+        return self.__str__()
@@ -0,0 +1,429 @@
+import argparse
+import os
+import pathlib
+import platform
+from dataclasses import is_dataclass
+from types import UnionType
+from typing import Any, MutableMapping, get_args, get_origin
+from uuid import uuid4
+
+import toml
+from dotenv import load_dotenv
+
+from openhands.core import logger
+from openhands.core.config.agent_config import AgentConfig
+from openhands.core.config.app_config import AppConfig
+from openhands.core.config.config_utils import (
+    OH_DEFAULT_AGENT,
+    OH_MAX_ITERATIONS,
+)
+from openhands.core.config.llm_config import LLMConfig
+from openhands.core.config.sandbox_config import SandboxConfig
+from openhands.core.config.security_config import SecurityConfig
+from openhands.storage import get_file_store
+from openhands.storage.files import FileStore
+
+JWT_SECRET = '.jwt_secret'
+load_dotenv()
+
+
+def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
+    """Apply env-style settings to the config, from env vars or a flat config.toml dict.
+    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
+
+    Each annotated field is looked up under the upper-cased `<prefix><field name>` key;
+    a present, non-empty value is cast to the annotated type and set on the config.
+
+    Args:
+        cfg: The AppConfig object to set attributes on.
+        env_or_toml_dict: The environment variables or a config.toml dict.
+    """
+
+    def get_optional_type(union_type: UnionType) -> Any:
+        """Returns the first non-None member of a Union, or None if there is none."""
+        for candidate in get_args(union_type):
+            if candidate is not type(None):
+                return candidate
+        return None
+
+    def set_attr_from_env(sub_config: Any, prefix=''):
+        """Set attributes of a config dataclass based on environment variables."""
+        for field_name, field_type in sub_config.__annotations__.items():
+            if is_dataclass(field_type):
+                # nested config: recurse with the field name as the new prefix
+                set_attr_from_env(
+                    getattr(sub_config, field_name), prefix=field_name + '_'
+                )
+                continue
+
+            # expected variable name, e.g. LLM_BASE_URL
+            env_var_name = (prefix + field_name).upper()
+            if env_var_name not in env_or_toml_dict:
+                continue
+
+            value = env_or_toml_dict[env_var_name]
+
+            # empty values leave the current (default) setting untouched
+            if not value:
+                continue
+
+            try:
+                # for an optional type, cast to its non-None member
+                target_type = field_type
+                if get_origin(target_type) is UnionType:
+                    target_type = get_optional_type(target_type)
+
+                # booleans are parsed from text; everything else is cast directly
+                if target_type is bool:
+                    cast_value = str(value).lower() in ['true', '1']
+                else:
+                    cast_value = target_type(value)
+                setattr(sub_config, field_name, cast_value)
+            except (ValueError, TypeError):
+                logger.openhands_logger.error(
+                    f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
+                )
+
+    # the root config object first (its nested dataclass fields are handled recursively)
+    set_attr_from_env(cfg)
+
+    # then the default LLM config, under the LLM_ prefix
+    set_attr_from_env(cfg.get_llm_config(), 'LLM_')
+    # and the default agent config, under the AGENT_ prefix
+    set_attr_from_env(cfg.get_agent_config(), 'AGENT_')
+
+
+def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
+    """Load the config from the toml file. Supports both styles of config vars.
+
+    A file without a [core] section is treated as old-style (flat, env-like keys)
+    and handed to `load_from_env`. Otherwise the [agent], [llm], [security],
+    [sandbox] and [core] sections are applied to `cfg`. A missing file is ignored
+    silently; parse errors and invalid section contents are logged as warnings.
+
+    Args:
+        cfg: The AppConfig object to update attributes of.
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+    """
+    # try to read the config.toml file into the config object
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError:
+        return
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+        return
+
+    # if core is not in the toml, try to use the old-style toml
+    if 'core' not in toml_config:
+        # re-use the env loader to set the config from env-style vars
+        load_from_env(cfg, toml_config)
+        return
+
+    core_config = toml_config['core']
+
+    # load llm configs and agent configs
+    for key, value in toml_config.items():
+        if isinstance(value, dict):
+            try:
+                if key is not None and key.lower() == 'agent':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default agent config from config toml'
+                    )
+                    # scalar entries form the default agent config ...
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    agent_config = AgentConfig(**non_dict_fields)
+                    cfg.set_agent_config(agent_config, 'agent')
+                    # ... and each nested table is a named agent config
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as agent config'
+                            )
+                            agent_config = AgentConfig(**nested_value)
+                            cfg.set_agent_config(agent_config, nested_key)
+                elif key is not None and key.lower() == 'llm':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default LLM config from config toml'
+                    )
+                    llm_config = LLMConfig.from_dict(value)
+                    cfg.set_llm_config(llm_config, 'llm')
+                    # each nested table is a named LLM config
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as llm config'
+                            )
+                            llm_config = LLMConfig.from_dict(nested_value)
+                            cfg.set_llm_config(llm_config, nested_key)
+                elif key is not None and key.lower() == 'security':
+                    logger.openhands_logger.debug(
+                        'Attempt to load security config from config toml'
+                    )
+                    security_config = SecurityConfig.from_dict(value)
+                    cfg.security = security_config
+                elif not key.startswith('sandbox') and key.lower() != 'core':
+                    # sandbox and core sections are handled after this loop
+                    logger.openhands_logger.warning(
+                        f'Unknown key in {toml_file}: "{key}"'
+                    )
+            except (TypeError, KeyError) as e:
+                logger.openhands_logger.warning(
+                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
+                    exc_info=False,
+                )
+        else:
+            # top-level scalar values are not part of the new-style format
+            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}"')
+
+    try:
+        # set sandbox config from the toml file
+        sandbox_config = cfg.sandbox
+
+        # migrate old sandbox configs from [core] section to sandbox config
+        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
+        for key in keys_to_migrate:
+            new_key = key.replace('sandbox_', '')
+            if new_key in sandbox_config.__annotations__:
+                # read the key in sandbox and remove it from core
+                setattr(sandbox_config, new_key, core_config.pop(key))
+            else:
+                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
+
+        # the new style values override the old style values
+        if 'sandbox' in toml_config:
+            sandbox_config = SandboxConfig(**toml_config['sandbox'])
+
+        # update the config object with the new values
+        cfg.sandbox = sandbox_config
+        for key, value in core_config.items():
+            if hasattr(cfg, key):
+                setattr(cfg, key, value)
+            else:
+                logger.openhands_logger.warning(f'Unknown core config key: {key}')
+    except (TypeError, KeyError) as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+
+
+def get_or_create_jwt_secret(file_store: FileStore) -> str:
+    """Return the JWT secret stored under JWT_SECRET, creating and storing one if absent.
+
+    A missing secret is replaced by a new random hex string (uuid4), which is
+    written to the store before being returned.
+    """
+    try:
+        return file_store.read(JWT_SECRET)
+    except FileNotFoundError:
+        pass
+
+    generated = uuid4().hex
+    file_store.write(JWT_SECRET, generated)
+    return generated
+
+
+def finalize_config(cfg: AppConfig):
+    """Normalize the loaded config in place.
+
+    Makes the workspace and completion-log paths absolute, derives the workspace
+    mount path, defaults each LLM's embedding base URL to its base URL, warns about
+    host networking on macOS, creates the cache directory, and makes sure a JWT
+    secret is set.
+    """
+    if cfg.workspace_base is not None:
+        cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+        if cfg.workspace_mount_path is None:
+            cfg.workspace_mount_path = cfg.workspace_base
+
+        if cfg.workspace_mount_rewrite:
+            # the rewrite rule has the form '<old>:<new>', applied to the workspace base
+            source_root = cfg.workspace_base or os.getcwd()
+            pieces = cfg.workspace_mount_rewrite.split(':')
+            cfg.workspace_mount_path = source_root.replace(pieces[0], pieces[1])
+
+    for llm_config in cfg.llms.values():
+        # make sure log_completions_folder is an absolute path
+        llm_config.log_completions_folder = os.path.abspath(
+            llm_config.log_completions_folder
+        )
+        if llm_config.embedding_base_url is None:
+            llm_config.embedding_base_url = llm_config.base_url
+
+    if cfg.sandbox.use_host_network:
+        if platform.system() == 'Darwin':
+            logger.openhands_logger.warning(
+                'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
+                'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
+            )
+
+    # make sure cache dir exists
+    if cfg.cache_dir:
+        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
+
+    if not cfg.jwt_secret:
+        secret_store = get_file_store(cfg.file_store, cfg.file_store_path)
+        cfg.jwt_secret = get_or_create_jwt_secret(secret_store)
+
+
+# Utility function for command line --group argument
+def get_llm_config_arg(
+    llm_config_arg: str, toml_file: str = 'config.toml'
+) -> LLMConfig | None:
+    """Get a group of llm settings from the config file.
+
+    A group in config.toml can look like this:
+
+    ```
+    [llm.gpt-3.5-for-eval]
+    model = 'gpt-3.5-turbo'
+    api_key = '...'
+    temperature = 0.5
+    num_retries = 8
+    ...
+    ```
+
+    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function.
+    The function builds an LLMConfig object from the settings of this group in the config file.
+
+    Note that the group must be under the "llm" table. Surrounding square brackets and a
+    leading "llm." in the argument are stripped before the lookup.
+
+    Args:
+        llm_config_arg: The group of llm settings to get from the config.toml file.
+        toml_file: Path to the configuration file to read from. Defaults to 'config.toml'.
+
+    Returns:
+        LLMConfig: The LLMConfig object with the settings from the config file, or None when
+        the file is missing, cannot be parsed, or does not contain the group.
+    """
+    # keep only the bare group name: drop brackets and the 'llm.' prefix, just in case
+    llm_config_arg = llm_config_arg.strip('[]').removeprefix('llm.')
+
+    logger.openhands_logger.debug(f'Loading llm config from {llm_config_arg}')
+
+    # load the toml file
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError as e:
+        logger.openhands_logger.error(f'Config file not found: {e}')
+        return None
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.error(
+            f'Cannot parse llm group from {llm_config_arg}. Exception: {e}'
+        )
+        return None
+
+    # bail out when the requested group is not in the [llm] table
+    if 'llm' not in toml_config or llm_config_arg not in toml_config['llm']:
+        logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
+        return None
+
+    return LLMConfig.from_dict(toml_config['llm'][llm_config_arg])
+
+
+# Command line arguments
+def get_parser() -> argparse.ArgumentParser:
+    """Get the parser for the command line arguments.
+
+    The arguments are declared as a table of (flags, options) pairs and registered
+    in order.
+    """
+    argument_specs = [
+        (
+            ('--config-file',),
+            dict(
+                type=str,
+                default='config.toml',
+                help='Path to the config file (default: config.toml in the current directory)',
+            ),
+        ),
+        (
+            ('-d', '--directory'),
+            dict(type=str, help='The working directory for the agent'),
+        ),
+        (
+            ('-t', '--task'),
+            dict(type=str, default='', help='The task for the agent to perform'),
+        ),
+        (
+            ('-f', '--file'),
+            dict(
+                type=str,
+                help='Path to a file containing the task. Overrides -t if both are provided.',
+            ),
+        ),
+        (
+            ('-c', '--agent-cls'),
+            dict(
+                default=OH_DEFAULT_AGENT,
+                type=str,
+                help='Name of the default agent to use',
+            ),
+        ),
+        (
+            ('-i', '--max-iterations'),
+            dict(
+                default=OH_MAX_ITERATIONS,
+                type=int,
+                help='The maximum number of iterations to run the agent',
+            ),
+        ),
+        (
+            ('-b', '--max-budget-per-task'),
+            dict(
+                type=float,
+                help='The maximum budget allowed per task, beyond which the agent will stop.',
+            ),
+        ),
+        # --eval configs are for evaluations only
+        (
+            ('--eval-output-dir',),
+            dict(
+                default='evaluation/evaluation_outputs/outputs',
+                type=str,
+                help='The directory to save evaluation output',
+            ),
+        ),
+        (
+            ('--eval-n-limit',),
+            dict(default=None, type=int, help='The number of instances to evaluate'),
+        ),
+        (
+            ('--eval-num-workers',),
+            dict(
+                default=4,
+                type=int,
+                help='The number of workers to use for evaluation',
+            ),
+        ),
+        (
+            ('--eval-note',),
+            dict(
+                default=None,
+                type=str,
+                help='The note to add to the evaluation directory',
+            ),
+        ),
+        (
+            ('-l', '--llm-config'),
+            dict(
+                default=None,
+                type=str,
+                help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
+            ),
+        ),
+        (
+            ('-n', '--name'),
+            dict(default='default', type=str, help='Name for the session'),
+        ),
+        (
+            ('--eval-ids',),
+            dict(
+                default=None,
+                type=str,
+                help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+            ),
+        ),
+        (
+            ('--no-auto-continue',),
+            dict(
+                action='store_true',
+                help='Disable automatic "continue" responses. Will read from stdin instead.',
+            ),
+        ),
+    ]
+
+    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+    for flags, options in argument_specs:
+        parser.add_argument(*flags, **options)
+    return parser
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse the command line arguments, discarding any the parser does not recognize."""
+    known_args, _unrecognized = get_parser().parse_known_args()
+    return known_args
+
+
+def load_app_config(
+    set_logging_levels: bool = True, config_file: str = 'config.toml'
+) -> AppConfig:
+    """Load the configuration from the specified config file and environment variables.
+
+    Sources are applied in order (defaults, toml file, environment), so a later
+    source overwrites values set by an earlier one; the result is then finalized.
+
+    Args:
+        set_logging_levels: Whether to set the global variables for logging levels.
+        config_file: Path to the config file. Defaults to 'config.toml' in the current directory.
+    """
+    app_config = AppConfig()
+    load_from_toml(app_config, config_file)
+    load_from_env(app_config, os.environ)
+    finalize_config(app_config)
+
+    if not set_logging_levels:
+        return app_config
+
+    # propagate the logging-related settings to the logger module globals
+    logger.DEBUG = app_config.debug
+    logger.DISABLE_COLOR_PRINTING = app_config.disable_color
+    return app_config
@@ -0,0 +1 @@
+TROUBLESHOOTING_URL = 'https://docs.all-hands.dev/modules/usage/troubleshooting'
@@ -0,0 +1,2 @@
+# Run this file to trigger a model download
+import openhands.agenthub  # noqa F401 (we import this to get the agents registered)
@@ -0,0 +1,123 @@
+class AgentNoInstructionError(Exception):
+    """Exception whose default message is 'Instruction must be provided'."""
+
+    def __init__(self, message='Instruction must be provided'):
+        super().__init__(message)
+
+
+class AgentEventTypeError(Exception):
+    """Exception whose default message is 'Event must be a dictionary'."""
+
+    def __init__(self, message='Event must be a dictionary'):
+        super().__init__(message)
+
+
+class AgentAlreadyRegisteredError(Exception):
+    """Error reporting that an agent class is already registered.
+
+    Args:
+        name: Optional registration name; when given, it is quoted in the message.
+    """
+
+    def __init__(self, name=None):
+        super().__init__(
+            f"Agent class already registered under '{name}'"
+            if name is not None
+            else 'Agent class already registered'
+        )
+
+
+class AgentNotRegisteredError(Exception):
+    """Error reporting that no agent class is registered.
+
+    Args:
+        name: Optional registration name; when given, it is quoted in the message.
+    """
+
+    def __init__(self, name=None):
+        super().__init__(
+            f"No agent class registered under '{name}'"
+            if name is not None
+            else 'No agent class registered'
+        )
+
+
+class TaskInvalidStateError(Exception):
+    """Error reporting an invalid state.
+
+    Args:
+        state: Optional state value; when given, it is appended to the message.
+    """
+
+    def __init__(self, state=None):
+        super().__init__(
+            f'Invalid state {state}' if state is not None else 'Invalid state'
+        )
+
+
+class BrowserInitException(Exception):
+    """Exception whose default message reports a failed browser environment initialization."""
+
+    def __init__(self, message='Failed to initialize browser environment'):
+        super().__init__(message)
+
+
+class BrowserUnavailableException(Exception):
+    """Exception whose default message reports that the browser environment is not available."""
+
+    def __init__(
+        self,
+        message='Browser environment is not available, please check if has been initialized',
+    ):
+        super().__init__(message)
+
+
+# This exception gets sent back to the LLM
+# It might be malformed JSON
+class LLMMalformedActionError(Exception):
+    """Exception carrying a message about a malformed response.
+
+    The message is kept on the instance as `self.message`, and `str()` of
+    the exception returns exactly that attribute.
+    """
+
+    def __init__(self, message='Malformed response'):
+        self.message = message
+        super().__init__(message)
+
+    def __str__(self):
+        # Return the stored attribute rather than relying on Exception.args.
+        return self.message
+
+
+# This exception gets sent back to the LLM
+# For some reason, the agent did not return an action
+class LLMNoActionError(Exception):
+    """Exception whose default message is 'Agent must return an action'."""
+
+    def __init__(self, message='Agent must return an action'):
+        super().__init__(message)
+
+
+# This exception gets sent back to the LLM
+# The LLM output did not include an action, or the action was not the expected type
+class LLMResponseError(Exception):
+    """Exception whose default message is 'Failed to retrieve action from LLM response'."""
+
+    def __init__(self, message='Failed to retrieve action from LLM response'):
+        super().__init__(message)
+
+
+class UserCancelledError(Exception):
+    """Exception whose default message is 'User cancelled the request'."""
+
+    def __init__(self, message='User cancelled the request'):
+        super().__init__(message)
+
+
+class MicroAgentValidationError(Exception):
+    """Exception whose default message is 'Micro agent validation failed'."""
+
+    def __init__(self, message='Micro agent validation failed'):
+        super().__init__(message)
+
+
+class OperationCancelled(Exception):
+    """Exception raised when an operation is cancelled (e.g. by a keyboard interrupt).
+
+    Args:
+        message: Text of the exception. Defaults to 'Operation was cancelled'.
+    """
+
+    def __init__(self, message='Operation was cancelled'):
+        super().__init__(message)
+
+
+class CloudFlareBlockageError(Exception):
+    """Exception raised when a request is blocked by CloudFlare.
+
+    Adds nothing to Exception; it exists only as a distinct type.
+    """
+
+
+class FunctionCallConversionError(Exception):
+    """Exception raised when FunctionCallingConverter fails to convert a non-function call message to a function call message.
+
+    This typically happens when there's a malformed message (e.g., missing <function=...> tags), rather than because of LLM output.
+
+    Args:
+        message: Text of the exception (required; there is no default).
+    """
+
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class FunctionCallValidationError(Exception):
+    """Exception raised when FunctionCallingConverter fails to validate a function call message.
+
+    This typically happens when the LLM outputs unrecognized function call / parameter names / values.
+
+    Args:
+        message: Text of the exception (required; there is no default).
+    """
+
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class FunctionCallNotExistsError(Exception):
+    """Exception raised when an LLM calls a tool that is not registered.
+
+    Args:
+        message: Text of the exception (required; there is no default).
+    """
+
+    def __init__(self, message):
+        super().__init__(message)
@@ -0,0 +1,355 @@
+import copy
+import logging
+import os
+import re
+import sys
+import traceback
+from datetime import datetime
+from types import TracebackType
+from typing import Any, Literal, Mapping
+
+from termcolor import colored
+
+# Log level name from the LOG_LEVEL environment variable, upper-cased ('INFO' when unset).
+LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO').upper()
+# Boolean env flags are true when the variable is 'true', '1' or 'yes' (case-insensitive).
+DEBUG = os.getenv('DEBUG', 'False').lower() in ['true', '1', 'yes']
+if DEBUG:
+    # DEBUG overrides whatever LOG_LEVEL was set to.
+    LOG_LEVEL = 'DEBUG'
+
+LOG_TO_FILE = os.getenv('LOG_TO_FILE', 'False').lower() in ['true', '1', 'yes']
+# Not read from the environment here; ColoredFormatter skips coloring when this is true.
+DISABLE_COLOR_PRINTING = False
+
+LOG_ALL_EVENTS = os.getenv('LOG_ALL_EVENTS', 'False').lower() in ['true', '1', 'yes']
+
+# Color names accepted as values of LOG_COLORS (passed to termcolor's `colored`).
+ColorType = Literal[
+    'red',
+    'green',
+    'yellow',
+    'blue',
+    'magenta',
+    'cyan',
+    'light_grey',
+    'dark_grey',
+    'light_red',
+    'light_green',
+    'light_yellow',
+    'light_blue',
+    'light_magenta',
+    'light_cyan',
+    'white',
+]
+
+# Maps a record's message type to the color ColoredFormatter prints it in.
+# The 'USER_*' keys are looked up when a record's event_source upper-cases to 'USER'.
+LOG_COLORS: Mapping[str, ColorType] = {
+    'ACTION': 'green',
+    'USER_ACTION': 'light_red',
+    'OBSERVATION': 'yellow',
+    'USER_OBSERVATION': 'light_green',
+    'DETAIL': 'cyan',
+    'ERROR': 'red',
+    'PLAN': 'light_magenta',
+}
+
+
+class NoColorFormatter(logging.Formatter):
+    """Formatter for non-colored logging in files."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Format `record` with ANSI color codes removed from its message.
+
+        The original record is left untouched so other handlers still see
+        the colored message.
+
+        Args:
+            record: The log record to format.
+
+        Returns:
+            The formatted, color-free log line.
+        """
+        # A shallow copy is enough because only `msg` is reassigned below.
+        # A deep copy can fail for records that reference objects which
+        # cannot be deep-copied (e.g. tracebacks held in `exc_info`).
+        plain_record: logging.LogRecord = copy.copy(record)
+        # `msg` may be any object (logging calls str() on it later); only
+        # strings can contain ANSI codes to strip.
+        if isinstance(plain_record.msg, str):
+            plain_record.msg = strip_ansi(plain_record.msg)
+
+        return super().format(plain_record)
+
+
+def strip_ansi(s: str) -> str:
+    """Return `s` without ANSI color/formatting escape sequences.
+
+    Only sequences of the form ESC '[' followed by one to three
+    ';'-separated numbers and a final 'm' are removed.
+
+    Pattern taken from
+    https://github.com/ewen-lbh/python-strip-ansi/blob/master/strip_ansi/__init__.py
+    (ANSI escape sequences are defined by ECMA-048, see
+    http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-048.pdf).
+    """
+    return re.sub(r'\x1B\[\d+(;\d+){0,2}m', '', s)
+
+
+class ColoredFormatter(logging.Formatter):
+    """Console formatter that colors records according to their `msg_type`.
+
+    `msg_type` and `event_source` are read from the record's attributes.
+    When `event_source` is set and '<SOURCE>_<msg_type>' is a LOG_COLORS key,
+    that combined key is used instead of the plain `msg_type`.
+    """
+
+    def format(self, record):
+        """Return the text to print for `record`.
+
+        Records whose message type is in LOG_COLORS are colored (unless
+        DISABLE_COLOR_PRINTING is set), 'STEP' records are printed as the raw
+        message, and everything else uses the base Formatter.
+        """
+        attrs = record.__dict__
+        kind = attrs.get('msg_type')
+        source = attrs.get('event_source')
+        if source:
+            qualified_kind = f'{source.upper()}_{kind}'
+            if qualified_kind in LOG_COLORS:
+                kind = qualified_kind
+
+        if kind in LOG_COLORS and not DISABLE_COLOR_PRINTING:
+            color = LOG_COLORS[kind]
+            kind_str = colored(kind, color)
+            body = colored(record.msg, color)
+            stamp = colored(self.formatTime(record, self.datefmt), color)
+            if kind == 'ERROR' or DEBUG:
+                # Errors (and everything in debug mode) also show logger, level and location.
+                logger_name = colored(record.name, color)
+                level_name = colored(record.levelname, color)
+                return f'{stamp} - {logger_name}:{level_name}: {record.filename}:{record.lineno}\n{kind_str}\n{body}'
+            return f'{stamp} - {kind_str}\n{body}'
+
+        if kind == 'STEP':
+            if LOG_ALL_EVENTS:
+                return '\n\n==============\n' + record.msg + '\n'
+            return record.msg
+
+        return super().format(record)
+
+
+# Formatter installed on the file handler by get_file_handler: color-free, with time,
+# logger name, level and source location.
+file_formatter = NoColorFormatter(
+    '%(asctime)s - %(name)s:%(levelname)s: %(filename)s:%(lineno)s - %(message)s',
+    datefmt='%H:%M:%S',
+)
+# Formatter for the LLM prompt/response files: the bare message only.
+llm_formatter = logging.Formatter('%(message)s')
+
+
+class RollingLogger:
+    """Fixed-height rolling display of the most recent log lines on stdout.
+
+    Nothing is written unless DEBUG is set and stdout is a terminal
+    (see `is_enabled`).
+    """
+
+    max_lines: int
+    char_limit: int
+    log_lines: list[str]
+
+    def __init__(self, max_lines=10, char_limit=80):
+        """Create the logger with an empty buffer of `max_lines` lines.
+
+        Args:
+            max_lines: Number of lines kept and displayed.
+            char_limit: Maximum number of characters kept per line.
+        """
+        self.max_lines = max_lines
+        self.char_limit = char_limit
+        self.log_lines = [''] * self.max_lines
+
+    def is_enabled(self):
+        """Return a truthy value only when DEBUG is on and stdout is a TTY."""
+        return DEBUG and sys.stdout.isatty()
+
+    def start(self, message=''):
+        """Print `message` (if any) and reserve `max_lines` blank lines for the display."""
+        if message:
+            print(message)
+        self._write('\n' * self.max_lines)
+        self._flush()
+
+    def add_line(self, line):
+        """Drop the oldest line, append `line` truncated to `char_limit`, and redraw."""
+        self.log_lines.pop(0)
+        self.log_lines.append(line[: self.char_limit])
+        self.print_lines()
+
+    def write_immediately(self, line):
+        """Write `line` to stdout and flush, bypassing the rolling buffer."""
+        self._write(line)
+        self._flush()
+
+    def print_lines(self):
+        """Display the last n log_lines in the console (not for file logging).
+
+        This will create the effect of a rolling display in the console.
+        """
+        self.move_back()
+        for line in self.log_lines:
+            self.replace_current_line(line)
+
+    def move_back(self, amount=-1):
+        r"""Move the cursor up; '\033[F' moves the cursor up one line.
+
+        Args:
+            amount: Number of lines to move up. -1 (the default) means `max_lines`.
+        """
+        if amount == -1:
+            amount = self.max_lines
+        # Use the resolved amount, not always max_lines, so explicit values are honored.
+        self._write('\033[F' * amount)
+        self._flush()
+
+    def replace_current_line(self, line=''):
+        r"""Overwrite the current line: '\033[2K' clears it, then `line` and a newline are written."""
+        self._write('\033[2K' + line + '\n')
+        self._flush()
+
+    def _write(self, line):
+        """Write `line` to stdout when the logger is enabled."""
+        if not self.is_enabled():
+            return
+        sys.stdout.write(line)
+
+    def _flush(self):
+        """Flush stdout when the logger is enabled."""
+        if not self.is_enabled():
+            return
+        sys.stdout.flush()
+
+
+class SensitiveDataFilter(logging.Filter):
+    """Logging filter that masks the values of known secret-bearing keys.
+
+    Any occurrence of `<key>=<value>` in the formatted message, for a key
+    in the list below, is rewritten to `<key>='******'`.
+    """
+
+    # Attribute-style key names whose values must not reach the logs.
+    _SENSITIVE_ATTRS = (
+        'api_key',
+        'aws_access_key_id',
+        'aws_secret_access_key',
+        'e2b_api_key',
+        'github_token',
+        'jwt_secret',
+        'modal_api_token_id',
+        'modal_api_token_secret',
+    )
+
+    # Attribute names, their upper-cased env var forms, and some special cases.
+    # dict.fromkeys drops the duplicates while keeping the original order.
+    _SENSITIVE_KEYS = tuple(
+        dict.fromkeys(
+            _SENSITIVE_ATTRS
+            + tuple(attr.upper() for attr in _SENSITIVE_ATTRS)
+            + (
+                'JWT_SECRET',
+                'LLM_API_KEY',
+                'GITHUB_TOKEN',
+                'SANDBOX_ENV_GITHUB_TOKEN',
+            )
+        )
+    )
+
+    # Compiled once instead of on every record. A single-quoted value is
+    # masked up to its closing quote (so secrets containing '.', '/', '+',
+    # spaces, ... are fully hidden); otherwise the run of word/hyphen
+    # characters is masked, with optional stray quotes around it.
+    _MASKS = tuple(
+        (re.compile(rf"{key}=(?:'[^'\n]*'|'?[\w-]+'?)"), f"{key}='******'")
+        for key in _SENSITIVE_KEYS
+    )
+
+    def filter(self, record):
+        """Mask sensitive values in `record` and let the record through.
+
+        The message is first rendered with its % args, so `record.args` is
+        cleared and `record.msg` holds the final, masked text.
+
+        Args:
+            record: The log record to sanitize (modified in place).
+
+        Returns:
+            Always True; the record is never dropped.
+        """
+        # this also formats the message with % args
+        msg = record.getMessage()
+        record.args = ()
+
+        for pattern, masked in self._MASKS:
+            msg = pattern.sub(masked, msg)
+
+        # passed with msg
+        record.msg = msg
+        return True
+
+
+def get_console_handler(log_level: int = logging.INFO, extra_info: str | None = None):
+    """Build a StreamHandler that formats records with ColoredFormatter.
+
+    Args:
+        log_level: Level set on the handler.
+        extra_info: Optional text prefixed (followed by ' - ') to every formatted line.
+
+    Returns:
+        The configured console handler.
+    """
+    base_format = '\033[92m%(asctime)s - %(name)s:%(levelname)s\033[0m: %(filename)s:%(lineno)s - %(message)s'
+    line_format = f'{extra_info} - ' + base_format if extra_info else base_format
+
+    handler = logging.StreamHandler()
+    handler.setLevel(log_level)
+    handler.setFormatter(ColoredFormatter(line_format, datefmt='%H:%M:%S'))
+    return handler
+
+
+def get_file_handler(log_dir: str, log_level: int = logging.INFO):
+    """Build a FileHandler writing to 'openhands_<YYYY-MM-DD>.log' in `log_dir`.
+
+    The directory is created if it does not exist, and the handler uses
+    the module's color-free `file_formatter`.
+
+    Args:
+        log_dir: Directory in which the log file is placed.
+        log_level: Level set on the handler.
+
+    Returns:
+        The configured file handler.
+    """
+    os.makedirs(log_dir, exist_ok=True)
+    today = datetime.now().strftime('%Y-%m-%d')
+    log_path = os.path.join(log_dir, f'openhands_{today}.log')
+
+    handler = logging.FileHandler(log_path)
+    handler.setLevel(log_level)
+    handler.setFormatter(file_formatter)
+    return handler
+
+
+# Set up logging: configure the root logger at ERROR level.
+logging.basicConfig(level=logging.ERROR)
+
+
+def log_uncaught_exceptions(
+    ex_cls: type[BaseException], ex: BaseException, tb: TracebackType | None
+) -> Any:
+    """Log an uncaught exception through the root logger at ERROR level.
+
+    The formatted traceback (when one is available) is logged first,
+    followed by a '<exception class>: <exception>' line.
+
+    Args:
+        ex_cls: The type of the exception.
+        ex: The exception instance.
+        tb: The traceback object, or None.
+
+    Returns:
+        None
+    """
+    if tb is not None:
+        frames = traceback.format_tb(tb)
+        logging.error(''.join(frames))
+    logging.error(f'{ex_cls}: {ex}')
+
+
+# Route uncaught exceptions through log_uncaught_exceptions.
+sys.excepthook = log_uncaught_exceptions
+openhands_logger = logging.getLogger('openhands')
+current_log_level = logging.INFO
+
+# Use LOG_LEVEL when it names a known logging level; otherwise stay at INFO.
+if LOG_LEVEL in logging.getLevelNamesMapping():
+    current_log_level = logging.getLevelNamesMapping()[LOG_LEVEL]
+openhands_logger.setLevel(current_log_level)
+
+# Debug level forces file logging on, regardless of the LOG_TO_FILE env var.
+if current_log_level == logging.DEBUG:
+    LOG_TO_FILE = True
+    openhands_logger.debug('DEBUG mode enabled.')
+
+openhands_logger.addHandler(get_console_handler(current_log_level))
+# Mask secrets in every record logged through this logger.
+openhands_logger.addFilter(SensitiveDataFilter(openhands_logger.name))
+# Do not pass records on to the root logger's handlers.
+openhands_logger.propagate = False
+openhands_logger.debug('Logging initialized')
+
+LOG_DIR = os.path.join(
+    # parent dir of openhands/core (i.e., root of the repo)
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+    'logs',
+)
+
+if LOG_TO_FILE:
+    openhands_logger.addHandler(
+        get_file_handler(LOG_DIR, current_log_level)
+    )  # default log to project root
+    openhands_logger.debug(f'Logging to file in: {LOG_DIR}')
+
+# Exclude LiteLLM from logging output
+logging.getLogger('LiteLLM').disabled = True
+logging.getLogger('LiteLLM Router').disabled = True
+logging.getLogger('LiteLLM Proxy').disabled = True
+
+
+class LlmFileHandler(logging.FileHandler):
+    """LLM prompt and response logging.
+
+    Each emitted record is written to its own numbered file,
+    '<filename>_NNN.log', inside LOG_DIR/llm/<session>.
+    """
+
+    def __init__(self, filename, mode='a', encoding='utf-8', delay=False):
+        """Initializes an instance of LlmFileHandler.
+
+        In debug mode the session directory is named after the current
+        date and time; otherwise it is 'default' and any files already in
+        it are deleted.
+
+        Args:
+            filename (str): The name of the log file.
+            mode (str, optional): The file mode. Defaults to 'a'.
+            encoding (str, optional): The file encoding. Defaults to 'utf-8'.
+            delay (bool, optional): Whether to delay file opening. Defaults to False.
+        """
+        self.filename = filename
+        self.message_counter = 1
+        if DEBUG:
+            self.session = datetime.now().strftime('%y-%m-%d_%H-%M')
+        else:
+            self.session = 'default'
+        self.log_directory = os.path.join(LOG_DIR, 'llm', self.session)
+        os.makedirs(self.log_directory, exist_ok=True)
+        if not DEBUG:
+            # Clear the log directory if not in debug mode
+            for file in os.listdir(self.log_directory):
+                file_path = os.path.join(self.log_directory, file)
+                try:
+                    os.unlink(file_path)
+                except Exception as e:
+                    # Best effort: report the failure and keep going.
+                    openhands_logger.error(
+                        'Failed to delete %s. Reason: %s', file_path, e
+                    )
+        filename = f'{self.filename}_{self.message_counter:03}.log'
+        self.baseFilename = os.path.join(self.log_directory, filename)
+        super().__init__(self.baseFilename, mode, encoding, delay)
+
+    def emit(self, record):
+        """Emits a log record into its own numbered file.
+
+        Args:
+            record (logging.LogRecord): The log record to emit.
+        """
+        filename = f'{self.filename}_{self.message_counter:03}.log'
+        self.baseFilename = os.path.join(self.log_directory, filename)
+        # A non-delayed constructor leaves a stream open; close it before replacing it.
+        if self.stream is not None:
+            self.stream.close()
+        stream = self.stream = self._open()
+        try:
+            super().emit(record)
+        finally:
+            stream.close()
+            # Reset so a later flush()/close() never touches a closed file.
+            self.stream = None
+        openhands_logger.debug('Logging to %s', self.baseFilename)
+        self.message_counter += 1
+
+
+def _get_llm_file_handler(name: str, log_level: int):
+    """Create an LlmFileHandler for `name` with the LLM formatter and `log_level`.
+
+    The handler is created with delay=True, which postpones opening the
+    log file until the first log message is emitted.
+    """
+    handler = LlmFileHandler(name, delay=True)
+    handler.setLevel(log_level)
+    handler.setFormatter(llm_formatter)
+    return handler
+
+
+def _setup_llm_logger(name: str, log_level: int):
+    """Return the logger called `name`, configured for LLM logging.
+
+    The logger does not propagate to its ancestors, and it gets an LLM file
+    handler only when LOG_TO_FILE is set.
+    """
+    llm_logger = logging.getLogger(name)
+    llm_logger.setLevel(log_level)
+    llm_logger.propagate = False
+    if not LOG_TO_FILE:
+        return llm_logger
+    llm_logger.addHandler(_get_llm_file_handler(name, log_level))
+    return llm_logger
+
+
+# Dedicated loggers for LLM prompts and responses, at the same level as openhands_logger.
+llm_prompt_logger = _setup_llm_logger('prompt', current_log_level)
+llm_response_logger = _setup_llm_logger('response', current_log_level)
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`TROUBLESHOOTING_URL = 'https://docs.all-hands.dev/modules/usage/troubleshooting'`