patch for eval

update runtime to be compatible
2026-04-29 03:00:45 -04:00 · 2024-10-15 12:45:02 +00:00 · 2024-10-10 18:20:14 +00:00
161 changed files with 2902 additions and 5542 deletions
@@ -1 +1 @@
-The files in this directory configure a development container for GitHub Codespaces.
+The files in this directory configure a development container for GitHub Codespaces.
@@ -115,6 +115,15 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -145,7 +154,8 @@ jobs:
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
-          # We install pytest-xdist in order to run tests across CPUs
+          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
+          # them across more than 2 CPUs for some reason
          poetry run pip install pytest-xdist

          # Install to be able to retry on failures for flaky tests
@@ -157,10 +167,10 @@ jobs:
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -176,6 +186,15 @@ jobs:
        base_image: ['nikolaik']
    steps:
      - uses: actions/checkout@v4
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          swap-storage: true
      # Forked repos can't push to GHCR, so we need to download the image as an artifact
      - name: Download runtime image for fork
        if: github.event.pull_request.head.repo.fork
@@ -206,7 +225,8 @@ jobs:
        run: make install-python-dependencies
      - name: Run runtime tests
        run: |
-          # We install pytest-xdist in order to run tests across CPUs
+          # We install pytest-xdist in order to run tests across CPUs. However, tests start to fail when we run
+          # them across more than 2 CPUs for some reason
          poetry run pip install pytest-xdist

          # Install to be able to retry on failures for flaky tests
@@ -218,10 +238,10 @@ jobs:
          SKIP_CONTAINER_LOGS=true \
          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raR --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 --reruns 1 --reruns-delay 3 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        env:
@@ -273,7 +293,7 @@ jobs:

          TEST_RUNTIME=eventstream \
          SANDBOX_USER_ID=$(id -u) \
-          SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
+          SANDBOX_BASE_CONTAINER_IMAGE=$image_name \
          TEST_IN_CI=true \
          TEST_ONLY=true \
          ./tests/integration/regenerate.sh
@@ -228,4 +228,3 @@ runtime_*.tar
 # docker build
 containers/runtime/Dockerfile
 containers/runtime/project.tar.gz
-containers/runtime/code
@@ -97,28 +97,3 @@ Please refer to [this README](./tests/integration/README.md) for details.
 ### 9. Add or update dependency
 1. Add your dependency in `pyproject.toml` or use `poetry add xxx`
 2. Update the poetry.lock file via `poetry lock --no-update`
-
-## Develop inside Docker container
-
-TL;DR
-
-```bash
-make docker-dev
-```
-
-See more details [here](./containers/dev/README.md)
-
-If you are just interested in running `OpenHands` without installing all the required tools on your host.
-
-```bash
-make docker-run
-```
-
-If you do not have `make` on your host, run:
-
-```bash
-cd ./containers/dev
-./dev.sh
-```
-
-You do need [Docker](https://docs.docker.com/engine/install/) installed on your host though.
@@ -2,9 +2,8 @@ SHELL=/bin/bash
 # Makefile for OpenHands project

 # Variables
-BACKEND_HOST ?= "127.0.0.1"
 BACKEND_PORT = 3000
-BACKEND_HOST_PORT = "$(BACKEND_HOST):$(BACKEND_PORT)"
+BACKEND_HOST = "127.0.0.1:$(BACKEND_PORT)"
 FRONTEND_PORT = 3001
 DEFAULT_WORKSPACE_DIR = "./workspace"
 DEFAULT_MODEL = "gpt-4o"
@@ -190,12 +189,12 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
+	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"

 # Start frontend
 start-frontend:
 	@echo "$(YELLOW)Starting frontend...$(RESET)"
-	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST_PORT) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start
+	@cd frontend && VITE_BACKEND_HOST=$(BACKEND_HOST) VITE_FRONTEND_PORT=$(FRONTEND_PORT) npm run start

 # Common setup for running the app (non-callable)
 _run_setup:
@@ -205,7 +204,7 @@ _run_setup:
 	fi
 	@mkdir -p logs
 	@echo "$(YELLOW)Starting backend server...$(RESET)"
-	@poetry run uvicorn openhands.server.listen:app --host $(BACKEND_HOST) --port $(BACKEND_PORT) &
+	@poetry run uvicorn openhands.server.listen:app --port $(BACKEND_PORT) &
 	@echo "$(YELLOW)Waiting for the backend to start...$(RESET)"
 	@until nc -z localhost $(BACKEND_PORT); do sleep 0.1; done
 	@echo "$(GREEN)Backend started successfully.$(RESET)"
@@ -217,20 +216,6 @@ run:
 	@cd frontend && echo "$(BLUE)Starting frontend with npm...$(RESET)" && npm run start -- --port $(FRONTEND_PORT)
 	@echo "$(GREEN)Application started successfully.$(RESET)"

-# Run the app (in docker)
-docker-run: WORKSPACE_BASE ?= $(PWD)/workspace
-docker-run:
-	@if [ -f /.dockerenv ]; then \
-		echo "Running inside a Docker container. Exiting..."; \
-		exit 0; \
-	else \
-		echo "$(YELLOW)Running the app in Docker $(OPTIONS)...$(RESET)"; \
-		export WORKSPACE_BASE=${WORKSPACE_BASE}; \
-		export SANDBOX_USER_ID=$(shell id -u); \
-		export DATE=$(shell date +%Y%m%d%H%M%S); \
-		docker compose up $(OPTIONS); \
-	fi
-
 # Run the app (WSL mode)
 run-wsl:
 	@echo "$(YELLOW)Running the app in WSL mode...$(RESET)"
@@ -295,16 +280,6 @@ setup-config-prompts:
 		fi


-# Develop in container
-docker-dev:
-	@if [ -f /.dockerenv ]; then \
-		echo "Running inside a Docker container. Exiting..."; \
-		exit 0; \
-	else \
-		echo "$(YELLOW)Build and run in Docker $(OPTIONS)...$(RESET)"; \
-		./containers/dev/dev.sh $(OPTIONS); \
-	fi
-
 # Clean up all caches
 clean:
 	@echo "$(YELLOW)Cleaning up caches...$(RESET)"
@@ -323,10 +298,7 @@ help:
 	@echo "  $(GREEN)start-frontend$(RESET)      - Start the frontend server for the OpenHands project."
 	@echo "  $(GREEN)run$(RESET)                 - Run the OpenHands application, starting both backend and frontend servers."
 	@echo "                        Backend Log file will be stored in the 'logs' directory."
-	@echo "  $(GREEN)docker-dev$(RESET)          - Build and run the OpenHands application in Docker."
-	@echo "  $(GREEN)docker-run$(RESET)          - Run the OpenHands application, starting both backend and frontend servers in Docker."
 	@echo "  $(GREEN)help$(RESET)                - Display this help message, providing information on available targets."

 # Phony targets
 .PHONY: build check-dependencies check-python check-npm check-docker check-poetry install-python-dependencies install-frontend-dependencies install-pre-commit-hooks lint start-backend start-frontend run run-wsl setup-config setup-config-prompts help
-.PHONY: docker-dev docker-run
@@ -216,8 +216,10 @@ class BrowsingAgent(Agent):
        prompt = get_prompt(error_prefix, cur_url, cur_axtree_txt, prev_action_str)
        messages.append(Message(role='user', content=[TextContent(text=prompt)]))

+        flat_messages = self.llm.format_messages_for_llm(messages)
+
        response = self.llm.completion(
-            messages=self.llm.format_messages_for_llm(messages),
+            messages=flat_messages,
            temperature=0.0,
            stop=[')```', ')\n```'],
        )
@@ -57,7 +57,7 @@ class Flags:

    @classmethod
    def from_dict(self, flags_dict):
-        """Helper for JSON serializable requirement."""
+        """Helper for JSON serializable requirement."""
        if isinstance(flags_dict, Flags):
            return flags_dict

@@ -6,7 +6,6 @@ from openhands.events.action import (
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
@@ -17,7 +16,6 @@ class CodeActResponseParser(ResponseParser):
    - CmdRunAction(command) - bash command to run
    - IPythonRunCellAction(code) - IPython code to run
    - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-    - FileEditAction(diff_block) - Search/Replace block to edit.
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
    - AgentFinishAction() - end the interaction
    """
@@ -30,7 +28,6 @@ class CodeActResponseParser(ResponseParser):
            CodeActActionParserCmdRun(),
            CodeActActionParserIPythonRunCell(),
            CodeActActionParserAgentDelegate(),
-            CodeActActionParserFileEdit(),
        ]
        self.default_parser = CodeActActionParserMessage()

@@ -42,7 +39,7 @@ class CodeActResponseParser(ResponseParser):
        action = response.choices[0].message.content
        if action is None:
            return ''
-        for lang in ['bash', 'ipython', 'edit', 'browse']:
+        for lang in ['bash', 'ipython', 'browse']:
            if f'<execute_{lang}>' in action and f'</execute_{lang}>' not in action:
                action += f'</execute_{lang}>'
        return action
@@ -161,33 +158,6 @@ class CodeActActionParserAgentDelegate(ActionParser):
        return AgentDelegateAction(agent='BrowsingAgent', inputs={'task': task})


-class CodeActActionParserFileEdit(ActionParser):
-    """Parser action:
-    - FileEditAction(diff_block) - Search/Replace block to edit.
-    """
-
-    def __init__(
-        self,
-    ):
-        self.diff_block = None
-
-    def check_condition(self, action_str: str) -> bool:
-        self.diff_block = re.search(
-            r'<execute_edit>(.*)</execute_edit>', action_str, re.DOTALL
-        )
-        return self.diff_block is not None
-
-    def parse(self, action_str: str) -> Action:
-        assert (
-            self.diff_block is not None
-        ), 'self.diff_block should not be None when parse is called'
-        thought = action_str.replace(self.diff_block.group(0), '').strip()
-        return FileEditAction(
-            diff_block=self.diff_block.group(1).strip(),
-            thought=thought,
-        )
-
-
 class CodeActActionParserMessage(ActionParser):
    """Parser action:
    - MessageAction(content) - Message action to run (e.g. ask for clarification)
@@ -5,7 +5,6 @@ from agenthub.codeact_agent.action_parser import CodeActResponseParser
 from openhands.controller.agent import Agent
 from openhands.controller.state.state import State
 from openhands.core.config import AgentConfig
-from openhands.core.exceptions import OperationCancelled
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.message import ImageContent, Message, TextContent
 from openhands.events.action import (
@@ -13,14 +12,12 @@ from openhands.events.action import (
    AgentDelegateAction,
    AgentFinishAction,
    CmdRunAction,
-    FileEditAction,
    IPythonRunCellAction,
    MessageAction,
 )
 from openhands.events.observation import (
    AgentDelegateObservation,
    CmdOutputObservation,
-    FileEditObservation,
    IPythonRunCellObservation,
    UserRejectObservation,
 )
@@ -38,7 +35,7 @@ from openhands.utils.prompt import PromptManager


 class CodeActAgent(Agent):
-    VERSION = '1.10'
+    VERSION = '1.9'
    """
    The Code Act Agent is a minimalist agent.
    The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
@@ -106,8 +103,6 @@ class CodeActAgent(Agent):
            return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
        elif isinstance(action, AgentDelegateAction):
            return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
-        elif isinstance(action, FileEditAction):
-            return f'{action.thought}\n<execute_edit>\n{action.diff_block}\n</execute_edit>'
        elif isinstance(action, MessageAction):
            return action.content
        elif isinstance(action, AgentFinishAction) and action.source == 'agent':
@@ -117,7 +112,6 @@ class CodeActAgent(Agent):
    def get_action_message(self, action: Action) -> Message | None:
        if (
            isinstance(action, AgentDelegateAction)
-            or isinstance(action, FileEditAction)
            or isinstance(action, CmdRunAction)
            or isinstance(action, IPythonRunCellAction)
            or isinstance(action, MessageAction)
@@ -158,21 +152,15 @@ class CodeActAgent(Agent):
            text = '\n'.join(splitted)
            text = truncate_content(text, max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
-        elif isinstance(obs, FileEditObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
-            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, AgentDelegateObservation):
-            text = obs_prefix + truncate_content(
-                obs.outputs['content'] if 'content' in obs.outputs else '',
-                max_message_chars,
-            )
+            text = obs_prefix + truncate_content(str(obs.outputs), max_message_chars)
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, ErrorObservation):
            text = obs_prefix + truncate_content(obs.content, max_message_chars)
            text += '\n[Error occurred in processing last action]'
            return Message(role='user', content=[TextContent(text=text)])
        elif isinstance(obs, UserRejectObservation):
-            text = obs_prefix + truncate_content(obs.content, max_message_chars)
+            text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
            text += '\n[Last action has been rejected by the user]'
            return Message(role='user', content=[TextContent(text=text)])
        else:
@@ -195,7 +183,6 @@ class CodeActAgent(Agent):
        - CmdRunAction(command) - bash command to run
        - IPythonRunCellAction(code) - IPython code to run
        - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
-        - FileEditAction(diff_block) - Search/Replace block to edit.
        - MessageAction(content) - Message action to run (e.g. ask for clarification)
        - AgentFinishAction() - end the interaction
        """
@@ -212,8 +199,8 @@ class CodeActAgent(Agent):
                '</execute_ipython>',
                '</execute_bash>',
                '</execute_browse>',
-                '</execute_edit>',
            ],
+            'temperature': 0.0,
        }

        if self.llm.is_caching_prompt_active():
@@ -221,11 +208,8 @@ class CodeActAgent(Agent):
                'anthropic-beta': 'prompt-caching-2024-07-31',
            }

-        # TODO: move exception handling to agent_controller
        try:
            response = self.llm.completion(**params)
-        except OperationCancelled as e:
-            raise e
        except Exception as e:
            logger.error(f'{e}')
            error_message = '{}: {}'.format(type(e).__name__, str(e).split('\n')[0])
@@ -19,44 +19,22 @@ the assistant should retry running the command in the background.
 The assistant can browse the Internet with <execute_browse> and </execute_browse>.
 For example, <execute_browse> Tell me the usa's president using google search </execute_browse>.
 Or <execute_browse> Tell me what is in http://example.com </execute_browse>.
-{% endset %}
-{% set EDIT_DIFF_PREFIX %}
-The assistant can edit files with <execute_edit> and </execute_edit>. Each change must be described with a SEARCH/REPLACE block.
-Every SEARCH section must EXACTLY MATCH the existing file content, character for character, including all comments, docstrings, etc. SEARCH/REPLACE blocks will replace all matching occurrences. Include enough lines to make the SEARCH blocks uniquely match the lines to change.
-Keep SEARCH/REPLACE blocks as concise as possible. Break large SEARCH/REPLACE blocks into a series of smaller blocks that each change a small portion of the file.
-To move code within a file, use 2 SEARCH/REPLACE blocks: 1 to delete it from its current location, 1 to insert it in the new location.
-If you want to put code in a new file, use a SEARCH/REPLACE block with: a new file path, an empty `SEARCH` section and the new file's contents in the `REPLACE` section.
-
-Every SEARCH/REPLACE block must use this format:
-1. The FULL file path alone on a line, verbatim. No bold asterisks, no quotes around it, no escaping of characters, etc.
-2. The start of search block: <<<<<<< SEARCH
-3. A contiguous chunk of lines to search for in the existing source code
-4. The dividing line: =======
-5. The lines to replace into the source code
-6. The end of the replace block: >>>>>>> REPLACE
-
-For example,
-<execute_edit>
-demo.py
-<<<<<<< SEARCH
-    print("hello")
-=======
-    print("goodbye")
->>>>>>> REPLACE
-</execute_edit>
-
 {% endset %}
 {% set PIP_INSTALL_PREFIX %}
 The assistant can install Python packages using the %pip magic command in an IPython environment by using the following syntax: <execute_ipython> %pip install [package needed] </execute_ipython> and should always import packages and define variables before starting to use them.
 {% endset %}
-{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + EDIT_DIFF_PREFIX + PIP_INSTALL_PREFIX %}
+{% set SYSTEM_PREFIX = MINIMAL_SYSTEM_PREFIX + BROWSING_PREFIX + PIP_INSTALL_PREFIX %}
 {% set COMMAND_DOCS %}
 Apart from the standard Python library, the assistant can also use the following functions (already imported) in <execute_ipython> environment:
 {{ agent_skills_docs }}
 IMPORTANT:
 - `open_file` only returns the first 100 lines of the file by default! The assistant MUST use `scroll_down` repeatedly to read the full file BEFORE making edits!
+- The assistant shall adhere to THE `edit_file_by_replace`, `append_file` and `insert_content_at_line` FUNCTIONS REQUIRING PROPER INDENTATION. If the assistant would like to add the line '        print(x)', it must fully write the line out, with all leading spaces before the code!
 - Indentation is important and code that is not indented correctly will fail and require fixing before it can be run.
 - Any code issued should be less than 50 lines to avoid context being cut off!
+- After EVERY `create_file` the method `append_file` shall be used to write the FIRST content!
+- For `edit_file_by_replace` NEVER provide empty parameters!
+- For `edit_file_by_replace` the file must be read fully before any replacements!
 {% endset %}
 {% set SYSTEM_SUFFIX %}
 Responses should be concise.
@@ -4,12 +4,23 @@
 USER: Create a list of numbers from 1 to 10, and display them in a web page at port 5000.

 ASSISTANT:
-Sure! Let me write the Python code for starting a web server and save it to a new Python file `app.py`:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-=======
-from flask import Flask
+Sure! Let me create a Python file `app.py`:
+<execute_ipython>
+create_file('app.py')
+</execute_ipython>
+
+USER:
+OBSERVATION:
+[File: /workspace/app.py (1 lines total)]
+(this is the beginning of the file)
+1|
+(this is the end of the file)
+[File app.py created.]
+
+ASSISTANT:
+Now I will write the Python code for starting a web server and save it to the file `app.py`:
+<execute_ipython>
+EDITED_CODE="""from flask import Flask
 app = Flask(__name__)

@app.route('/')
@@ -18,17 +29,30 @@ def index():
    return str(numbers)

 if __name__ == '__main__':
-    app.run(port=5000)
->>>>>>> REPLACE
-</execute_edit>
+    app.run(port=5000)"""

+insert_content_at_line(
+  'app.py',
+  1,
+  EDITED_CODE,
+)
+</execute_ipython>

 USER:
 OBSERVATION:
-I updated the file app.py by
-appending:
-from flask import Flask
-app = Flask(__name__)
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return str(numbers)
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 1). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 I have created a Python file `app.py` that will display a list of numbers from 1 to 10 when you run it. Let me run the Python file for you:
@@ -153,22 +177,30 @@ USER:

 ASSISTANT:
 I should edit the file to display the numbers in a table format. I should include correct indentation. Let me update the file:
-<execute_edit>
-app.py
-<<<<<<< SEARCH
-    return str(numbers)
-=======
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
->>>>>>> REPLACE
-</execute_edit>
+<execute_ipython>
+edit_file_by_replace(
+  'app.py',
+  to_replace="    return str(numbers)",
+  new_content="    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'",
+)
+</execute_ipython>

 USER:
 Observation:
-I updated the file app.py by
-replacing:
-    return str(numbers)
-with:
-    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+[File: /workspace/app.py (10 lines total after edit)]
+(this is the beginning of the file)
+1|from flask import Flask
+2|app = Flask(__name__)
+3|
+4|@app.route('/')
+5|def index():
+6|    numbers = list(range(1, 11))
+7|    return '<table>' + ''.join([f'<tr><td>{i}</td></tr>' for i in numbers]) + '</table>'
+8|
+9|if __name__ == '__main__':
+10|    app.run(port=5000)
+(this is the end of the file)
+[File updated (edited at line 7). Please review the changes and make sure they are correct (correct indentation, no duplicate lines, etc). Edit the file again if necessary.]

 ASSISTANT:
 Running the updated file:
@@ -1,22 +0,0 @@
-#
-services:
-  openhands:
-    build:
-      context: ./
-      dockerfile: ./containers/app/Dockerfile
-    image: openhands:latest
-    container_name: openhands-app-${DATE:-}
-    environment:
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.9-nikolaik}
-      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
-      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
-    ports:
-      - "3000:3000"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
-    pull_policy: build
-    stdin_open: true
-    tty: true
@@ -159,17 +159,17 @@ model = "gpt-4o"
 #timeout = 0

 # Top p for the API
-#top_p = 1.0
+#top_p = 0.5

 # If model is vision capable, this option allows to disable image processing (useful for cost reduction).
 #disable_vision = true

-[llm.gpt4o-mini]
+[llm.gpt3]
 # API key to use
 api_key = "your-api-key"

 # Model to use
-model = "gpt-4o-mini"
+model = "gpt-3.5"

 #################################### Agent ###################################
 # Configuration for agents (group name starts with 'agent')
@@ -1,124 +0,0 @@
-# syntax=docker/dockerfile:1
-
-###
-FROM ubuntu:22.04 AS dind
-
-# https://docs.docker.com/engine/install/ubuntu/
-RUN apt-get update && apt-get install -y \
-	ca-certificates \
-	curl \
-	&& install -m 0755 -d /etc/apt/keyrings \
-	&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
-	&& chmod a+r /etc/apt/keyrings/docker.asc \
-	&& echo \
-		"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
-		$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
-
-RUN apt-get update && apt-get install -y \
-	docker-ce \
-	docker-ce-cli \
-	containerd.io \
-	docker-buildx-plugin \
-	docker-compose-plugin \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-###
-FROM dind AS openhands
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-#
-RUN apt-get update && apt-get install -y \
-	bash \
-    build-essential \
-    curl \
-	git \
-	git-lfs \
-	software-properties-common \
-	make \
-    netcat \
-    sudo \
-	wget \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-# https://github.com/cli/cli/blob/trunk/docs/install_linux.md
-RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
-	&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
-	&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
-	&& apt-get update && apt-get -y install \
-    gh \
-  && rm -rf /var/lib/apt/lists/* \
-  && apt-get clean \
-  && apt-get autoremove -y
-
-# Python 3.11
-RUN add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get update \
-    && apt-get install -y python3.11 python3.11-venv python3.11-dev python3-pip \
-    && ln -s /usr/bin/python3.11 /usr/bin/python
-
-# NodeJS >= 18.17.1
-RUN curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
-    && apt-get install -y nodejs
-
-# Poetry >= 1.8
-RUN curl -fsSL https://install.python-poetry.org | python3.11 - \
-    && ln -s ~/.local/bin/poetry /usr/local/bin/poetry
-
-#
-RUN <<EOF
-#!/bin/bash
-printf "#!/bin/bash
-set +x
-uname -a
-docker --version
-gh --version | head -n 1
-git --version
-#
-python --version
-echo node `node --version`
-echo npm `npm --version`
-poetry --version
-netcat -h 2>&1 | head -n 1
-" > /version.sh
-chmod a+x /version.sh
-EOF
-
-###
-FROM openhands AS dev
-
-RUN apt-get update && apt-get install -y \
-	dnsutils \
-	file \
-	iproute2 \
-	jq \
-	lsof \
-	ripgrep \
-	silversearcher-ag \
-	vim \
-	&& rm -rf /var/lib/apt/lists/* \
-	&& apt-get clean \
-	&& apt-get autoremove -y
-
-WORKDIR /app
-
-# cache build dependencies
-RUN \
-  --mount=type=bind,source=./,target=/app/ \
-  <<EOF
-#!/bin/bash
-make -s clean
-make -s check-dependencies
-make -s install-python-dependencies
-
-# NOTE
-# node_modules are .dockerignore-d therefore not mountable
-# make -s install-frontend-dependencies
-EOF
-
-#
-CMD ["bash"]
@@ -1,54 +0,0 @@
-# Develop in Docker
-
-Install [Docker](https://docs.docker.com/engine/install/) on your host machine and run:
-
-```bash
-make docker-dev
-# same as:
-cd ./containers/dev
-./dev.sh
-```
-
-It could take some time if you are running for the first time as Docker will pull all the  tools required for building OpenHands. The next time you run again, it should be instant.
-
-## Build and run
-
-If everything goes well, you should be inside a container after Docker finishes building the `openhands:dev` image similar to the following:
-
-```bash
-Build and run in Docker ...
-root@93fc0005fcd2:/app#
-```
-
-You may now proceed with the normal [build and run](../../Development.md) workflow as if you were on the host.
-
-## Make changes
-
-The source code on the host is mounted as `/app` inside docker. You may edit the files as usual either inside the Docker container or on your host with your favorite IDE/editors.
-
-The following are also mapped as readonly from your host:
-
-```yaml
-# host credentials
- $HOME/.git-credentials:/root/.git-credentials:ro
- $HOME/.gitconfig:/root/.gitconfig:ro
- $HOME/.npmrc:/root/.npmrc:ro
-```
-
-## VSCode
-
-Alternatively, if you use VSCode, you could also [attach to the running container](https://code.visualstudio.com/docs/devcontainers/attach-container).
-
-See details for [developing in docker](https://code.visualstudio.com/docs/devcontainers/containers) or simply ask `OpenHands` ;-)
-
-## Rebuild dev image
-
-You could optionally pass additional options to the build script.
-
-```bash
-make docker-dev OPTIONS="--build"
-# or
-./containers/dev/dev.sh --build
-```
-
-See [docker compose run](https://docs.docker.com/reference/cli/docker/compose/run/) for more options.
@@ -1,38 +0,0 @@
-#
-services:
-  dev:
-    privileged: true
-    build:
-      context: ${OPENHANDS_WORKSPACE:-../../}
-      dockerfile: ./containers/dev/Dockerfile
-    image: openhands:dev
-    container_name: openhands-dev
-    environment:
-      - BACKEND_HOST=${BACKEND_HOST:-"0.0.0.0"}
-      - SANDBOX_API_HOSTNAME=host.docker.internal
-      #
-      - SANDBOX_RUNTIME_CONTAINER_IMAGE=${SANDBOX_RUNTIME_CONTAINER_IMAGE:-ghcr.io/all-hands-ai/runtime:0.9-nikolaik}
-      - SANDBOX_USER_ID=${SANDBOX_USER_ID:-1234}
-      - WORKSPACE_MOUNT_PATH=${WORKSPACE_BASE:-$PWD/workspace}
-    ports:
-      - "3000:3000"
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock
-      - ${WORKSPACE_BASE:-$PWD/workspace}:/opt/workspace_base
-      # source code
-      - ${OPENHANDS_WORKSPACE:-../../}:/app
-      # host credentials
-      - $HOME/.git-credentials:/root/.git-credentials:ro
-      - $HOME/.gitconfig:/root/.gitconfig:ro
-      - $HOME/.npmrc:/root/.npmrc:ro
-      # cache
-      - cache-data:/root/.cache
-    pull_policy: never
-    stdin_open: true
-    tty: true
-
-##
-volumes:
-  cache-data:
@@ -1,39 +0,0 @@
-#!/bin/bash
-set -o pipefail
-
-function get_docker() {
-    echo "Docker is required to build and run OpenHands."
-    echo "https://docs.docker.com/get-started/get-docker/"
-    exit 1
-}
-
-function check_tools() {
-	command -v docker &>/dev/null || get_docker
-}
-
-function exit_if_indocker() {
-    if [ -f /.dockerenv ]; then
-        echo "Running inside a Docker container. Exiting..."
-        exit 1
-    fi
-}
-
-#
-exit_if_indocker
-
-check_tools
-
-##
-OPENHANDS_WORKSPACE=$(git rev-parse --show-toplevel)
-
-cd "$OPENHANDS_WORKSPACE/containers/dev/" || exit 1
-
-##
-export BACKEND_HOST="0.0.0.0"
-#
-export SANDBOX_USER_ID=$(id -u)
-export WORKSPACE_BASE=${WORKSPACE_BASE:-$OPENHANDS_WORKSPACE/workspace}
-
-docker compose run --rm --service-ports "$@" dev
-
-##
@@ -1,12 +1,11 @@
-# Dynamically constructed Dockerfile
+# Dynamically constructed Dockerfile

-This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile`
-that depends on the `base_image` **AND** a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that is based on the current commit of `openhands`.
+This folder builds a runtime image (sandbox), which will use a dynamically generated `Dockerfile` that depends on the `base_image` AND a [Python source distribution](https://docs.python.org/3.10/distutils/sourcedist.html) that's based on the current commit of `openhands`.

-The following command will generate a `Dockerfile` file for `nikolaik/python-nodejs:python3.11-nodejs22` (the default base image), an updated `config.sh` and the runtime source distribution files/folders into `containers/runtime`:
+The following command will generate a `Dockerfile` for `ubuntu:22.04` and the source distribution `.tar` into `containers/runtime`.

 ```bash
 poetry run python3 openhands/runtime/utils/runtime_build.py \
-    --base_image nikolaik/python-nodejs:python3.11-nodejs22 \
+    --base_image ubuntu:22.04 \
    --build_folder containers/runtime
 ```
@@ -59,6 +59,10 @@ Félicitations !

 ## Explication technique

+Le code pertinent est défini dans [ssh_box.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/ssh_box.py) et [image_agnostic_util.py](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py).
+
+En particulier, ssh_box.py vérifie l'objet config pour ```config.sandbox.base_container_image``` et ensuite tente de récupérer l'image à l'aide de [get_od_sandbox_image](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L72), qui est défini dans image_agnostic_util.py.
+
 Lorsqu'une image personnalisée est utilisée pour la première fois, elle ne sera pas trouvée et donc elle sera construite (à l'exécution ultérieure, l'image construite sera trouvée et renvoyée).

 L'image personnalisée est construite avec [_build_sandbox_image()](https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/runtime/docker/image_agnostic_util.py#L29), qui crée un fichier docker en utilisant votre image personnalisée comme base et configure ensuite l'environnement pour OpenHands, comme ceci:
@@ -47,8 +47,8 @@ graph TD
 ```

 1. User Input: The user provides a custom base Docker image
-2. Image Building: OpenHands builds a new Docker image (the "OH runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
-3. Container Launch: When OpenHands starts, it launches a Docker container using the OH runtime image
+2. Image Building: OpenHands builds a new Docker image (the "OD runtime image") based on the user-provided image. This new image includes OpenHands-specific code, primarily the "runtime client"
+3. Container Launch: When OpenHands starts, it launches a Docker container using the OD runtime image
 4. Client Initialization: The runtime client initializes inside the container, setting up necessary components like a bash shell and loading any specified plugins
 5. Communication: The OpenHands backend (`runtime.py`) communicates with the runtime client over RESTful API, sending actions and receiving observations
 6. Action Execution: The runtime client receives actions from the backend, executes them in the sandboxed environment, and sends back observations
@@ -62,7 +62,7 @@ The role of the client:
 - It formats and returns observations to the backend, ensuring a consistent interface for processing results


-## How OpenHands builds and maintains OH Runtime images
+## How OpenHands builds and maintains OD Runtime images

 OpenHands' approach to building and managing runtime images ensures efficiency, consistency, and flexibility in creating and maintaining Docker images for both production and development environments.

@@ -80,9 +80,9 @@ OpenHands uses a dual-tagging system for its runtime images to balance reproduci
   - This ensures reproducibility; the same hash always means the same image contents

 2. Generic tag: `{target_image_repo}:{target_image_tag}`.
-   Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+   Example: `runtime:od_v0.8.3_ubuntu_tag_22.04`

-   - This tag follows the format: `runtime:oh_v{OH_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
+   - This tag follows the format: `runtime:od_v{OD_VERSION}_{BASE_IMAGE_NAME}_tag_{BASE_IMAGE_TAG}`
   - It represents the latest build for a particular base image and OpenHands version combination
   - This tag is updated whenever a new image is built from the same base image, even if the source code changes

@@ -94,11 +94,11 @@ The hash-based tag ensures reproducibility, while the generic tag provides a sta
   - Hash-based tag: `{target_image_repo}:{target_image_hash_tag}`.
     Example: `runtime:abc123def456`
   - Generic tag: `{target_image_repo}:{target_image_tag}`.
-     Example: `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+     Example: `runtime:od_v0.8.3_ubuntu_tag_22.04`

 2. Build Process:
-   - a. Convert the base image name to an OH runtime image name
-      Example: `ubuntu:22.04` -> `runtime:oh_v0.9.3_ubuntu_tag_22.04`
+   - a. Convert the base image name to an OD runtime image name
+      Example: `ubuntu:22.04` -> `runtime:od_v0.8.3_ubuntu_tag_22.04`
   - b. Generate a build context (Dockerfile and OpenHands source code) and calculate its hash
   - c. Check for an existing image with the calculated hash
   - d. If not found, check for a recent compatible image to use as a base
@@ -108,7 +108,7 @@ The hash-based tag ensures reproducibility, while the generic tag provides a sta
 3. Image Reuse and Rebuilding Logic:
   The system follows these steps to determine whether to build a new image or use an existing one from a user-provided (base) image (e.g., `ubuntu:22.04`):
   - a. If an image exists with the same hash (e.g., `runtime:abc123def456`), it will be reused as is
-   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
+   - b. If the exact hash is not found, the system will try to rebuild using the latest generic image (e.g., `runtime:od_v0.8.3_ubuntu_tag_22.04`) as a base. This saves time by leveraging existing dependencies
   - c. If neither the hash-tagged nor the generic-tagged image is found, the system will build the image completely from scratch

 4. Caching and Efficiency:
@@ -121,10 +121,10 @@ Here's a flowchart illustrating the build process:
 ```mermaid
 flowchart TD
    A[Start] --> B{Convert base image name}
-    B --> |ubuntu:22.04 -> runtime:oh_v0.9.3_ubuntu_tag_22.04| C[Generate build context and hash]
+    B --> |ubuntu:22.04 -> runtime:od_v0.8.3_ubuntu_tag_22.04| C[Generate build context and hash]
    C --> D{Check for existing image with hash}
    D -->|Found runtime:abc123def456| E[Use existing image]
-    D -->|Not found| F{Check for runtime:oh_v0.9.3_ubuntu_tag_22.04}
+    D -->|Not found| F{Check for runtime:od_v0.8.3_ubuntu_tag_22.04}
    F -->|Found| G[Rebuild based on recent image]
    F -->|Not found| H[Build from scratch]
    G --> I[Tag with hash and generic tags]
@@ -137,13 +137,13 @@ This approach ensures that:

 1. Identical source code and Dockerfile always produce the same image (via hash-based tags)
 2. The system can quickly rebuild images when minor changes occur (by leveraging recent compatible images)
-3. The generic tag (e.g., `runtime:oh_v0.9.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination
+3. The generic tag (e.g., `runtime:od_v0.8.3_ubuntu_tag_22.04`) always points to the latest build for a particular base image and OpenHands version combination

 ## Runtime Plugin System

 The OpenHands Runtime supports a plugin system that allows for extending functionality and customizing the runtime environment. Plugins are initialized when the runtime client starts up.

-Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/ecf4aed28b0cf7c18d4d8ff554883ba182fc6bdd/openhands/runtime/plugins/jupyter/__init__.py#L21-L55) if you want to implement your own plugin.
+Check [an example of Jupyter plugin here](https://github.com/All-Hands-AI/OpenHands/blob/9c44d94cef32e6426ebd8deeeb52963153b2348a/openhands/runtime/plugins/jupyter/__init__.py#L30-L63) if you want to implement your own plugin.

 *More details about the Plugin system are still under construction - contributions are welcomed!*

@@ -40,17 +40,12 @@ After running the command above, you'll find OpenHands running at [http://localh
 The agent will have access to the `./workspace` folder to do its work. You can copy existing code here, or change `WORKSPACE_BASE` in the
 command to point to an existing folder.

-Upon launching OpenHands, you'll see a settings modal. You **must** select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
+Upon launching OpenHands, you'll see a settings modal. You must select an `LLM Provider` and `LLM Model` and enter a corresponding `API Key`.
 These can be changed at any time by selecting the `Settings` button (gear icon) in the UI.
-
-If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it with the correct prefix
-in the `Custom Model` text box.
+If the required `LLM Model` does not exist in the list, you can toggle `Advanced Options` and manually enter it in the `Custom Model` text box.
 The `Advanced Options` also allow you to specify a `Base URL` if required.

-<div style={{ display: 'flex', justifyContent: 'center', gap: '20px' }}>
-  <img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />
-  <img src="/img/settings-advanced.png" alt="settings-modal" width="335" />
-</div>
+<img src="/img/settings-screenshot.png" alt="settings-modal" width="340" />

 ## Versions

@@ -1,6 +1,6 @@
-# Azure
+# Azure OpenAI LLM

-OpenHands uses LiteLLM to make calls to Azure's chat models. You can find their documentation on using Azure as a provider [here](https://docs.litellm.ai/docs/providers/azure).
+OpenHands uses LiteLLM for completion calls. You can find their documentation on Azure [here](https://docs.litellm.ai/docs/providers/azure).

 ## Azure OpenAI Configuration

@@ -27,8 +27,8 @@ You will need your ChatGPT deployment name which can be found on the deployments

 * Enable `Advanced Options`
 * `Custom Model` to azure/&lt;deployment-name&gt;
-* `Base URL` to your Azure API Base URL (e.g. `https://example-endpoint.openai.azure.com`)
-* `API Key` to your Azure API key
+* `Base URL` to your Azure API Base URL (Example: https://example-endpoint.openai.azure.com)
+* `API Key`

 ## Embeddings

@@ -1,6 +1,6 @@
-# Google Gemini/Vertex
+# Google Gemini/Vertex LLM

-OpenHands uses LiteLLM to make calls to Google's chat models. You can find their documentation on using Google as a provider:
+OpenHands uses LiteLLM for completion calls. The following resources are relevant for using OpenHands with Google's LLMs:

 - [Gemini - Google AI Studio](https://docs.litellm.ai/docs/providers/gemini)
 - [VertexAI - Google Cloud Platform](https://docs.litellm.ai/docs/providers/vertex)
@@ -10,8 +10,8 @@ OpenHands uses LiteLLM to make calls to Google's chat models. You can find their
 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Gemini`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. gemini/&lt;model-name&gt; like `gemini/gemini-1.5-pro`).
-* `API Key` to your Gemini API key
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. gemini/&lt;model-name&gt;).
+* `API Key`

 ## VertexAI - Google Cloud Platform Configs

@@ -27,4 +27,4 @@ VERTEXAI_LOCATION="<your-gcp-location>"
 Then set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `VertexAI`
 * `LLM Model` to the model you will be using.
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. vertex_ai/&lt;model-name&gt;).
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. vertex_ai/&lt;model-name&gt;).
@@ -1,23 +1,19 @@
-# Groq
+# Running LLMs on Groq

-OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).
+OpenHands uses LiteLLM to make calls to chat models on Groq. You can find their full documentation on using Groq as a provider [here](https://docs.litellm.ai/docs/providers/groq).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `Groq`
-* `LLM Model` to the model you will be using. [Visit here to see the list of
-models that Groq hosts](https://console.groq.com/docs/models). If the model is not in the list, toggle
-`Advanced Options`, and enter it in `Custom Model` (e.g. groq/&lt;model-name&gt; like `groq/llama3-70b-8192`)
-* `API key` to your Groq API key. To find or create your Groq API Key, [see here](https://console.groq.com/keys)
-
+* `LLM Model` to the model you will be using
+* `API key` to your Groq API key. To find or create your Groq API Key, [see **here**](https://console.groq.com/keys).

+Visit [here](https://console.groq.com/docs/models) to see the list of models that Groq hosts.

 ## Using Groq as an OpenAI-Compatible Endpoint

-The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, you can access Groq models as you
-would access any OpenAI-compatible endpoint. You can set the following in the OpenHands UI through the Settings:
-* Enable `Advanced Options`
-* `Custom Model` to the prefix `openai/` + the model you will be using (e.g. `openai/llama3-70b-8192`)
-* `Base URL` to `https://api.groq.com/openai/v1`
+The Groq endpoint for chat completion is [mostly OpenAI-compatible](https://console.groq.com/docs/openai). Therefore, if you wish, you can access Groq models as you would access any OpenAI-compatible endpoint. You can toggle `Advanced Options` and set the following:
+* `Custom Model` to the prefix `openai/` + the model you will be using, e.g. `openai/llama3-8b-8192`
 * `API Key` to your Groq API key
+* `Base URL` to `https://api.groq.com/openai/v1`
@@ -53,9 +53,8 @@ We have a few guides for running OpenHands with specific model providers:

 * [Azure](llms/azure-llms)
 * [Google](llms/google-llms)
-* [Groq](llms/groq)
+* [ollama](llms/local-llms)
 * [OpenAI](llms/openai-llms)
-* [OpenRouter](llms/openrouter)

 ### API retries and rate limits

@@ -28,14 +28,17 @@ mistral:7b-instruct-v0.2-q4_K_M eb14864c7427    4.4 GB  2 weeks ago
 starcoder2:latest               f67ae0f64584    1.7 GB  19 hours ago
 ```

-## Run OpenHands with Docker
+## Start OpenHands
+
+### Docker

-### Start OpenHands
 Use the instructions [here](../getting-started) to start OpenHands using Docker.
 But when running `docker run`, you'll need to add a few more arguments:

 ```bash
 --add-host host.docker.internal:host-gateway \
+-e LLM_API_KEY="ollama" \
+-e LLM_BASE_URL="http://host.docker.internal:11434" \
 -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
 ```

@@ -52,6 +55,8 @@ docker run \
    --pull=always \
    --add-host host.docker.internal:host-gateway \
    -e SANDBOX_USER_ID=$(id -u) \
+    -e LLM_API_KEY="ollama" \
+    -e LLM_BASE_URL="http://host.docker.internal:11434" \
    -e LLM_OLLAMA_BASE_URL="http://host.docker.internal:11434" \
    -e WORKSPACE_MOUNT_PATH=$WORKSPACE_BASE \
    -v $WORKSPACE_BASE:/opt/workspace_base \
@@ -62,16 +67,6 @@ docker run \

 You should now be able to connect to `http://localhost:3000/`

-### Configure the Web Application
-
-When running `openhands`, you'll need to set the following in the OpenHands UI through the Settings:
- the model to "ollama/&lt;model-name&gt;"
- the base url to `http://host.docker.internal:11434`
- the API key is optional, you can use any string, such as `ollama`.
-
-
-## Run OpenHands in Development Mode
-
 ### Build from Source

 Use the instructions in [Development.md](https://github.com/All-Hands-AI/OpenHands/blob/main/Development.md) to build OpenHands.
@@ -82,22 +77,23 @@ Make sure `config.toml` is there by running `make setup-config` which will creat
 workspace_base="./workspace"

 [llm]
+model="ollama/codellama:7b"
+api_key="ollama"
 embedding_model="local"
+base_url="http://localhost:11434"
 ollama_base_url="http://localhost:11434"

 ```

-Done! Now you can start OpenHands by: `make run`. You now should be able to connect to `http://localhost:3000/`
+Replace the `model` value with the model of your choice if you need to.

-### Configure the Web Application
+Done! Now you can start OpenHands by: `make run` without Docker. You now should be able to connect to `http://localhost:3000/`
+
+## Select your Model

 In the OpenHands UI, click on the Settings wheel in the bottom-left corner.
 Then in the `Model` input, enter `ollama/codellama:7b`, or the name of the model you pulled earlier.
-If it doesn’t show up in the dropdown, enable `Advanced Settings` and type it in. Please note: you need the model name as listed by `ollama list`, with the prefix `ollama/`.
-
-In the API Key field, enter `ollama` or any value, since you don't need a particular key.
-
-In the Base URL field, enter `http://localhost:11434`.
+If it doesn’t show up in a dropdown, that’s fine, just type it in. Click Save when you’re done.

 And now you're ready to go!

@@ -1,15 +1,15 @@
 # OpenAI

-OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their documentation on using OpenAI as a provider [here](https://docs.litellm.ai/docs/providers/openai).
+OpenHands uses LiteLLM to make calls to OpenAI's chat models. You can find their full documentation on OpenAI chat calls [here](https://docs.litellm.ai/docs/providers/openai).

 ## Configuration

 When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
 * `LLM Provider` to `OpenAI`
 * `LLM Model` to the model you will be using.
-[Visit here to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openai/&lt;model-name&gt; like `openai/gpt-4o`).
-* `API Key` to your OpenAI API key. To find or create your OpenAI Project API Key, [see here](https://platform.openai.com/api-keys).
+[Visit **here** to see a full list of OpenAI models that LiteLLM supports.](https://docs.litellm.ai/docs/providers/openai#openai-chat-completion-models)
+If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (i.e. openai/&lt;model-name&gt;).
+* `API Key`. To find or create your OpenAI Project API Key, [see **here**](https://platform.openai.com/api-keys).

 ## Using OpenAI-Compatible Endpoints

@@ -17,8 +17,7 @@ Just as for OpenAI Chat completions, we use LiteLLM for OpenAI-compatible endpoi

 ## Using an OpenAI Proxy

-If you're using an OpenAI proxy, you'll need to set the following in the OpenHands UI through the Settings:
-* Enable `Advanced Options`
-* `Custom Model` to openai/&lt;model-name&gt; (e.g. `openai/gpt-4o` or openai/&lt;proxy-prefix&gt;/&lt;model-name&gt;)
-* `Base URL` to the URL of your OpenAI proxy
-* `API Key` to your OpenAI API key
+If you're using an OpenAI proxy, you'll need to toggle `Advanced Options` in the OpenHands Settings, and set the following:
+* `Custom Model` to the model you will be using, e.g. `openai/gpt-4o` or `openai/&lt;proxy_prefix&gt;/&lt;model_name&gt;`
+* `API Key` to your API key.
+* `Base URL` to the URL of your OpenAI proxy.
@@ -1,12 +0,0 @@
-# OpenRouter
-
-OpenHands uses LiteLLM to make calls to chat models on OpenRouter. You can find their documentation on using OpenRouter as a provider [here](https://docs.litellm.ai/docs/providers/openrouter).
-
-## Configuration
-
-When running OpenHands, you'll need to set the following in the OpenHands UI through the Settings:
-* `LLM Provider` to `OpenRouter`
-* `LLM Model` to the model you will be using.
-[Visit here to see a full list of OpenRouter models](https://openrouter.ai/models).
-If the model is not in the list, toggle `Advanced Options`, and enter it in `Custom Model` (e.g. openrouter/&lt;model-name&gt; like `openrouter/anthropic/claude-3.5-sonnet`).
-* `API Key` to your OpenRouter API key.
@@ -17,6 +17,7 @@ Check out [Notes for WSL on Windows Users](troubleshooting/windows) for some tro
 ## Common Issues

 * [Unable to connect to Docker](#unable-to-connect-to-docker)
+* [Unable to connect to LLM](#unable-to-connect-to-llm)
 * [404 Resource not found](#404-resource-not-found)
 * [`make build` getting stuck on package installations](#make-build-getting-stuck-on-package-installations)
 * [Sessions are not restored](#sessions-are-not-restored)
@@ -46,6 +47,33 @@ OpenHands uses a Docker container to do its work safely, without potentially bre
 * If you are on a Mac, check the [permissions requirements](https://docs.docker.com/desktop/mac/permission-requirements/) and in particular consider enabling the `Allow the default Docker socket to be used` under `Settings > Advanced` in Docker Desktop.
 * In addition, upgrade your Docker to the latest version under `Check for Updates`

+---
+### Unable to connect to LLM
+
+[GitHub Issue](https://github.com/All-Hands-AI/OpenHands/issues/1208)
+
+**Symptoms**
+
+```python
+  File "/app/.venv/lib/python3.12/site-packages/openai/_exceptions.py", line 81, in __init__
+    super().__init__(message, response.request, body=body)
+                              ^^^^^^^^^^^^^^^^
+AttributeError: 'NoneType' object has no attribute 'request'
+```
+
+**Details**
+
+[GitHub Issues](https://github.com/All-Hands-AI/OpenHands/issues?q=is%3Aissue+is%3Aopen+404)
+
+This usually happens with *local* LLM setups, when OpenHands can't connect to the LLM server.
+See our guide for [local LLMs](llms/local-llms) for more information.
+
+**Workarounds**
+
+* Check your `base_url` in your config.toml (if it exists) under the "llm" section
+* Check that ollama (or whatever LLM you're using) is running OK
+* Make sure you're using `--add-host host.docker.internal:host-gateway` when running in Docker
+
 ---
 ### `404 Resource not found`

@@ -87,6 +115,7 @@ the API endpoint you're trying to connect to. Most often this happens for Azure
  * If you're running inside the UI, be sure to set the `model` in the settings modal
  * If you're running headless (via main.py) be sure to set `LLM_MODEL` in your env/config
 * Make sure you've followed any special instructions for your LLM provider
+  * [ollama](/modules/usage/llms/local-llms)
  * [Azure](/modules/usage/llms/azure-llms)
  * [Google](/modules/usage/llms/google-llms)
 * Make sure your API key is correct
@@ -2,112 +2,78 @@ import type { SidebarsConfig } from "@docusaurus/plugin-content-docs";

 const sidebars: SidebarsConfig = {
  apiSidebar: [require("./modules/python/sidebar.json")],
-  docsSidebar: [
-    {
+  docsSidebar: [{
+    type: 'doc',
+    label: 'Getting Started',
+    id: 'usage/getting-started',
+  }, {
+    type: 'doc',
+    label: 'Troubleshooting',
+    id: 'usage/troubleshooting/troubleshooting',
+  }, {
+    type: 'doc',
+    label: 'Feedback',
+    id: 'usage/feedback',
+  }, {
+    type: 'category',
+    label: 'How-to Guides',
+    items: [{
      type: 'doc',
-      label: 'Getting Started',
-      id: 'usage/getting-started',
-    },
-    {
-      type: 'category',
-      label: 'LLMs',
-      items: [
-        {
-          type: 'doc',
-          label: 'Overview',
-          id: 'usage/llms/llms',
-        },
-        {
-          type: 'category',
-          label: 'Providers',
-          items: [
-            {
-              type: 'doc',
-              label: 'Azure',
-              id: 'usage/llms/azure-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Google',
-              id: 'usage/llms/google-llms',
-            },
-            {
-              type: 'doc',
-              label: 'Groq',
-              id: 'usage/llms/groq',
-            },
-            {
-              type: 'doc',
-              label: 'OpenAI',
-              id: 'usage/llms/openai-llms',
-            },
-            {
-              type: 'doc',
-              label: 'OpenRouter',
-              id: 'usage/llms/openrouter',
-            },
-          ],
-        },
-      ],
-    },
-    {
+      id: 'usage/how-to/cli-mode',
+    }, {
      type: 'doc',
-      label: 'Troubleshooting',
-      id: 'usage/troubleshooting/troubleshooting',
-    },
-    {
+      id: 'usage/how-to/headless-mode',
+    }, {
      type: 'doc',
-      label: 'Feedback',
-      id: 'usage/feedback',
-    },
-    {
-      type: 'category',
-      label: 'How-to Guides',
-      items: [
-        {
-          type: 'doc',
-          id: 'usage/how-to/cli-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/headless-mode',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/custom-sandbox-guide',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/evaluation-harness',
-        },
-        {
-          type: 'doc',
-          id: 'usage/how-to/openshift-example',
-        }
-      ]
-    },
-    {
-      type: 'category',
-      label: 'Architecture',
-      items: [
-        {
-          type: 'doc',
-          label: 'Backend',
-          id: 'usage/architecture/backend',
-        },
-        {
-          type: 'doc',
-          label: 'Runtime',
-          id: 'usage/architecture/runtime',
-        }
-      ],
-    },
-    {
+      id: 'usage/how-to/custom-sandbox-guide',
+    }, {
      type: 'doc',
-      label: 'About',
-      id: 'usage/about',
-    }
-  ],
+      id: 'usage/how-to/evaluation-harness',
+    }, {
+      type: 'doc',
+      id: 'usage/how-to/openshift-example',
+    }]
+  }, {
+    type: 'category',
+    label: 'LLMs',
+    items: [{
+      type: 'doc',
+      label: 'Overview',
+      id: 'usage/llms/llms',
+    }, {
+      type: 'doc',
+      label: 'OpenAI',
+      id: 'usage/llms/openai-llms',
+    }, {
+      type: 'doc',
+      label: 'Azure',
+      id: 'usage/llms/azure-llms',
+    }, {
+      type: 'doc',
+      label: 'Google',
+      id: 'usage/llms/google-llms',
+    }, {
+      type: 'doc',
+      label: 'Local/ollama',
+      id: 'usage/llms/local-llms',
+    }],
+  }, {
+    type: 'category',
+    label: 'Architecture',
+    items: [{
+      type: 'doc',
+      label: 'Backend',
+      id: 'usage/architecture/backend',
+    }, {
+      type: 'doc',
+      label: 'Runtime',
+      id: 'usage/architecture/runtime',
+    }],
+  }, {
+    type: 'doc',
+    label: 'About',
+    id: 'usage/about',
+  }],
 };

 export default sidebars;
@@ -28,6 +28,6 @@
  --secondary-light: #ccc;
 }

-article a, .a {
+p a, .a {
  text-decoration: underline;
 }
@@ -256,7 +256,6 @@ def process_instance(
 if __name__ == '__main__':
    args = parse_arguments()
    dataset = load_dataset('RajMaheshwari/Exercism-Python')
-    dataset = dataset.shuffle(seed=42)
    aider_bench_tests = dataset['train'].to_pandas()

    llm_config = None
@@ -49,4 +49,4 @@ fact8
 		facts.Smart(Dave, True)
 		facts.Kind(Dave, True)
 	assert
-		facts.Quiet(Dave, True)
+		facts.Quiet(Dave, True)
@@ -52,7 +52,7 @@ def get_config(
            base_container_image='xingyaoww/od-eval-logic-reasoning:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
-            runtime_extra_deps='$OH_INTERPRETER_PATH -m pip install scitools-pyke',
+            runtime_extra_deps='$OD_INTERPRETER_PATH -m pip install scitools-pyke',
        ),
        # do not mount workspace
        workspace_base=None,
@@ -105,7 +105,7 @@ def get_config(
            base_container_image='xingyaoww/od-eval-mint:v1.0',
            enable_auto_lint=True,
            use_host_network=False,
-            runtime_extra_deps=f'$OH_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
+            runtime_extra_deps=f'$OD_INTERPRETER_PATH -m pip install {" ".join(MINT_DEPENDENCIES)}',
        ),
        # do not mount workspace
        workspace_base=None,
@@ -14,9 +14,9 @@ To run the tests for OpenHands project, you can use the provided test runner scr
 3. Navigate to the root directory of the project.
 4. Run the test suite using the test runner script with the required arguments:
   ```
-   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-4o
+   python evaluation/regression/run_tests.py --OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxx --model=gpt-3.5-turbo
   ```
-   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-4o`, but you can specify a different model if needed.
+   Replace `sk-xxxxxxxxxxxxxxxxxxxxxx` with your actual OpenAI API key. The default model is `gpt-3.5-turbo`, but you can specify a different model if needed.

 The test runner will discover and execute all the test cases in the `cases/` directory, and display the results of the test suite, including the status of each individual test case and the overall summary.

@@ -29,27 +29,21 @@ if __name__ == '__main__':

    if command == 'reverse':
        from commands.reverse import reverse_string
-
        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase
-
        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase
-
        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case
-
        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length
-
        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string
-
        print(scramble_string(input_string))
    else:
        print('Invalid command!')
@@ -10,27 +10,21 @@ if __name__ == '__main__':

    if command == 'reverse':
        from commands.reverse import reverse_string
-
        print(reverse_string(input_string))
    elif command == 'uppercase':
        from commands.uppercase import to_uppercase
-
        print(to_uppercase(input_string))
    elif command == 'lowercase':
        from commands.lowercase import to_lowercase
-
        print(to_lowercase(input_string))
    elif command == 'spongebob':
        from commands.spongebob import spongebob_case
-
        print(spongebob_case(input_string))
    elif command == 'length':
        from commands.length import string_length
-
        print(string_length(input_string))
    elif command == 'scramble':
        from commands.scramble import scramble_string
-
        print(scramble_string(input_string))
    else:
        print('Invalid command!')
@@ -24,7 +24,7 @@ This is now the default behavior.

 Make sure your Docker daemon is running, and you have ample disk space (at least 200-500GB, depends on the SWE-Bench set you are running on) for the [instance-level docker image](#openhands-swe-bench-instance-level-docker-support).

-When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-build docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used create an OpenHands runtime image where the agent will operate on.
+When the `run_infer.sh` script is started, it will automatically pull the relevant SWE-Bench images. For example, for instance ID `django_django-11011`, it will try to pull our pre-built docker image `sweb.eval.x86_64.django_s_django-11011` from DockerHub. This image will be used to create an OpenHands runtime image where the agent will operate on.

 ```bash
 ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
@@ -63,7 +63,7 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```

-### Run Inference on `RemoteRuntime` (experimental)
+### Run Inference on `RemoteRuntime`

 This is in limited beta. Contact Xingyao over slack if you want to try this out!

@@ -157,23 +157,6 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs

-### Run evaluation with `RemoteRuntime` (experimental)
-
-This is in limited beta. Contact Xingyao over slack if you want to try this out!
-
-```bash
-# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
-# This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
-```
-
-To clean-up all existing runtimes that you've already started, run:
-
-```bash
-ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
-```
-
-
 ## Visualize Results

 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
@@ -196,7 +179,7 @@ Then, in a separate Python environment with `streamlit` library, you can run the
 ```bash
 # Make sure you are inside the cloned `evaluation` repo
 conda activate streamlit # if you follow the optional conda env setup above
-streamlit app.py --server.port 8501 --server.address 0.0.0.0
+streamlit run 0_📊_OpenHands_Benchmark.py --server.port 8501 --server.address 0.0.0.0
 ```

 Then you can access the SWE-Bench trajectory visualizer at `localhost:8501`.
@@ -1,376 +0,0 @@
-import os
-import tempfile
-import time
-
-import pandas as pd
-from swebench.harness.grading import get_eval_report
-from swebench.harness.run_evaluation import (
-    APPLY_PATCH_FAIL,
-    APPLY_PATCH_PASS,
-)
-from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
-from swebench.harness.utils import load_swebench_dataset
-
-from evaluation.swe_bench.run_infer import get_instance_docker_image
-from evaluation.utils.shared import (
-    EvalMetadata,
-    EvalOutput,
-    prepare_dataset,
-    reset_logger_for_multiprocessing,
-    run_evaluation,
-)
-from openhands.core.config import (
-    AppConfig,
-    SandboxConfig,
-    get_parser,
-)
-from openhands.core.logger import openhands_logger as logger
-from openhands.core.main import create_runtime
-from openhands.events.action import CmdRunAction
-from openhands.events.observation import CmdOutputObservation
-
-# TODO: migrate all swe-bench docker to ghcr.io/openhands
-DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
-logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
-
-
-def process_git_patch(patch):
-    if not isinstance(patch, str):
-        return ''
-
-    if not patch.strip():
-        # skip empty patches
-        return ''
-
-    patch = patch.replace('\r\n', '\n')
-    # There might be some weird characters at the beginning of the patch
-    # due to some OpenHands inference command outputs
-
-    # FOR EXAMPLE:
-    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
-    # [A[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[C[K0
-    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
-    # new file mode 100644
-    # index 0000000000..fc13db5948
-
-    # We "find" the first line that starts with "diff" and then we remove lines before it
-    lines = patch.split('\n')
-    for i, line in enumerate(lines):
-        if line.startswith('diff --git'):
-            patch = '\n'.join(lines[i:])
-            break
-
-    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
-    return patch
-
-
-def get_config(instance: pd.Series) -> AppConfig:
-    # We use a different instance image for the each instance of swe-bench eval
-    base_container_image = get_instance_docker_image(instance['instance_id'])
-    logger.info(
-        f'Using instance container image: {base_container_image}. '
-        f'Please make sure this image exists. '
-        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
-    )
-    config = AppConfig(
-        run_as_openhands=False,
-        runtime=os.environ.get('RUNTIME', 'eventstream'),
-        sandbox=SandboxConfig(
-            base_container_image=base_container_image,
-            use_host_network=False,
-            # large enough timeout, since some testcases take very long to run
-            timeout=1800,
-            api_key=os.environ.get('ALLHANDS_API_KEY', None),
-        ),
-        # do not mount workspace
-        workspace_base=None,
-        workspace_mount_path=None,
-    )
-    return config
-
-
-def process_instance(
-    instance: pd.Series,
-    metadata: EvalMetadata | None = None,
-    reset_logger: bool = True,
-) -> EvalOutput:
-    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
-    if reset_logger:
-        global output_file
-        log_dir = output_file.replace('.jsonl', '.logs')
-        os.makedirs(log_dir, exist_ok=True)
-        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
-    else:
-        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
-
-    config = get_config(instance)
-    instance_id = instance.instance_id
-    model_patch = instance['model_patch']
-    test_spec: TestSpec = instance['test_spec']
-    logger.info(f'Starting evaluation for instance {instance_id}.')
-
-    if 'test_result' not in instance.keys():
-        instance['test_result'] = {}
-    instance['test_result']['report'] = {
-        'empty_generation': False,
-        'resolved': False,
-        'failed_apply_patch': False,
-        'error_eval': False,
-        'test_timeout': False,
-    }
-
-    if model_patch == '':
-        instance['test_result']['report']['empty_generation'] = True
-        return EvalOutput(
-            instance_id=instance_id,
-            test_result=instance['test_result'],
-        )
-
-    runtime = create_runtime(config, sid=instance_id)
-
-    # Get patch and save it to /tmp/patch.diff
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Patch file
-        patch_file_path = os.path.join(temp_dir, 'patch.diff')
-        with open(patch_file_path, 'w') as f:
-            f.write(model_patch)
-        runtime.copy_to(patch_file_path, '/tmp')
-        # Eval script
-        eval_script_path = os.path.join(temp_dir, 'eval.sh')
-        with open(eval_script_path, 'w') as f:
-            f.write(test_spec.eval_script)
-        runtime.copy_to(eval_script_path, '/tmp')
-
-    # Set +x
-    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
-    action.timeout = 600
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert obs.exit_code == 0
-
-    # Apply patch
-    exec_command = (
-        'cd /testbed && '
-        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
-        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
-        "echo 'APPLY_PATCH_FAIL')))"
-    )
-    action = CmdRunAction(command=exec_command, keep_prompt=False)
-    action.timeout = 600
-    obs = runtime.run_action(action)
-    assert isinstance(obs, CmdOutputObservation)
-    apply_patch_output = obs.content
-    assert isinstance(apply_patch_output, str)
-    instance['test_result']['apply_patch_output'] = apply_patch_output
-
-    try:
-        if 'APPLY_PATCH_FAIL' in apply_patch_output:
-            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
-            instance['test_result']['report']['failed_apply_patch'] = True
-
-            return EvalOutput(
-                instance_id=instance_id,
-                test_result=instance['test_result'],
-            )
-        elif 'APPLY_PATCH_PASS' in apply_patch_output:
-            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
-
-            # Run eval script in background and save output to log file
-            log_file = '/tmp/eval_output.log'
-            action = CmdRunAction(
-                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
-            )
-            action.timeout = 60  # Short timeout just to get the process ID
-            obs = runtime.run_action(action)
-
-            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
-                pid = obs.content.split()[-1].strip()
-                logger.info(
-                    f'[{instance_id}] Evaluation process started with PID: {pid}'
-                )
-
-                # Poll for completion
-                start_time = time.time()
-                timeout = 1800  # 30 minutes
-                while True:
-                    seconds_elapsed = time.time() - start_time
-                    if seconds_elapsed > timeout:
-                        logger.info(
-                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
-                        )
-                        instance['test_result']['report']['test_timeout'] = True
-                        break
-                    check_action = CmdRunAction(
-                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
-                    )
-                    check_action.timeout = 60
-                    check_obs = runtime.run_action(check_action)
-                    if (
-                        isinstance(check_obs, CmdOutputObservation)
-                        and check_obs.content.split()[-1].strip() == '1'
-                    ):
-                        logger.info(
-                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
-                        )
-                        break
-                    logger.info(
-                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
-                    )
-                    time.sleep(30)  # Wait for 30 seconds before checking again
-
-                # Read the log file
-                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
-                cat_action.timeout = 300
-                cat_obs = runtime.run_action(cat_action)
-
-                # Grade answer
-                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
-                    test_output = cat_obs.content
-                    assert isinstance(test_output, str)
-                    instance['test_result']['test_output'] = test_output
-
-                    # Get report from test output
-                    logger.info(f'[{instance_id}] Grading answer...')
-                    with tempfile.TemporaryDirectory() as temp_dir:
-                        # Create a directory structure that matches the expected format
-                        # NOTE: this is a hack to make the eval report format consistent
-                        # with the original SWE-Bench eval script
-                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
-                        os.makedirs(log_dir, exist_ok=True)
-                        test_output_path = os.path.join(log_dir, 'test_output.txt')
-                        with open(test_output_path, 'w') as f:
-                            f.write(test_output)
-
-                        _report = get_eval_report(
-                            test_spec=test_spec,
-                            prediction={
-                                'model_patch': model_patch,
-                                'instance_id': instance_id,
-                            },
-                            log_path=test_output_path,
-                            include_tests_status=True,
-                        )
-                        report = _report[instance_id]
-                        logger.info(
-                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
-                        )
-                        instance['test_result']['report']['resolved'] = report[
-                            'resolved'
-                        ]
-            else:
-                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
-                instance['test_result']['report']['error_eval'] = True
-
-            return EvalOutput(
-                instance_id=instance_id,
-                test_result=instance['test_result'],
-            )
-        else:
-            logger.info(
-                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
-            )
-            raise RuntimeError(
-                instance_id,
-                f'Unexpected output when applying patch:\n{apply_patch_output}',
-                logger,
-            )
-    finally:
-        runtime.close()
-
-
-if __name__ == '__main__':
-    parser = get_parser()
-    parser.add_argument(
-        '--input-file',
-        type=str,
-        help='Path to input predictions file',
-        required=True,
-    )
-    parser.add_argument(
-        '--dataset',
-        type=str,
-        default='princeton-nlp/SWE-bench',
-        help='data set to evaluate on, either full-test or lite-test',
-    )
-    parser.add_argument(
-        '--split',
-        type=str,
-        default='test',
-        help='split to evaluate on',
-    )
-    args, _ = parser.parse_known_args()
-
-    # Load SWE-Bench dataset
-    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
-        args.dataset, args.split
-    )
-    instance_id_to_instance = {
-        instance['instance_id']: instance for instance in full_dataset
-    }
-    logger.info(
-        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
-    )
-
-    # Load predictions
-    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
-    predictions = pd.read_json(args.input_file, lines=True)
-    assert (
-        'instance_id' in predictions.columns
-    ), 'Input file must contain instance_id column.'
-
-    if 'model_patch' not in predictions.columns and (
-        'test_result' in predictions.columns
-        and 'model_patch' in predictions['test_result'].iloc[0]
-    ):
-        raise ValueError(
-            'Input file must contain model_patch column OR test_result column with model_patch field.'
-        )
-    assert len(predictions['instance_id'].unique()) == len(
-        predictions
-    ), 'instance_id column must be unique.'
-
-    if 'model_patch' not in predictions.columns:
-        predictions['model_patch'] = predictions['test_result'].apply(
-            lambda x: x['git_patch']
-        )
-    assert {'instance_id', 'model_patch'}.issubset(
-        set(predictions.columns)
-    ), 'Input file must contain instance_id and model_patch columns.'
-
-    # Process model_patch
-    predictions['model_patch'] = predictions['model_patch'].apply(process_git_patch)
-
-    # Merge predictions with dataset
-    predictions['instance'] = predictions['instance_id'].apply(
-        lambda x: instance_id_to_instance[x]
-    )
-    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
-
-    # Prepare dataset
-    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
-    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
-
-    run_evaluation(
-        instances,
-        metadata=None,
-        output_file=output_file,
-        num_workers=args.eval_num_workers,
-        process_instance_func=process_instance,
-    )
-
-    # Load evaluated predictions & print number of resolved predictions
-    evaluated_predictions = pd.read_json(output_file, lines=True)
-    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
-
-    def count_report_field(row, field):
-        return row['test_result']['report'][field]
-
-    for field in fields:
-        count = evaluated_predictions.apply(
-            count_report_field, args=(field,), axis=1
-        ).sum()
-        logger.info(
-            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
-        )
@@ -2,6 +2,7 @@ import asyncio
 import json
 import os
 import tempfile
+import time
 from typing import Any

 import pandas as pd
@@ -30,9 +31,7 @@ from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import CmdRunAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
-from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
-from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue

 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
@@ -219,7 +218,7 @@ def initialize_runtime(
        assert obs.exit_code == 0

        action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
-        action.timeout = 3600
+        action.timeout = 1800
        logger.info(action, extra={'msg_type': 'ACTION'})
        obs = runtime.run_action(action)
        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -317,10 +316,10 @@ def complete_runtime(
                break
            else:
                logger.info('Failed to get git diff, retrying...')
-                sleep_if_should_continue(10)
+                time.sleep(10)
        elif isinstance(obs, ErrorObservation):
            logger.error(f'Error occurred: {obs.content}. Retrying...')
-            sleep_if_should_continue(10)
+            time.sleep(10)
        else:
            raise ValueError(f'Unexpected observation type: {type(obs)}')

@@ -384,7 +383,10 @@ def process_instance(
    if state is None:
        raise ValueError('State should not be None.')

-    histories = [event_to_dict(event) for event in state.history.get_events()]
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
    metrics = state.metrics.get() if state.metrics else None

    # Save the output
@@ -396,7 +398,6 @@ def process_instance(
        metadata=metadata,
        history=histories,
        metrics=metrics,
-        llm_completions=state.extra_data.get('llm_completions', []),
        error=state.last_error if state and state.last_error else None,
    )
    return output
@@ -5,23 +5,17 @@
 BASE_URL="https://api.all-hands.dev/v0"

 # Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}")
+runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')

-n_runtimes=$(echo $response | jq -r '.total')
-echo "Found ${n_runtimes} runtimes. Stopping them..."
-
-runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 # Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
-  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
+for runtime_id in $runtimes; do
+  echo "Stopping runtime: ${runtime_id}"
  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
    --header "Content-Type: application/json" \
    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
  echo
-  ((counter++))
 done

 echo "All runtimes have been stopped."
@@ -0,0 +1,96 @@
+"""Convert an evaluation output JSONL file into SWE-bench prediction format.
+
+Usage: python convert_od_output_to_swe_json.py <od_output_file>
+
+The result is written to the input path with `.jsonl` replaced by
+`.swebench.jsonl`. Each input row becomes a record with the fields
+`instance_id`, `model_patch` and `model_name_or_path`.
+"""
+
+import argparse
+import os
+
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument('od_output_file', type=str)
+args = parser.parse_args()
+output_filepath = args.od_output_file.replace('.jsonl', '.swebench.jsonl')
+if output_filepath == args.od_output_file:
+    # No '.jsonl' in the path: the replacement above is a no-op, so writing the
+    # result would overwrite the input file with the converted records.
+    parser.error(f"od_output_file must be a '.jsonl' path, got: {args.od_output_file}")
+print(f'Converting {args.od_output_file} to {output_filepath}')
+
+od_format = pd.read_json(args.od_output_file, orient='records', lines=True)
+# model name is the folder name of od_output_file
+model_name = os.path.basename(os.path.dirname(args.od_output_file))
+
+
+def process_git_patch(patch):
+    """Clean up a raw git patch so it can be used as a SWE-bench `model_patch`.
+
+    Args:
+        patch: The patch text; any non-`str` value is treated as a missing patch.
+
+    Returns:
+        `''` if `patch` is not a string or is only whitespace. Otherwise the
+        patch with CRLF converted to LF, everything before the first
+        `diff --git` line removed (if there is such a line), and trailing
+        whitespace replaced by a single newline.
+    """
+    if not isinstance(patch, str):
+        return ''
+
+    if not patch.strip():
+        # skip empty patches
+        return ''
+
+    patch = patch.replace('\r\n', '\n')
+    # There might be some weird characters at the beginning of the patch
+    # due to some OpenHands inference command outputs
+
+    # FOR EXAMPLE:
+    # git diff --no-color --cached 895f28f9cbed817c00ab68770433170d83132d90
+    # [A[C[C[C[C[C[C ... [C[C[C[K0
+    # diff --git a/django/db/models/sql/.backup.query.py b/django/db/models/sql/.backup.query.py
+    # new file mode 100644
+    # index 0000000000..fc13db5948
+
+    # Find the first line that starts with "diff --git" and drop the lines before it;
+    # if there is no such line, the patch is left untouched here.
+    lines = patch.split('\n')
+    for i, line in enumerate(lines):
+        if line.startswith('diff --git'):
+            patch = '\n'.join(lines[i:])
+            break
+
+    patch = patch.rstrip() + '\n'  # Make sure the last line ends with a newline
+    return patch
+
+
+def convert_row_to_swebench_format(row):
+    """Build the SWE-bench prediction record for one row of the input file.
+
+    The patch comes from the row's `git_patch` field when present, otherwise
+    from `test_result['git_patch']`.
+
+    Raises:
+        ValueError: If the row has neither of those fields.
+    """
+    if 'git_patch' in row:
+        model_patch = row['git_patch']
+    elif 'test_result' in row and 'git_patch' in row['test_result']:
+        model_patch = row['test_result']['git_patch']
+    else:
+        raise ValueError(f'Row {row} does not have a git_patch')
+
+    return {
+        'instance_id': row['instance_id'],
+        'model_patch': process_git_patch(model_patch),
+        'model_name_or_path': model_name,
+    }
+
+
+swebench_format = od_format.apply(convert_row_to_swebench_format, axis=1)
+swebench_format.to_json(output_filepath, lines=True, orient='records')
@@ -1,35 +0,0 @@
-import argparse
-import os
-
-import pandas as pd
-
-from evaluation.swe_bench.eval_infer import process_git_patch
-
-parser = argparse.ArgumentParser()
-parser.add_argument('oh_output_file', type=str)
-args = parser.parse_args()
-output_filepath = args.oh_output_file.replace('.jsonl', '.swebench.jsonl')
-print(f'Converting {args.oh_output_file} to {output_filepath}')
-
-oh_format = pd.read_json(args.oh_output_file, orient='records', lines=True)
-# model name is the folder name of oh_output_file
-model_name = os.path.basename(os.path.dirname(args.oh_output_file))
-
-
-def convert_row_to_swebench_format(row):
-    if 'git_patch' in row:
-        model_patch = row['git_patch']
-    elif 'test_result' in row and 'git_patch' in row['test_result']:
-        model_patch = row['test_result']['git_patch']
-    else:
-        raise ValueError(f'Row {row} does not have a git_patch')
-
-    return {
-        'instance_id': row['instance_id'],
-        'model_patch': process_git_patch(model_patch),
-        'model_name_or_path': model_name,
-    }
-
-
-swebench_format = oh_format.apply(convert_row_to_swebench_format, axis=1)
-swebench_format.to_json(output_filepath, lines=True, orient='records')
@@ -1,27 +0,0 @@
-import argparse
-
-import pandas as pd
-from datasets import load_dataset
-
-parser = argparse.ArgumentParser()
-parser.add_argument('output_filepath', type=str, help='Path to save the output file')
-parser.add_argument(
-    '--dataset_name',
-    type=str,
-    help='Name of the dataset to download',
-    default='princeton-nlp/SWE-bench_Lite',
-)
-parser.add_argument('--split', type=str, help='Split to download', default='test')
-args = parser.parse_args()
-
-dataset = load_dataset(args.dataset_name, split=args.split)
-output_filepath = args.output_filepath
-print(
-    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
-)
-patches = [
-    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
-]
-print(f'{len(patches)} gold patches loaded')
-pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
-print(f'Patches saved to {output_filepath}')
@@ -28,9 +28,9 @@ FILE_NAME=$(basename $PROCESS_FILEPATH)
 echo "Evaluating $FILE_NAME @ $FILE_DIR"

 # ================================================
-# detect whether PROCESS_FILEPATH is in OH format or in SWE-bench format
+# detect whether PROCESS_FILEPATH is in OD format or in SWE-bench format
 echo "=============================================================="
-echo "Detecting whether PROCESS_FILEPATH is in OH format or in SWE-bench format"
+echo "Detecting whether PROCESS_FILEPATH is in OD format or in SWE-bench format"
 echo "=============================================================="
 # SWE-bench format is a JSONL where every line has three fields: model_name_or_path, instance_id, and model_patch
 function is_swebench_format() {
@@ -56,9 +56,9 @@ if [ $IS_SWEBENCH_FORMAT -eq 0 ]; then
 else
    echo "The file IS NOT in SWE-bench format."

-    # ==== Convert OH format to SWE-bench format ====
+    # ==== Convert OD format to SWE-bench format ====
    echo "Merged output file with fine-grained report will be saved to $FILE_DIR"
-    poetry run python3 evaluation/swe_bench/scripts/eval/convert_oh_output_to_swe_json.py $PROCESS_FILEPATH
+    poetry run python3 evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py $PROCESS_FILEPATH
    # replace .jsonl with .swebench.jsonl in filename
    SWEBENCH_FORMAT_JSONL=${PROCESS_FILEPATH/.jsonl/.swebench.jsonl}
    echo "SWEBENCH_FORMAT_JSONL: $SWEBENCH_FORMAT_JSONL"
@@ -1,43 +0,0 @@
-#!/bin/bash
-set -eo pipefail
-
-INPUT_FILE=$1
-NUM_WORKERS=$2
-DATASET=$3
-SPLIT=$4
-
-if [ -z "$INPUT_FILE" ]; then
-  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
-  exit 1
-fi
-
-if [ -z "$DATASET" ]; then
-  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
-  DATASET="princeton-nlp/SWE-bench_Lite"
-fi
-
-if [ -z "$SPLIT" ]; then
-  echo "SPLIT not specified, use default test"
-  SPLIT="test"
-fi
-
-if [ -z "$NUM_WORKERS" ]; then
-  echo "NUM_WORKERS not specified, use default 1"
-  NUM_WORKERS=1
-fi
-
-echo "... Evaluating on $INPUT_FILE ..."
-
-COMMAND="poetry run python evaluation/swe_bench/eval_infer.py \
-  --eval-num-workers $NUM_WORKERS \
-  --input-file $INPUT_FILE \
-  --dataset $DATASET \
-  --split $SPLIT"
-
-if [ -n "$EVAL_LIMIT" ]; then
-  echo "EVAL_LIMIT: $EVAL_LIMIT"
-  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
-fi
-
-# Run the command
-eval $COMMAND
@@ -66,11 +66,6 @@ if [ "$USE_HINT_TEXT" = false ]; then
  EVAL_NOTE="$EVAL_NOTE-no-hint"
 fi

-if [ -n "$EXP_NAME" ]; then
-  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
-fi
-echo "EVAL_NOTE: $EVAL_NOTE"
-
 unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push

 COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
@@ -19,10 +19,10 @@ def extract_modified_files(patch):
    return modified_files


-def process_report(oh_output_file):
+def process_report(od_output_file):
    succ = 0
    fail = 0
-    for line in open(oh_output_file):
+    for line in open(od_output_file):
        line = json.loads(line)
        instance_id = line['instance_id']
        gold_patch = line['swe_instance']['patch']
@@ -48,7 +48,7 @@ def process_report(oh_output_file):

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
-    parser.add_argument('--oh_output_file', help='Path to the OH output file')
+    parser.add_argument('--od_output_file', help='Path to the OD output file')
    args = parser.parse_args()

-    process_report(args.oh_output_file)
+    process_report(args.od_output_file)
@@ -6,15 +6,15 @@ mkdir -p $EVAL_WORKSPACE

 # 1. Prepare REPO
 echo "==== Prepare SWE-bench repo ===="
-OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
-OH_SWE_BENCH_REPO_BRANCH="eval"
-git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
+OD_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/OD-SWE-bench.git"
+OD_SWE_BENCH_REPO_BRANCH="eval"
+git clone -b $OD_SWE_BENCH_REPO_BRANCH $OD_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OD-SWE-bench

 # 2. Prepare DATA
 echo "==== Prepare SWE-bench data ===="
 EVAL_IMAGE=ghcr.io/all-hands-ai/eval-swe-bench:builder_with_conda
 EVAL_WORKSPACE=$(realpath $EVAL_WORKSPACE)
-chmod +x $EVAL_WORKSPACE/OH-SWE-bench/swebench/harness/prepare_data.sh
+chmod +x $EVAL_WORKSPACE/OD-SWE-bench/swebench/harness/prepare_data.sh
 if [ -d $EVAL_WORKSPACE/eval_data ]; then
    rm -r $EVAL_WORKSPACE/eval_data
 fi
@@ -24,4 +24,4 @@ docker run \
    -u $(id -u):$(id -g) \
    -e HF_DATASETS_CACHE="/tmp" \
    --rm -it $EVAL_IMAGE \
-    bash -c "cd OH-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
+    bash -c "cd OD-SWE-bench/swebench/harness && /swe_util/miniforge3/bin/conda run -n swe-bench-eval ./prepare_data.sh && mv eval_data /workspace/"
@@ -60,7 +60,7 @@ conda activate swe-bench-eval

 mkdir -p $SWE_TASK_DIR/reset_testbed_temp
 mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
-SWE_BENCH_DIR=/swe_util/OH-SWE-bench
+SWE_BENCH_DIR=/swe_util/OD-SWE-bench
 output=$(
    export PYTHONPATH=$SWE_BENCH_DIR && \
    cd $SWE_BENCH_DIR && \
@@ -6,6 +6,7 @@ import pathlib
 import subprocess
 import time
 import traceback
+from concurrent.futures import Future, ProcessPoolExecutor
 from typing import Any, Awaitable, Callable, TextIO

 import pandas as pd
@@ -49,20 +50,15 @@ class EvalMetadata(BaseModel):
 class EvalOutput(BaseModel):
    # NOTE: User-specified
    instance_id: str
+    instruction: str
    # output of the evaluation
    # store anything that is needed for the score calculation
    test_result: dict[str, Any]

-    instruction: str | None = None
-
    # Interaction info
-    metadata: EvalMetadata | None = None
-    # list[tuple[dict[str, Any], dict[str, Any]]] - for compatibility with the old format
-    history: (
-        list[dict[str, Any]] | list[tuple[dict[str, Any], dict[str, Any]]] | None
-    ) = None
-    llm_completions: list[dict[str, Any]]
-    metrics: dict[str, Any] | None = None
+    metadata: EvalMetadata
+    history: list[tuple[dict[str, Any], dict[str, Any]]]
+    metrics: dict[str, Any]
    error: str | None = None

    # Optionally save the input test instance
@@ -70,22 +66,24 @@ class EvalOutput(BaseModel):

    def model_dump(self, *args, **kwargs):
        dumped_dict = super().model_dump(*args, **kwargs)
-        # Remove None values
-        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        if self.metadata is not None:
-            dumped_dict['metadata'] = self.metadata.model_dump()
+        dumped_dict['metadata'] = self.metadata.model_dump()
        return dumped_dict

    def model_dump_json(self, *args, **kwargs):
        dumped = super().model_dump_json(*args, **kwargs)
        dumped_dict = json.loads(dumped)
        # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        if 'metadata' in dumped_dict:
-            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
        return json.dumps(dumped_dict)


+class EvalError(BaseModel):
+    instance_id: str
+    error: str
+    stacktrace: str
+
+
 def codeact_user_response(
    state: State,
    encapsulate_solution: bool = False,
@@ -237,94 +235,85 @@ def prepare_dataset(


 def update_progress(
-    result: EvalOutput,
+    result_or_future: Future | EvalOutput | EvalError,
+    instance: pd.Series,
    pbar: tqdm,
    output_fp: TextIO,
+    instance_queue: mp.Queue,
 ):
    """Update the progress bar and write the result to the output file."""
-    pbar.update(1)
-    pbar.set_description(f'Instance {result.instance_id}')
-    pbar.set_postfix_str(f'Test Result: {result.test_result}')
-    logger.info(
-        f'Finished evaluation for instance {result.instance_id}: {str(result.test_result)[:300]}...\n'
+    try:
+        if isinstance(result_or_future, Future):
+            result = result_or_future.result()
+        else:
+            result = result_or_future
+    except Exception as e:
+        # Handle the error
+        # Exception may be raised in the process_instance_func and will
+        # be raised here when we try to access the .result() of the future
+        handle_error(
+            EvalError(
+                instance_id=instance.instance_id,
+                error=str(e),
+                stacktrace=traceback.format_exc(),
+            ),
+            instance,
+            pbar,
+            instance_queue,
+        )
+        return
+
+    # Update the progress bar and write the result to the output file
+    if isinstance(result, EvalOutput):
+        pbar.update(1)
+        pbar.set_description(f'Instance {result.instance_id}')
+        pbar.set_postfix_str(f'Test Result: {result.test_result}')
+        logger.info(
+            f'Finished evaluation for instance {result.instance_id}: {str(result.test_result)[:300]}...\n'
+        )
+        output_fp.write(json.dumps(result.model_dump()) + '\n')
+        output_fp.flush()
+    elif isinstance(result, EvalError):
+        handle_error(result, instance, pbar, instance_queue)
+    else:
+        raise ValueError(f'Unexpected result type: {type(result)}')
+
+
+def handle_error(
+    error: EvalError, instance: pd.Series, pbar: tqdm, instance_queue: mp.Queue
+):
+    """Handle an error that occurred during evaluation."""
+    logger.error(
+        f'Retrying instance [{instance.instance_id}] due to error: {error.error}. Stacktrace:\n{error.stacktrace}'
+        + '\n'
+        + '-' * 10
+        + '[You may ignore this error if it is a transient issue - the instance will be automatically retried.]'
+        + '-' * 10
+        + '\n'
    )
-    output_fp.write(json.dumps(result.model_dump()) + '\n')
-    output_fp.flush()
-
-
-def _process_instance_wrapper(
-    process_instance_func: Callable[[pd.Series, EvalMetadata, bool], EvalOutput],
-    instance: pd.Series,
-    metadata: EvalMetadata,
-    use_mp: bool,
-    max_retries: int = 5,
-) -> EvalOutput:
-    """Wrap the process_instance_func to handle retries and errors.
-
-    Retry an instance up to max_retries times if it fails (e.g., due to transient network/runtime issues).
-    """
-    for attempt in range(max_retries + 1):
-        try:
-            result = process_instance_func(instance, metadata, use_mp)
-            return result
-        except Exception as e:
-            error = str(e)
-            stacktrace = traceback.format_exc()
-            if attempt == max_retries:
-                logger.exception(e)
-                msg = (
-                    '-' * 10
-                    + '\n'
-                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
-                    + '\n'
-                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
-                    + '-' * 10
-                )
-                # Raise an error after all retries & stop the evaluation
-                logger.exception(e)
-                raise RuntimeError(
-                    f'Maximum error retries reached for instance {instance.instance_id}'
-                ) from e
-            msg = (
-                '-' * 10
-                + '\n'
-                + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
-                + '\n'
-                + '-' * 10
-                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
-                + '-' * 10
-                + '\n'
-            )
-            logger.error(msg)
-            if use_mp:
-                print(msg)  # use print to directly print to console
-            time.sleep(5)
-
-
-def _process_instance_wrapper_mp(args):
-    """Wrapper for multiprocessing, especially for imap_unordered."""
-    return _process_instance_wrapper(*args)
+    instance_queue.put(instance)
+    pbar.total += 1
+    pbar.refresh()


 def run_evaluation(
    dataset: pd.DataFrame,
-    metadata: EvalMetadata | None,
+    metadata: EvalMetadata,
    output_file: str,
    num_workers: int,
    process_instance_func: Callable[
        [pd.Series, EvalMetadata, bool], Awaitable[EvalOutput]
    ],
-    max_retries: int = 5,  # number of retries for each instance
 ):
    use_multiprocessing = num_workers > 1
+    logger.info(
+        f'Evaluation started with Agent {metadata.agent_class}:\n'
+        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+    )

-    if metadata is not None:
-        logger.info(
-            f'Evaluation started with Agent {metadata.agent_class}:\n'
-            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-        )
-    else:
-        logger.info(f'Evaluation started with {num_workers} workers.')
+    instance_queue = mp.Queue()
+    for _, instance in dataset.iterrows():
+        instance_queue.put(instance)

    total_instances = len(dataset)
    pbar = tqdm(total=total_instances, desc='Instances processed')
@@ -332,24 +321,54 @@ def run_evaluation(

    try:
        if use_multiprocessing:
-            with mp.Pool(num_workers) as pool:
-                args_iter = (
-                    (process_instance_func, instance, metadata, True, max_retries)
-                    for _, instance in dataset.iterrows()
-                )
-                results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
-                for result in results:
-                    update_progress(result, pbar, output_fp)
+            with ProcessPoolExecutor(num_workers) as executor:
+                batch_futures = []
+
+                # Loop until there are *no more instances to be processed* and *all (in-progress) futures are done*
+                # since a running future may add new instances to the queue when error occurs
+                while not instance_queue.empty() or batch_futures:
+                    # Submit new tasks if there are instances to be processed and available workers
+                    while (
+                        not instance_queue.empty() and len(batch_futures) < num_workers
+                    ):
+                        try:
+                            instance = instance_queue.get(block=False)
+                            future = executor.submit(
+                                process_instance_func, instance, metadata, True
+                            )
+                            future.instance = (
+                                instance  # Attach the instance to the future
+                            )
+                            batch_futures.append(future)
+                        except mp.queues.Empty:
+                            logger.warning(
+                                'Queue is empty - This should not happen. This is a bug.'
+                            )
+                            break  # Queue is empty, stop submitting new tasks
+
+                    # Continue to wait for the futures to be done & remove completed futures
+                    new_batch_futures = []
+                    for future in batch_futures:
+                        if future.done():
+                            update_progress(
+                                future, future.instance, pbar, output_fp, instance_queue
+                            )
+                        else:
+                            new_batch_futures.append(future)
+                    batch_futures = new_batch_futures
+
+                    # Short sleep to prevent busy-waiting
+                    time.sleep(1)
+
+                assert instance_queue.empty(), 'instance_queue should be empty after all futures are done. This is a bug.'
+                assert (
+                    len(batch_futures) == 0
+                ), 'batch_futures should be empty after all futures are done. This is a bug.'
        else:
-            for _, instance in dataset.iterrows():
-                result = _process_instance_wrapper(
-                    process_instance_func=process_instance_func,
-                    instance=instance,
-                    metadata=metadata,
-                    use_mp=False,
-                    max_retries=max_retries,
-                )
-                update_progress(result, pbar, output_fp)
+            while not instance_queue.empty():
+                instance = instance_queue.get()
+                result = process_instance_func(instance, metadata, False)
+                update_progress(result, instance, pbar, output_fp, instance_queue)

    except KeyboardInterrupt:
        print('\nKeyboardInterrupt received. Cleaning up...\n')
@@ -1,6 +1,6 @@
 {
  "name": "openhands-frontend",
-  "version": "0.9.4",
+  "version": "0.9.3",
  "private": true,
  "type": "module",
  "engines": {
@@ -8,7 +8,7 @@
  },
  "dependencies": {
    "@monaco-editor/react": "^4.6.0",
-    "@nextui-org/react": "^2.4.8",
+    "@nextui-org/react": "^2.4.6",
    "@react-types/shared": "^3.24.1",
    "@reduxjs/toolkit": "^2.2.7",
    "@vitejs/plugin-react": "^4.3.1",
@@ -19,20 +19,20 @@
    "i18next": "^23.15.1",
    "i18next-browser-languagedetector": "^8.0.0",
    "i18next-http-backend": "^2.6.1",
-    "jose": "^5.9.3",
-    "monaco-editor": "^0.52.0",
+    "jose": "^5.8.0",
+    "monaco-editor": "^0.51.0",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-highlight": "^0.15.0",
    "react-hot-toast": "^2.4.1",
-    "react-i18next": "^15.0.2",
+    "react-i18next": "^15.0.1",
    "react-icons": "^5.3.0",
    "react-markdown": "^9.0.1",
    "react-redux": "^9.1.2",
    "react-syntax-highlighter": "^15.5.0",
    "remark-gfm": "^4.0.0",
    "tailwind-merge": "^2.5.2",
-    "vite": "^5.4.7",
+    "vite": "^5.4.5",
    "web-vitals": "^3.5.2"
  },
  "scripts": {
@@ -64,8 +64,8 @@
    "@testing-library/jest-dom": "^6.5.0",
    "@testing-library/react": "^16.0.1",
    "@testing-library/user-event": "^14.5.2",
-    "@types/node": "^22.6.1",
-    "@types/react": "^18.3.8",
+    "@types/node": "^22.5.4",
+    "@types/react": "^18.3.5",
    "@types/react-dom": "^18.3.0",
    "@types/react-highlight": "^0.12.8",
    "@types/react-syntax-highlighter": "^15.5.13",
@@ -85,9 +85,9 @@
    "husky": "^9.1.6",
    "jsdom": "^25.0.0",
    "lint-staged": "^15.2.10",
-    "postcss": "^8.4.47",
+    "postcss": "^8.4.45",
    "prettier": "^3.3.3",
-    "tailwindcss": "^3.4.12",
+    "tailwindcss": "^3.4.11",
    "typescript": "^5.6.2",
    "vite-tsconfig-paths": "^5.0.1",
    "vitest": "^1.6.0"
@@ -9,8 +9,8 @@
 		</clipPath>
 	</defs>
 	<style>
-		.s0 { fill: none }
-		.s1 { fill: #000000 }
+		.s0 { fill: none } 
+		.s1 { fill: #000000 } 
 	</style>
 	<g id="surface1">
 		<path class="s0" d="m1258.6 499.2c-40.8-26.1-68 13.8-64.7 68.1l-0.3 0.4c0.1-56.6-7.3-119.2-31.7-169.8-8.7-17.9-26.2-47.4-61-33.5-15.2 6.1-29 24.4-21.9 71.7 0 0 8 49.7 6.5 112.2v0.8c-9.9-172.2-47.3-224.7-100.7-221.3-17.1 3.1-40.5 10.8-32.6 63.8 0 0 8.5 55.2 11.3 99.2l0.2 2.2h-0.2c-25.2-94.9-59-96.2-83.5-92.5-22.3 3.3-46.6 27.3-34.3 74.7 38.6 148.4 31 327.2 28.2 352.9-7.9-17.6-10.3-31.4-21.3-50.7-43.9-77.1-64.8-82.8-90.4-84-25.5-1.1-53 15.2-51.2 46.3 1.9 31.1 17.1 36.3 38.7 79.6 16.9 33.8 21.7 78 55.7 158.4 28.1 66.5 101.6 139.5 235.6 130.8 108.5-3.7 270.6-43.2 242.4-302.5-7-45-1.7-82.7 1.9-121.4 5.8-60 14.1-159.4-26.7-185.5z"/>
@@ -29,4 +29,4 @@
 		<path class="s1" d="m668.8 421.6c-10.9 0.1-20.3-8.5-21.2-20-4-51.4-4.2-103.5-0.4-154.8 0.9-12 11.1-21 22.6-20.1 11.6 0.9 20.3 11.3 19.4 23.3-3.6 49.1-3.4 98.9 0.4 148 1 12-7.7 22.5-19.3 23.5-0.5 0-1 0-1.5 0z"/>
 		<path class="s1" d="m596.2 435.1c-9.4 0.1-18.2-6.5-20.6-16.4-8.9-36.3-25.9-70.8-48.9-99.7-7.4-9.3-6.1-23 2.8-30.7 9-7.6 22.3-6.3 29.7 3 26.9 33.8 46.7 74.2 57.2 116.7 2.9 11.6-4 23.5-15.2 26.5-1.8 0.4-3.4 0.6-5.1 0.7z"/>
 	</g>
-</svg>
+</svg>
@@ -18,7 +18,6 @@ enum IndicatorColor {
 function AgentStatusBar() {
  const { t } = useTranslation();
  const { curAgentState } = useSelector((state: RootState) => state.agent);
-  const { curStatusMessage } = useSelector((state: RootState) => state.status);

  const AgentStatusMap: {
    [k: string]: { message: string; indicator: IndicatorColor };
@@ -91,25 +90,14 @@ function AgentStatusBar() {
    }
  }, [curAgentState]);

-  const [statusMessage, setStatusMessage] = React.useState<string>("");
-
-  React.useEffect(() => {
-    const trimmedCustomMessage = curStatusMessage.message.trim();
-    if (trimmedCustomMessage) {
-      setStatusMessage(t(trimmedCustomMessage));
-    } else {
-      setStatusMessage(AgentStatusMap[curAgentState].message);
-    }
-  }, [curAgentState, curStatusMessage.message]);
-
  return (
-    <div className="flex flex-col items-center">
-      <div className="flex items-center">
-        <div
-          className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
-        />
-        <span className="text-sm text-stone-400">{statusMessage}</span>
-      </div>
+    <div className="flex items-center">
+      <div
+        className={`w-3 h-3 mr-2 rounded-full animate-pulse ${AgentStatusMap[curAgentState].indicator}`}
+      />
+      <span className="text-sm text-stone-400">
+        {AgentStatusMap[curAgentState].message}
+      </span>
    </div>
  );
 }
@@ -5,25 +5,16 @@ import { renderWithProviders } from "test-utils";
 import Chat from "./Chat";

 const MESSAGES: Message[] = [
-  {
-    sender: "assistant",
-    content: "Hello!",
-    imageUrls: [],
-    timestamp: new Date().toISOString(),
-  },
-  {
-    sender: "user",
-    content: "Hi!",
-    imageUrls: [],
-    timestamp: new Date().toISOString(),
-  },
+  { sender: "assistant", content: "Hello!", imageUrls: [] },
+  { sender: "user", content: "Hi!", imageUrls: [] },
+  { sender: "assistant", content: "How can I help you today?", imageUrls: [] },
 ];

 describe("Chat", () => {
  it("should render chat messages", () => {
    renderWithProviders(<Chat messages={MESSAGES} />);

-    const messages = screen.getAllByTestId("article");
+    const messages = screen.getAllByTestId("message");
    expect(messages).toHaveLength(MESSAGES.length);
  });
 });
@@ -9,62 +9,17 @@ import ActionType from "#/types/ActionType";
 import { addAssistantMessage } from "#/state/chatSlice";
 import AgentState from "#/types/AgentState";

-/// <reference types="vitest" />
-
-interface CustomMatchers<R = unknown> {
-  toMatchMessageEvent(expected: string): R;
-}
-
-declare module "vitest" {
-  interface Assertion<T> extends CustomMatchers<T> {}
-  interface AsymmetricMatchersContaining extends CustomMatchers {}
-}
-
 // This is for the scrollview ref in Chat.tsx
 // TODO: Move this into test setup
 HTMLElement.prototype.scrollTo = vi.fn().mockImplementation(() => {});
-const TEST_TIMESTAMP = new Date().toISOString();

 describe("ChatInterface", () => {
  const sessionSendSpy = vi.spyOn(Session, "send");
  vi.spyOn(Session, "isConnected").mockReturnValue(true);

-  // TODO: replace below with e.g. fake timers
-  // https://vitest.dev/guide/mocking#timers
-  // https://vitest.dev/api/vi.html#vi-usefaketimers
-  // Custom matcher for testing message events
-  expect.extend({
-    toMatchMessageEvent(received, expected) {
-      const receivedObj = JSON.parse(received);
-      const expectedObj = JSON.parse(expected);
-
-      // Compare everything except the timestamp
-      const { timestamp: receivedTimestamp, ...receivedRest } =
-        receivedObj.args;
-      const { timestamp: expectedTimestamp, ...expectedRest } =
-        expectedObj.args;
-
-      const pass =
-        this.equals(receivedRest, expectedRest) &&
-        typeof receivedTimestamp === "string";
-
-      return {
-        pass,
-        message: () =>
-          pass
-            ? `expected ${received} not to match the structure of ${expected} (ignoring exact timestamp)`
-            : `expected ${received} to match the structure of ${expected} (ignoring exact timestamp)`,
-      };
-    },
-  });
-
  const userMessageEvent = {
    action: ActionType.MESSAGE,
-    args: {
-      content: "my message",
-      images_urls: [],
-      timestamp: TEST_TIMESTAMP,
-    },
+    args: { content: "my message", images_urls: [] },
  };

  afterEach(() => {
@@ -73,26 +28,19 @@ describe("ChatInterface", () => {

  it("should render empty message list and input", () => {
    renderWithProviders(<ChatInterface />);
-    expect(screen.queryAllByTestId("article")).toHaveLength(0);
+    expect(screen.queryAllByTestId("message")).toHaveLength(0);
  });

  it("should render user and assistant messages", () => {
    const { store } = renderWithProviders(<ChatInterface />, {
      preloadedState: {
        chat: {
-          messages: [
-            {
-              sender: "user",
-              content: "Hello",
-              imageUrls: [],
-              timestamp: TEST_TIMESTAMP,
-            },
-          ],
+          messages: [{ sender: "user", content: "Hello", imageUrls: [] }],
        },
      },
    });

-    expect(screen.getAllByTestId("article")).toHaveLength(1);
+    expect(screen.getAllByTestId("message")).toHaveLength(1);
    expect(screen.getByText("Hello")).toBeInTheDocument();

    act(() => {
@@ -100,7 +48,7 @@ describe("ChatInterface", () => {
      store.dispatch(addAssistantMessage("Hello to you!"));
    });

-    expect(screen.getAllByTestId("article")).toHaveLength(2);
+    expect(screen.getAllByTestId("message")).toHaveLength(2);
    expect(screen.getByText("Hello to you!")).toBeInTheDocument();
  });

@@ -119,7 +67,7 @@ describe("ChatInterface", () => {
    await user.keyboard("{Enter}");

    expect(sessionSendSpy).toHaveBeenCalledWith(
-      expect.toMatchMessageEvent(JSON.stringify(userMessageEvent)),
+      JSON.stringify(userMessageEvent),
    );
  });

@@ -138,7 +86,7 @@ describe("ChatInterface", () => {
    await user.keyboard("{Enter}");

    expect(sessionSendSpy).toHaveBeenCalledWith(
-      expect.toMatchMessageEvent(JSON.stringify(userMessageEvent)),
+      JSON.stringify(userMessageEvent),
    );
  });

@@ -67,15 +67,8 @@ function ChatInterface() {
  };

  const handleSendMessage = (content: string, imageUrls: string[]) => {
-    const timestamp = new Date().toISOString();
-    dispatch(
-      addUserMessage({
-        content,
-        imageUrls,
-        timestamp,
-      }),
-    );
-    sendChatMessage(content, imageUrls, timestamp);
+    dispatch(addUserMessage({ content, imageUrls }));
+    sendChatMessage(content, imageUrls);
  };

  const { t } = useTranslation();
@@ -105,7 +98,6 @@ function ChatInterface() {
          ref={scrollRef}
          className="overflow-y-auto p-3"
          onScroll={(e) => onChatBodyScroll(e.currentTarget)}
-          aria-label={t(I18nKey.CHAT_INTERFACE$CHAT_CONVERSATION)}
        >
          <Chat messages={messages} curAgentState={curAgentState} />
        </div>
@@ -9,35 +9,25 @@ describe("Message", () => {
  it("should render a user message", () => {
    render(
      <ChatMessage
-        message={{
-          sender: "user",
-          content: "Hello",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-        }}
+        message={{ sender: "user", content: "Hello", imageUrls: [] }}
        isLastMessage={false}
      />,
    );

-    expect(screen.getByTestId("article")).toBeInTheDocument();
-    expect(screen.getByTestId("article")).toHaveClass("self-end"); // user message should be on the right side
+    expect(screen.getByTestId("message")).toBeInTheDocument();
+    expect(screen.getByTestId("message")).toHaveClass("self-end"); // user message should be on the right side
  });

  it("should render an assistant message", () => {
    render(
      <ChatMessage
-        message={{
-          sender: "assistant",
-          content: "Hi",
-          imageUrls: [],
-          timestamp: new Date().toISOString(),
-        }}
+        message={{ sender: "assistant", content: "Hi", imageUrls: [] }}
        isLastMessage={false}
      />,
    );

-    expect(screen.getByTestId("article")).toBeInTheDocument();
-    expect(screen.getByTestId("article")).not.toHaveClass("self-end"); // assistant message should be on the left side
+    expect(screen.getByTestId("message")).toBeInTheDocument();
+    expect(screen.getByTestId("message")).not.toHaveClass("self-end"); // assistant message should be on the left side
  });

  it("should render markdown content", () => {
@@ -47,7 +37,6 @@ describe("Message", () => {
          sender: "user",
          content: "```js\nconsole.log('Hello')\n```",
          imageUrls: [],
-          timestamp: new Date().toISOString(),
        }}
        isLastMessage={false}
      />,
@@ -67,17 +56,12 @@ describe("Message", () => {
      const user = userEvent.setup();
      render(
        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Hello",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
+          message={{ sender: "user", content: "Hello", imageUrls: [] }}
          isLastMessage={false}
        />,
      );

-      const message = screen.getByTestId("article");
+      const message = screen.getByTestId("message");
      let copyButton = within(message).queryByTestId("copy-button");
      expect(copyButton).not.toBeInTheDocument();

@@ -96,17 +80,12 @@ describe("Message", () => {
      const user = userEvent.setup();
      render(
        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Hello",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
+          message={{ sender: "user", content: "Hello", imageUrls: [] }}
          isLastMessage={false}
        />,
      );

-      const message = screen.getByTestId("article");
+      const message = screen.getByTestId("message");
      fireEvent.mouseEnter(message);

      const copyButton = within(message).getByTestId("copy-button");
@@ -139,7 +118,6 @@ describe("Message", () => {
            sender: "assistant",
            content: "Are you sure?",
            imageUrls: [],
-            timestamp: new Date().toISOString(),
          }}
          isLastMessage={false}
          awaitingUserConfirmation
@@ -150,12 +128,7 @@ describe("Message", () => {
      // it should not render buttons if the message is not from the assistant
      rerender(
        <ChatMessage
-          message={{
-            sender: "user",
-            content: "Yes",
-            imageUrls: [],
-            timestamp: new Date().toISOString(),
-          }}
+          message={{ sender: "user", content: "Yes", imageUrls: [] }}
          isLastMessage
          awaitingUserConfirmation
        />,
@@ -169,7 +142,6 @@ describe("Message", () => {
            sender: "assistant",
            content: "Are you sure?",
            imageUrls: [],
-            timestamp: new Date().toISOString(),
          }}
          isLastMessage
          awaitingUserConfirmation={false}
@@ -184,7 +156,6 @@ describe("Message", () => {
            sender: "assistant",
            content: "Are you sure?",
            imageUrls: [],
-            timestamp: new Date().toISOString(),
          }}
          isLastMessage
          awaitingUserConfirmation
@@ -8,7 +8,6 @@ import { code } from "../markdown/code";
 import toast from "#/utils/toast";
 import { I18nKey } from "#/i18n/declaration";
 import ConfirmationButtons from "./ConfirmationButtons";
-import { formatTimestamp } from "#/utils/utils";

 interface MessageProps {
  message: Message;
@@ -60,30 +59,19 @@ function ChatMessage({
    }
  };

-  const copyButtonTitle = message.timestamp
-    ? `${t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE)} - ${formatTimestamp(message.timestamp)}`
-    : t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE);
-
  return (
-    <article
-      data-testid="article"
+    <div
+      data-testid="message"
      className={className}
      onMouseEnter={() => setIsHovering(true)}
      onMouseLeave={() => setIsHovering(false)}
-      aria-label={t(I18nKey.CHAT_INTERFACE$MESSAGE_ARIA_LABEL, {
-        sender: message.sender
-          ? message.sender.charAt(0).toUpperCase() +
-            message.sender.slice(1).toLowerCase()
-          : t(I18nKey.CHAT_INTERFACE$UNKNOWN_SENDER),
-      })}
    >
      {isHovering && (
        <button
          data-testid="copy-button"
          onClick={copyToClipboard}
          className="absolute top-1 right-1 p-1 bg-neutral-600 rounded hover:bg-neutral-700"
-          aria-label={copyButtonTitle}
-          title={copyButtonTitle}
+          aria-label={t(I18nKey.CHAT_INTERFACE$TOOLTIP_COPY_MESSAGE)}
          type="button"
        >
          {isCopy ? <FaClipboardCheck /> : <FaClipboard />}
@@ -92,9 +80,9 @@ function ChatMessage({
      <Markdown components={{ code }} remarkPlugins={[remarkGfm]}>
        {message.content}
      </Markdown>
-      {(message.imageUrls?.length ?? 0) > 0 && (
+      {message.imageUrls.length > 0 && (
        <div className="flex space-x-2 mt-2">
-          {message.imageUrls?.map((url, index) => (
+          {message.imageUrls.map((url, index) => (
            <img
              key={index}
              src={url}
@@ -107,7 +95,7 @@ function ChatMessage({
      {isLastMessage &&
        message.sender === "assistant" &&
        awaitingUserConfirmation && <ConfirmationButtons />}
-    </article>
+    </div>
  );
 }

@@ -2,5 +2,4 @@ type Message = {
  sender: "user" | "assistant";
  content: string;
  imageUrls: string[];
-  timestamp: string;
 };
@@ -112,7 +112,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model} title={model}>
+                <AutocompleteItem key={model} value={model}>
                  {model}
                </AutocompleteItem>
              ))}
@@ -121,7 +121,7 @@ export function ModelSelector({
            {models[selectedProvider || ""]?.models
              .filter((model) => !VERIFIED_MODELS.includes(model))
              .map((model) => (
-                <AutocompleteItem key={model} value={model} title={model}>
+                <AutocompleteItem key={model} value={model}>
                  {model}
                </AutocompleteItem>
              ))}
@@ -52,17 +52,13 @@ function SettingsForm({
  const [enableAdvanced, setEnableAdvanced] =
    React.useState(advancedAlreadyInUse);

-  const handleAdvancedChange = (value: boolean) => {
-    setEnableAdvanced(value);
-  };
-
  return (
    <>
      <Switch
        data-testid="advanced-options-toggle"
        aria-checked={enableAdvanced}
        isSelected={enableAdvanced}
-        onValueChange={handleAdvancedChange}
+        onValueChange={(value) => setEnableAdvanced(value)}
      >
        Advanced Options
      </Switch>
@@ -6,11 +6,10 @@ import {
  ActionSecurityRisk,
  appendSecurityAnalyzerInput,
 } from "#/state/securityAnalyzerSlice";
-import { setCurStatusMessage } from "#/state/statusSlice";
 import { setRootTask } from "#/state/taskSlice";
 import store from "#/store";
 import ActionType from "#/types/ActionType";
-import { ActionMessage, StatusMessage } from "#/types/Message";
+import { ActionMessage } from "#/types/Message";
 import { SocketMessage } from "#/types/ResponseType";
 import { handleObservationMessage } from "./observations";
 import { getRootTask } from "./taskService";
@@ -34,11 +33,7 @@ const messageActions = {
  [ActionType.MESSAGE]: (message: ActionMessage) => {
    if (message.source === "user") {
      store.dispatch(
-        addUserMessage({
-          content: message.args.content,
-          imageUrls: [],
-          timestamp: message.timestamp,
-        }),
+        addUserMessage({ content: message.args.content, imageUrls: [] }),
      );
    } else {
      store.dispatch(addAssistantMessage(message.args.content));
@@ -139,16 +134,6 @@ export function handleActionMessage(message: ActionMessage) {
  }
 }

-export function handleStatusMessage(message: StatusMessage) {
-  const msg = message.message == null ? "" : message.message.trim();
-  store.dispatch(
-    setCurStatusMessage({
-      ...message,
-      message: msg,
-    }),
-  );
-}
-
 export function handleAssistantMessage(data: string | SocketMessage) {
  let socketMessage: SocketMessage;

@@ -160,9 +145,7 @@ export function handleAssistantMessage(data: string | SocketMessage) {

  if ("action" in socketMessage) {
    handleActionMessage(socketMessage);
-  } else if ("observation" in socketMessage) {
+  } else {
    handleObservationMessage(socketMessage);
-  } else if ("message" in socketMessage) {
-    handleStatusMessage(socketMessage);
  }
 }
@@ -1,14 +1,10 @@
 import ActionType from "#/types/ActionType";
 import Session from "./session";

-export function sendChatMessage(
-  message: string,
-  images_urls: string[],
-  timestamp: string,
-): void {
+export function sendChatMessage(message: string, images_urls: string[]): void {
  const event = {
    action: ActionType.MESSAGE,
-    args: { content: message, images_urls, timestamp },
+    args: { content: message, images_urls },
  };
  const eventString = JSON.stringify(event);
  Session.send(eventString);
@@ -8,19 +8,11 @@ import { I18nKey } from "#/i18n/declaration";

 const translate = (key: I18nKey) => i18next.t(key);

-// Define a type for the messages
-type Message = {
-  action: ActionType;
-  args: Record<string, unknown>;
-};
-
 class Session {
  private static _socket: WebSocket | null = null;

  private static _latest_event_id: number = -1;

-  private static _messageQueue: Message[] = [];
-
  public static _history: Record<string, unknown>[] = [];

  // callbacks contain a list of callable functions
@@ -91,7 +83,6 @@ class Session {
      toast.success("ws", translate(I18nKey.SESSION$SERVER_CONNECTED_MESSAGE));
      Session._connecting = false;
      Session._initializeAgent();
-      Session._flushQueue();
      Session.callbacks.open?.forEach((callback) => {
        callback(e);
      });
@@ -103,6 +94,7 @@ class Session {
        data = JSON.parse(e.data);
        Session._history.push(data);
      } catch (err) {
+        // TODO: report the error
        toast.error(
          "ws",
          translate(I18nKey.SESSION$SESSION_HANDLING_ERROR_MESSAGE),
@@ -123,7 +115,6 @@ class Session {
    };

    Session._socket.onerror = () => {
-      // TODO report error
      toast.error(
        "ws",
        translate(I18nKey.SESSION$SESSION_CONNECTION_ERROR_MESSAGE),
@@ -154,20 +145,9 @@ class Session {
    Session._socket = null;
  }

-  private static _flushQueue(): void {
-    while (Session._messageQueue.length > 0) {
-      const message = Session._messageQueue.shift();
-      if (message) {
-        setTimeout(() => Session.send(JSON.stringify(message)), 1000);
-      }
-    }
-  }
-
  static send(message: string): void {
-    const messageObject: Message = JSON.parse(message);
-
    if (Session._connecting) {
-      Session._messageQueue.push(messageObject);
+      setTimeout(() => Session.send(message), 1000);
      return;
    }
    if (!Session.isConnected()) {
@@ -87,10 +87,10 @@ export const getSettings = (): Settings => {
 export const saveSettings = (settings: Partial<Settings>) => {
  Object.keys(settings).forEach((key) => {
    const isValid = validKeys.includes(key as keyof Settings);
-    if (!isValid) return;
-    let value = settings[key as keyof Settings];
-    if (value === undefined || value === null) value = "";
-    localStorage.setItem(key, value.toString());
+    const value = settings[key as keyof Settings];
+    // Persist only known keys that carry a value; null would throw on toString().
+    if (isValid && value !== undefined && value !== null)
+      localStorage.setItem(key, value.toString());
  });
  localStorage.setItem("SETTINGS_VERSION", LATEST_SETTINGS_VERSION.toString());
 };
@@ -12,17 +12,12 @@ export const chatSlice = createSlice({
  reducers: {
    addUserMessage(
      state,
-      action: PayloadAction<{
-        content: string;
-        imageUrls: string[];
-        timestamp: string;
-      }>,
+      action: PayloadAction<{ content: string; imageUrls: string[] }>,
    ) {
      const message: Message = {
        sender: "user",
        content: action.payload.content,
        imageUrls: action.payload.imageUrls,
-        timestamp: action.payload.timestamp || new Date().toISOString(),
      };
      state.messages.push(message);
    },
@@ -32,7 +27,6 @@ export const chatSlice = createSlice({
        sender: "assistant",
        content: action.payload,
        imageUrls: [],
-        timestamp: new Date().toISOString(),
      };
      state.messages.push(message);
    },
@@ -1,23 +0,0 @@
-import { createSlice, PayloadAction } from "@reduxjs/toolkit";
-import { StatusMessage } from "#/types/Message";
-
-const initialStatusMessage: StatusMessage = {
-  message: "",
-  is_error: false,
-};
-
-export const statusSlice = createSlice({
-  name: "status",
-  initialState: {
-    curStatusMessage: initialStatusMessage,
-  },
-  reducers: {
-    setCurStatusMessage: (state, action: PayloadAction<StatusMessage>) => {
-      state.curStatusMessage = action.payload;
-    },
-  },
-});
-
-export const { setCurStatusMessage } = statusSlice.actions;
-
-export default statusSlice.reducer;
@@ -8,7 +8,6 @@ import errorsReducer from "./state/errorsSlice";
 import taskReducer from "./state/taskSlice";
 import jupyterReducer from "./state/jupyterSlice";
 import securityAnalyzerReducer from "./state/securityAnalyzerSlice";
-import statusReducer from "./state/statusSlice";

 export const rootReducer = combineReducers({
  browser: browserReducer,
@@ -20,7 +19,6 @@ export const rootReducer = combineReducers({
  agent: agentReducer,
  jupyter: jupyterReducer,
  securityAnalyzer: securityAnalyzerReducer,
-  status: statusReducer,
 });

 const store = configureStore({
@@ -10,9 +10,6 @@ export interface ActionMessage {

  // A friendly message that can be put in the chat log
  message: string;
-
-  // The timestamp of the message
-  timestamp: string;
 }

 export interface ObservationMessage {
@@ -27,16 +24,4 @@ export interface ObservationMessage {

  // A friendly message that can be put in the chat log
  message: string;
-
-  // The timestamp of the message
-  timestamp: string;
-}
-
-export interface StatusMessage {
-  // TODO not implemented yet
-  // Whether the status is an error, default is false
-  is_error: boolean;
-
-  // A status message to display to the user
-  message: string;
 }
@@ -1,5 +1,5 @@
-import { ActionMessage, ObservationMessage, StatusMessage } from "./Message";
+import { ActionMessage, ObservationMessage } from "./Message";

-type SocketMessage = ActionMessage | ObservationMessage | StatusMessage;
+type SocketMessage = ActionMessage | ObservationMessage;

 export { type SocketMessage };
@@ -75,11 +75,3 @@ export const getExtension = (code: string) => {
  if (code.includes(".")) return code.split(".").pop() || "";
  return "";
 };
-
-export const formatTimestamp = (timestamp: string) => {
-  const date = new Date(timestamp);
-  return new Intl.DateTimeFormat(undefined, {
-    dateStyle: "short",
-    timeStyle: "short",
-  }).format(date);
-};
@@ -1,28 +0,0 @@
-def get_version():
-    try:
-        from importlib.metadata import PackageNotFoundError, version
-
-        try:
-            return version('openhands-ai')
-        except PackageNotFoundError:
-            pass
-    except ImportError:
-        pass
-
-    try:
-        from pkg_resources import DistributionNotFound, get_distribution
-
-        try:
-            return get_distribution('openhands-ai').version
-        except DistributionNotFound:
-            pass
-    except ImportError:
-        pass
-
-    return 'unknown'
-
-
-try:
-    __version__ = get_version()
-except Exception:
-    __version__ = 'unknown'
@@ -37,7 +37,6 @@ from openhands.events.observation import (
    Observation,
 )
 from openhands.llm.llm import LLM
-from openhands.runtime.utils.shutdown_listener import should_continue

 # note: RESUME is only available on web GUI
 TRAFFIC_CONTROL_REMINDER = (
@@ -54,7 +53,7 @@ class AgentController:
    confirmation_mode: bool
    agent_to_llm_config: dict[str, LLMConfig]
    agent_configs: dict[str, AgentConfig]
-    agent_task: asyncio.Future | None = None
+    agent_task: asyncio.Task | None = None
    parent: 'AgentController | None' = None
    delegate: 'AgentController | None' = None
    _pending_action: Action | None = None
@@ -115,6 +114,9 @@ class AgentController:
        # stuck helper
        self._stuck_detector = StuckDetector(self.state)

+        if not is_delegate:
+            self.agent_task = asyncio.create_task(self._start_step_loop())
+
    async def close(self):
        """Closes the agent controller, canceling any ongoing tasks and unsubscribing from the event stream."""
        if self.agent_task is not None:
@@ -129,10 +131,6 @@ class AgentController:
    async def update_state_after_step(self):
        # update metrics especially for cost
        self.state.local_metrics = self.agent.llm.metrics
-        if 'llm_completions' not in self.state.extra_data:
-            self.state.extra_data['llm_completions'] = []
-        self.state.extra_data['llm_completions'].extend(self.agent.llm.llm_completions)
-        self.agent.llm.llm_completions.clear()

    async def report_error(self, message: str, exception: Exception | None = None):
        """Reports an error to the user and sends the exception to the LLM next step, in the hope it can self-correct.
@@ -146,11 +144,11 @@ class AgentController:
            self.state.last_error += f': {exception}'
        self.event_stream.add_event(ErrorObservation(message), EventSource.AGENT)

-    async def start_step_loop(self):
+    async def _start_step_loop(self):
        """The main loop for the agent's step-by-step execution."""

        logger.info(f'[Agent Controller {self.id}] Starting step loop...')
-        while should_continue():
+        while True:
            try:
                await self._step()
            except asyncio.CancelledError:
@@ -174,83 +172,56 @@ class AgentController:
        Args:
            event (Event): The incoming event to process.
        """
-        if isinstance(event, Action):
-            await self._handle_action(event)
-        elif isinstance(event, Observation):
-            await self._handle_observation(event)
-
-    async def _handle_action(self, action: Action):
-        """Handles actions from the event stream.
-
-        Args:
-            action (Action): The action to handle.
-        """
-        if isinstance(action, ChangeAgentStateAction):
-            await self.set_agent_state_to(action.agent_state)  # type: ignore
-        elif isinstance(action, MessageAction):
-            await self._handle_message_action(action)
-        elif isinstance(action, AgentDelegateAction):
-            await self.start_delegate(action)
-        elif isinstance(action, AddTaskAction):
-            self.state.root_task.add_subtask(
-                action.parent, action.goal, action.subtasks
-            )
-        elif isinstance(action, ModifyTaskAction):
-            self.state.root_task.set_subtask_state(action.task_id, action.state)
-        elif isinstance(action, AgentFinishAction):
-            self.state.outputs = action.outputs
+        if isinstance(event, ChangeAgentStateAction):
+            await self.set_agent_state_to(event.agent_state)  # type: ignore
+        elif isinstance(event, MessageAction):
+            if event.source == EventSource.USER:
+                logger.info(
+                    event,
+                    extra={'msg_type': 'ACTION', 'event_source': EventSource.USER},
+                )
+                if self.get_agent_state() != AgentState.RUNNING:
+                    await self.set_agent_state_to(AgentState.RUNNING)
+            elif event.source == EventSource.AGENT and event.wait_for_response:
+                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+        elif isinstance(event, AgentDelegateAction):
+            await self.start_delegate(event)
+        elif isinstance(event, AddTaskAction):
+            self.state.root_task.add_subtask(event.parent, event.goal, event.subtasks)
+        elif isinstance(event, ModifyTaskAction):
+            self.state.root_task.set_subtask_state(event.task_id, event.state)
+        elif isinstance(event, AgentFinishAction):
+            self.state.outputs = event.outputs
            self.state.metrics.merge(self.state.local_metrics)
            await self.set_agent_state_to(AgentState.FINISHED)
-        elif isinstance(action, AgentRejectAction):
-            self.state.outputs = action.outputs
+        elif isinstance(event, AgentRejectAction):
+            self.state.outputs = event.outputs
            self.state.metrics.merge(self.state.local_metrics)
            await self.set_agent_state_to(AgentState.REJECTED)
-
-    async def _handle_observation(self, observation: Observation):
-        """Handles observation from the event stream.
-
-        Args:
-            observation (observation): The observation to handle.
-        """
-        if (
-            self._pending_action
-            and hasattr(self._pending_action, 'is_confirmed')
-            and self._pending_action.is_confirmed
-            == ActionConfirmationStatus.AWAITING_CONFIRMATION
-        ):
-            return
-
-        logger.info(observation, extra={'msg_type': 'OBSERVATION'})
-        if self._pending_action and self._pending_action.id == observation.cause:
-            self._pending_action = None
-            if self.state.agent_state == AgentState.USER_CONFIRMED:
-                await self.set_agent_state_to(AgentState.RUNNING)
-            if self.state.agent_state == AgentState.USER_REJECTED:
-                await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
-            return
-
-        if isinstance(observation, CmdOutputObservation):
-            return
-        elif isinstance(observation, AgentDelegateObservation):
-            self.state.history.on_event(observation)
-        elif isinstance(observation, ErrorObservation):
-            if self.state.agent_state == AgentState.ERROR:
-                self.state.metrics.merge(self.state.local_metrics)
-
-    async def _handle_message_action(self, action: MessageAction):
-        """Handles message actions from the event stream.
-
-        Args:
-            action (MessageAction): The message action to handle.
-        """
-        if action.source == EventSource.USER:
-            logger.info(
-                action, extra={'msg_type': 'ACTION', 'event_source': EventSource.USER}
-            )
-            if self.get_agent_state() != AgentState.RUNNING:
-                await self.set_agent_state_to(AgentState.RUNNING)
-        elif action.source == EventSource.AGENT and action.wait_for_response:
-            await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+        elif isinstance(event, Observation):
+            if (
+                self._pending_action
+                and hasattr(self._pending_action, 'is_confirmed')
+                and self._pending_action.is_confirmed
+                == ActionConfirmationStatus.AWAITING_CONFIRMATION
+            ):
+                return
+            if self._pending_action and self._pending_action.id == event.cause:
+                self._pending_action = None
+                if self.state.agent_state == AgentState.USER_CONFIRMED:
+                    await self.set_agent_state_to(AgentState.RUNNING)
+                if self.state.agent_state == AgentState.USER_REJECTED:
+                    await self.set_agent_state_to(AgentState.AWAITING_USER_INPUT)
+                logger.info(event, extra={'msg_type': 'OBSERVATION'})
+            elif isinstance(event, CmdOutputObservation):
+                logger.info(event, extra={'msg_type': 'OBSERVATION'})
+            elif isinstance(event, AgentDelegateObservation):
+                self.state.history.on_event(event)
+                logger.info(event, extra={'msg_type': 'OBSERVATION'})
+            elif isinstance(event, ErrorObservation):
+                logger.info(event, extra={'msg_type': 'OBSERVATION'})
+                if self.state.agent_state == AgentState.ERROR:
+                    self.state.metrics.merge(self.state.local_metrics)

    def reset_task(self):
        """Resets the agent's task."""
@@ -271,11 +242,9 @@ class AgentController:
        if new_state == self.state.agent_state:
            return

-        if new_state == AgentState.STOPPED or new_state == AgentState.ERROR:
-            self.reset_task()
-        elif (
-            new_state == AgentState.RUNNING
-            and self.state.agent_state == AgentState.PAUSED
+        if (
+            self.state.agent_state == AgentState.PAUSED
+            and new_state == AgentState.RUNNING
            and self.state.traffic_control_state == TrafficControlState.THROTTLING
        ):
            # user intends to interrupt traffic control and let the task resume temporarily
@@ -288,7 +257,6 @@ class AgentController:
            ):
                if self.state.iteration >= self.state.max_iterations:
                    self.state.max_iterations += self._initial_max_iterations
-
            if (
                self.state.metrics.accumulated_cost is not None
                and self.max_budget_per_task is not None
@@ -296,7 +264,12 @@ class AgentController:
            ):
                if self.state.metrics.accumulated_cost >= self.max_budget_per_task:
                    self.max_budget_per_task += self._initial_max_budget_per_task
-        elif self._pending_action is not None and (
+
+        self.state.agent_state = new_state
+        if new_state == AgentState.STOPPED or new_state == AgentState.ERROR:
+            self.reset_task()
+
+        if self._pending_action is not None and (
            new_state == AgentState.USER_CONFIRMED
            or new_state == AgentState.USER_REJECTED
        ):
@@ -308,7 +281,6 @@ class AgentController:
                self._pending_action.is_confirmed = ActionConfirmationStatus.REJECTED  # type: ignore[attr-defined]
            self.event_stream.add_event(self._pending_action, EventSource.AGENT)

-        self.state.agent_state = new_state
        self.event_stream.add_event(
            AgentStateChangedObservation('', self.state.agent_state), EventSource.AGENT
        )
@@ -383,11 +355,56 @@ class AgentController:
            return

        if self.delegate is not None:
+            logger.debug(f'[Agent Controller {self.id}] Delegate not none, awaiting...')
            assert self.delegate != self
-            if self.delegate.get_agent_state() == AgentState.PAUSED:
-                await asyncio.sleep(1)
-            else:
-                await self._delegate_step()
+            await self.delegate._step()
+            logger.debug(f'[Agent Controller {self.id}] Delegate step done')
+            assert self.delegate is not None
+            delegate_state = self.delegate.get_agent_state()
+            logger.debug(
+                f'[Agent Controller {self.id}] Delegate state: {delegate_state}'
+            )
+            if delegate_state == AgentState.ERROR:
+                # update iteration that shall be shared across agents
+                self.state.iteration = self.delegate.state.iteration
+
+                # close the delegate upon error
+                await self.delegate.close()
+                self.delegate = None
+                self.delegateAction = None
+
+                await self.report_error('Delegator agent encounters an error')
+                return
+            delegate_done = delegate_state in (AgentState.FINISHED, AgentState.REJECTED)
+            if delegate_done:
+                logger.info(
+                    f'[Agent Controller {self.id}] Delegate agent has finished execution'
+                )
+                # retrieve delegate result
+                outputs = self.delegate.state.outputs if self.delegate.state else {}
+
+                # update iteration that shall be shared across agents
+                self.state.iteration = self.delegate.state.iteration
+
+                # close delegate controller: we must close the delegate controller before adding new events
+                await self.delegate.close()
+
+                # update delegate result observation
+                # TODO: replace this with AI-generated summary (#2395)
+                formatted_output = ', '.join(
+                    f'{key}: {value}' for key, value in outputs.items()
+                )
+                content = (
+                    f'{self.delegate.agent.name} finishes task with {formatted_output}'
+                )
+                obs: Observation = AgentDelegateObservation(
+                    outputs=outputs, content=content
+                )
+
+                # clean up delegate status
+                self.delegate = None
+                self.delegateAction = None
+                self.event_stream.add_event(obs, EventSource.AGENT)
            return

        logger.info(
@@ -395,20 +412,50 @@ class AgentController:
            extra={'msg_type': 'STEP'},
        )

-        # check if agent hit the resources limit
-        stop_step = False
        if self.state.iteration >= self.state.max_iterations:
-            stop_step = await self._handle_traffic_control(
-                'iteration', self.state.iteration, self.state.max_iterations
-            )
-        if self.max_budget_per_task is not None:
+            if self.state.traffic_control_state == TrafficControlState.PAUSED:
+                logger.info(
+                    'Hitting traffic control, temporarily resume upon user request'
+                )
+                self.state.traffic_control_state = TrafficControlState.NORMAL
+            else:
+                self.state.traffic_control_state = TrafficControlState.THROTTLING
+                if self.headless_mode:
+                    # set to ERROR state if running in headless mode
+                    # since user cannot resume on the web interface
+                    await self.report_error(
+                        'Agent reached maximum number of iterations in headless mode, task stopped.'
+                    )
+                    await self.set_agent_state_to(AgentState.ERROR)
+                else:
+                    await self.report_error(
+                        f'Agent reached maximum number of iterations, task paused. {TRAFFIC_CONTROL_REMINDER}'
+                    )
+                    await self.set_agent_state_to(AgentState.PAUSED)
+                return
+        elif self.max_budget_per_task is not None:
            current_cost = self.state.metrics.accumulated_cost
            if current_cost > self.max_budget_per_task:
-                stop_step = await self._handle_traffic_control(
-                    'budget', current_cost, self.max_budget_per_task
-                )
-        if stop_step:
-            return
+                if self.state.traffic_control_state == TrafficControlState.PAUSED:
+                    logger.info(
+                        'Hitting traffic control, temporarily resume upon user request'
+                    )
+                    self.state.traffic_control_state = TrafficControlState.NORMAL
+                else:
+                    self.state.traffic_control_state = TrafficControlState.THROTTLING
+                    if self.headless_mode:
+                        # set to ERROR state if running in headless mode
+                        # there is no way to resume
+                        await self.report_error(
+                            f'Task budget exceeded. Current cost: {current_cost:.2f}, max budget: {self.max_budget_per_task:.2f}, task stopped.'
+                        )
+                        await self.set_agent_state_to(AgentState.ERROR)
+                    else:
+                        await self.report_error(
+                            f'Task budget exceeded. Current cost: {current_cost:.2f}, Max budget: {self.max_budget_per_task:.2f}, task paused. {TRAFFIC_CONTROL_REMINDER}'
+                        )
+                        await self.set_agent_state_to(AgentState.PAUSED)
+                    return

        self.update_state_before_step()
        action: Action = NullAction()
@@ -445,89 +492,6 @@ class AgentController:
            await self.report_error('Agent got stuck in a loop')
            await self.set_agent_state_to(AgentState.ERROR)

-    async def _delegate_step(self):
-        """Executes a single step of the delegate agent."""
-        logger.debug(f'[Agent Controller {self.id}] Delegate not none, awaiting...')
-        await self.delegate._step()  # type: ignore[union-attr]
-        logger.debug(f'[Agent Controller {self.id}] Delegate step done')
-        assert self.delegate is not None
-        delegate_state = self.delegate.get_agent_state()
-        logger.debug(f'[Agent Controller {self.id}] Delegate state: {delegate_state}')
-        if delegate_state == AgentState.ERROR:
-            # update iteration that shall be shared across agents
-            self.state.iteration = self.delegate.state.iteration
-
-            # close the delegate upon error
-            await self.delegate.close()
-            self.delegate = None
-            self.delegateAction = None
-
-            await self.report_error('Delegator agent encountered an error')
-        elif delegate_state in (AgentState.FINISHED, AgentState.REJECTED):
-            logger.info(
-                f'[Agent Controller {self.id}] Delegate agent has finished execution'
-            )
-            # retrieve delegate result
-            outputs = self.delegate.state.outputs if self.delegate.state else {}
-
-            # update iteration that shall be shared across agents
-            self.state.iteration = self.delegate.state.iteration
-
-            # close delegate controller: we must close the delegate controller before adding new events
-            await self.delegate.close()
-
-            # update delegate result observation
-            # TODO: replace this with AI-generated summary (#2395)
-            formatted_output = ', '.join(
-                f'{key}: {value}' for key, value in outputs.items()
-            )
-            content = (
-                f'{self.delegate.agent.name} finishes task with {formatted_output}'
-            )
-            obs: Observation = AgentDelegateObservation(
-                outputs=outputs, content=content
-            )
-
-            # clean up delegate status
-            self.delegate = None
-            self.delegateAction = None
-            self.event_stream.add_event(obs, EventSource.AGENT)
-        return
-
-    async def _handle_traffic_control(
-        self, limit_type: str, current_value: float, max_value: float
-    ):
-        """Handles agent state after hitting the traffic control limit.
-
-        Args:
-            limit_type (str): The type of limit that was hit.
-            current_value (float): The current value of the limit.
-            max_value (float): The maximum value of the limit.
-        """
-        stop_step = False
-        if self.state.traffic_control_state == TrafficControlState.PAUSED:
-            logger.info('Hitting traffic control, temporarily resume upon user request')
-            self.state.traffic_control_state = TrafficControlState.NORMAL
-        else:
-            self.state.traffic_control_state = TrafficControlState.THROTTLING
-            if self.headless_mode:
-                # set to ERROR state if running in headless mode
-                # since user cannot resume on the web interface
-                await self.report_error(
-                    f'Agent reached maximum {limit_type} in headless mode, task stopped. '
-                    f'Current {limit_type}: {current_value:.2f}, max {limit_type}: {max_value:.2f}'
-                )
-                await self.set_agent_state_to(AgentState.ERROR)
-            else:
-                await self.report_error(
-                    f'Agent reached maximum {limit_type}, task paused. '
-                    f'Current {limit_type}: {current_value:.2f}, max {limit_type}: {max_value:.2f}. '
-                    f'{TRAFFIC_CONTROL_REMINDER}'
-                )
-                await self.set_agent_state_to(AgentState.PAUSED)
-            stop_step = True
-        return stop_step
-
    def get_state(self):
        """Returns the current running state object.

@@ -1,4 +1,3 @@
-import argparse
 import asyncio
 import logging
 from typing import Type
@@ -6,7 +5,6 @@ from typing import Type
 from termcolor import colored

 import agenthub  # noqa F401 (we import this to get the agents registered)
-from openhands import __version__
 from openhands.controller import AgentController
 from openhands.controller.agent import Agent
 from openhands.core.config import (
@@ -63,33 +61,8 @@ def display_event(event: Event):
        display_command_output(event.content)


-def get_parser() -> argparse.ArgumentParser:
-    """Get the parser for the command line arguments."""
-    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
-
-    # Add the version argument
-    parser.add_argument(
-        '-v',
-        '--version',
-        action='version',
-        version=f'{__version__}',
-        help='Show the version number and exit',
-        default=None,
-    )
-
-    return parser
-
-
 async def main():
    """Runs the agent in CLI mode"""
-
-    parser = get_parser()
-    args = parser.parse_args()
-
-    if args.version:
-        print(f'OpenHands version: {__version__}')
-        return
-
    logger.setLevel(logging.WARNING)
    config = load_app_config()
    sid = 'cli'
@@ -121,9 +94,6 @@ async def main():
        event_stream=event_stream,
    )

-    if controller is not None:
-        controller.agent_task = asyncio.create_task(controller.start_step_loop())
-
    async def prompt_for_next_task():
        next_message = input('How can I help? >> ')
        if next_message == 'exit':
@@ -0,0 +1,781 @@
+import argparse
+import os
+import pathlib
+import platform
+import uuid
+from dataclasses import dataclass, field, fields, is_dataclass
+from enum import Enum
+from types import UnionType
+from typing import Any, ClassVar, MutableMapping, get_args, get_origin
+
+import toml
+from dotenv import load_dotenv
+
+from openhands.core import logger
+
+load_dotenv()
+
+
+LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
+_DEFAULT_AGENT = 'CodeActAgent'
+_MAX_ITERATIONS = 100
+
+
+@dataclass
+class LLMConfig:
+    """Configuration for the LLM model.
+
+    Attributes:
+        model: The model to use.
+        api_key: The API key to use.
+        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
+        api_version: The version of the API.
+        embedding_model: The embedding model to use.
+        embedding_base_url: The base URL for the embedding API.
+        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
+        aws_access_key_id: The AWS access key ID.
+        aws_secret_access_key: The AWS secret access key.
+        aws_region_name: The AWS region name.
+        num_retries: The number of retries to attempt.
+        retry_multiplier: The multiplier for the exponential backoff.
+        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
+        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
+        timeout: The timeout for the API.
+        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
+        temperature: The temperature for the API.
+        top_p: The top p for the API.
+        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
+        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
+        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
+        input_cost_per_token: The cost per input token. This will be available in logs for the user to check.
+        output_cost_per_token: The cost per output token. This will be available in logs for the user to check.
+        ollama_base_url: The base URL for the OLLAMA API.
+        drop_params: Drop any unmapped (unsupported) params without causing an exception.
+        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
+        caching_prompt: Using the prompt caching feature provided by the LLM.
+    """
+
+    model: str = 'gpt-4o'
+    api_key: str | None = None
+    base_url: str | None = None
+    api_version: str | None = None
+    embedding_model: str = 'local'
+    embedding_base_url: str | None = None
+    embedding_deployment_name: str | None = None
+    aws_access_key_id: str | None = None
+    aws_secret_access_key: str | None = None
+    aws_region_name: str | None = None
+    num_retries: int = 8
+    retry_multiplier: float = 2
+    retry_min_wait: int = 15
+    retry_max_wait: int = 120
+    timeout: int | None = None
+    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
+    temperature: float = 0
+    top_p: float = 0.5
+    custom_llm_provider: str | None = None
+    max_input_tokens: int | None = None
+    max_output_tokens: int | None = None
+    input_cost_per_token: float | None = None
+    output_cost_per_token: float | None = None
+    ollama_base_url: str | None = None
+    drop_params: bool | None = None
+    disable_vision: bool | None = None
+    caching_prompt: bool = False
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            if attr_name in LLM_SENSITIVE_FIELDS:
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"LLMConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+    def to_safe_dict(self):
+        """Return a dict with the sensitive fields replaced with ******."""
+        ret = self.__dict__.copy()
+        for k, v in ret.items():
+            if k in LLM_SENSITIVE_FIELDS:
+                ret[k] = '******' if v else None
+        return ret
+
+    def set_missing_attributes(self):
+        """Set any missing attributes to their default values."""
+        for field_name, field_obj in self.__dataclass_fields__.items():
+            if not hasattr(self, field_name):
+                setattr(self, field_name, field_obj.default)
+
+
+@dataclass
+class AgentConfig:
+    """Configuration for the agent.
+
+    Attributes:
+        micro_agent_name: The name of the micro agent to use for this agent.
+        memory_enabled: Whether long-term memory (embeddings) is enabled.
+        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
+        llm_config: The name of the llm config to use. If specified, this will override global llm config.
+    """
+
+    micro_agent_name: str | None = None
+    memory_enabled: bool = False
+    memory_max_threads: int = 2
+    llm_config: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            result[f.name] = get_field_info(f)
+        return result
+
+
+@dataclass
+class SecurityConfig:
+    """Configuration for security related functionalities.
+
+    Attributes:
+        confirmation_mode: Whether to enable confirmation mode.
+        security_analyzer: The security analyzer to use.
+    """
+
+    confirmation_mode: bool = False
+    security_analyzer: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Build the frontend schema: one get_field_info() entry (type, optional, default) per field."""
+        return {
+            f.name: get_field_info(f)
+            for f in fields(self)
+        }
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SecurityConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+@dataclass
+class SandboxConfig:
+    """Configuration for the sandbox.
+
+    Attributes:
+        api_hostname: The hostname for the EventStream Runtime API.
+        base_container_image: The base container image from which to build the runtime image.
+        runtime_container_image: The runtime container image to use.
+        user_id: The user ID for the sandbox.
+        timeout: The timeout for the sandbox.
+        enable_auto_lint: Whether to enable auto-lint.
+        use_host_network: Whether to use the host network.
+        initialize_plugins: Whether to initialize plugins.
+        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
+            This will be rendered into the end of the Dockerfile that builds the runtime image.
+            It can contain any valid shell commands (e.g., pip install numpy).
+            The path to the interpreter is available as $OD_INTERPRETER_PATH,
+            which can be used to install dependencies for the OD-specific Python interpreter.
+        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
+            This is a dictionary of key-value pairs.
+            This is useful for setting environment variables that are needed by the runtime.
+            For example, for specifying the base url of website for browsergym evaluation.
+        browsergym_eval_env: The BrowserGym environment to use for evaluation.
+            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
+    """
+
+    api_hostname: str = 'localhost'
+    api_key: str | None = None
+    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
+    runtime_container_image: str | None = None
+    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
+    timeout: int = 120
+    enable_auto_lint: bool = (
+        False  # once enabled, OpenHands would lint files after editing
+    )
+    use_host_network: bool = False
+    initialize_plugins: bool = True
+    runtime_extra_deps: str | None = None
+    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
+    browsergym_eval_env: str | None = None
+
+    def defaults_to_dict(self) -> dict:
+        """Build the frontend schema: one get_field_info() entry (type, optional, default) per field."""
+        return {
+            f.name: get_field_info(f)
+            for f in fields(self)
+        }
+
+    def __str__(self):
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        return f"SandboxConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UndefinedString(str, Enum):
+    UNDEFINED = 'UNDEFINED'
+
+
+@dataclass
+class AppConfig:
+    """Configuration for the app.
+
+    Attributes:
+        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
+        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
+        default_agent: The name of the default agent to use.
+        sandbox: The sandbox configuration.
+        runtime: The runtime environment.
+        file_store: The file store to use.
+        file_store_path: The path to the file store.
+        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
+        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
+        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
+        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
+        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
+        run_as_openhands: Whether to run as openhands.
+        max_iterations: The maximum number of iterations.
+        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
+        e2b_api_key: The E2B API key.
+        disable_color: Whether to disable color. For terminals that don't support color.
+        debug: Whether to enable debugging.
+        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
+        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
+        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
+        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
+    """
+
+    llms: dict[str, LLMConfig] = field(default_factory=dict)
+    agents: dict = field(default_factory=dict)
+    default_agent: str = _DEFAULT_AGENT
+    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
+    security: SecurityConfig = field(default_factory=SecurityConfig)
+    runtime: str = 'eventstream'
+    file_store: str = 'memory'
+    file_store_path: str = '/tmp/file_store'
+    # TODO: clean up workspace path after the removal of ServerRuntime
+    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
+    workspace_mount_path: str | None = (
+        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
+    )  # when set to None, do not mount the workspace
+    workspace_mount_path_in_sandbox: str = '/workspace'
+    workspace_mount_rewrite: str | None = None
+    cache_dir: str = '/tmp/cache'
+    run_as_openhands: bool = True
+    max_iterations: int = _MAX_ITERATIONS
+    max_budget_per_task: float | None = None
+    e2b_api_key: str = ''
+    disable_color: bool = False
+    jwt_secret: str = uuid.uuid4().hex
+    debug: bool = False
+    enable_cli_session: bool = False
+    file_uploads_max_file_size_mb: int = 0
+    file_uploads_restrict_file_types: bool = False
+    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
+
+    defaults_dict: ClassVar[dict] = {}
+
+    def get_llm_config(self, name='llm') -> LLMConfig:
+        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
+        if name in self.llms:
+            return self.llms[name]
+        if name is not None and name != 'llm':
+            logger.openhands_logger.warning(
+                f'llm config group {name} not found, using default config'
+            )
+        if 'llm' not in self.llms:
+            self.llms['llm'] = LLMConfig()
+        return self.llms['llm']
+
+    def set_llm_config(self, value: LLMConfig, name='llm'):
+        self.llms[name] = value
+
+    def get_agent_config(self, name='agent') -> AgentConfig:
+        """Agent is the name for default config (for backward compatibility prior to 0.8)"""
+        if name in self.agents:
+            return self.agents[name]
+        if 'agent' not in self.agents:
+            self.agents['agent'] = AgentConfig()
+        return self.agents['agent']
+
+    def set_agent_config(self, value: AgentConfig, name='agent'):
+        self.agents[name] = value
+
+    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
+        """Get a map of agent names to llm configs."""
+        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
+
+    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
+        agent_config: AgentConfig = self.get_agent_config(name)
+        llm_config_name = agent_config.llm_config
+        return self.get_llm_config(llm_config_name)
+
+    def get_agent_configs(self) -> dict[str, AgentConfig]:
+        return self.agents
+
+    def __post_init__(self):
+        """Post-initialization hook, called when the instance is created with only default values."""
+        AppConfig.defaults_dict = self.defaults_to_dict()
+
+    def defaults_to_dict(self) -> dict:
+        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
+        result = {}
+        for f in fields(self):
+            field_value = getattr(self, f.name)
+
+            # dataclasses compute their defaults themselves
+            if is_dataclass(type(field_value)):
+                result[f.name] = field_value.defaults_to_dict()
+            else:
+                result[f.name] = get_field_info(f)
+        return result
+
+    def __str__(self):
+        """Return a one-line summary of all fields with secret values masked."""
+        masked = ('e2b_api_key', 'github_token', 'jwt_secret')
+        attr_str = []
+        for f in fields(self):
+            attr_name = f.name
+            attr_value = getattr(self, f.name)
+
+            if attr_name in masked:
+                # keep None/empty visible so "unset" is distinguishable from "set"
+                attr_value = '******' if attr_value else None
+
+            attr_str.append(f'{attr_name}={repr(attr_value)}')
+
+        # close the parenthesis opened after the class name
+        return f"AppConfig({', '.join(attr_str)})"
+
+    def __repr__(self):
+        return self.__str__()
+
+
+def get_field_info(f):
+    """Extract information about a dataclass field: type, optional, and default.
+
+    Args:
+        f: The field to extract information from.
+
+    Returns: A dict with the field's type, whether it's optional, and its default value.
+    """
+    from dataclasses import MISSING  # local import: sentinel for "no plain default"
+
+    field_type = f.type
+    optional = False
+
+    # for types like str | None, report the non-None type and mark the field optional,
+    # so the frontend can show the right type (only `X | None` style unions are handled)
+    if get_origin(field_type) is UnionType:
+        types = get_args(field_type)
+        non_none_arg = next((t for t in types if t is not type(None)), None)
+        if non_none_arg is not None:
+            field_type = non_none_arg
+            optional = True
+
+    # type name in a pretty format
+    type_name = getattr(field_type, '__name__', str(field_type))
+
+    # fields declared with default_factory have f.default == MISSING; build the real default
+    default = f.default
+    if default is MISSING and f.default_factory is not MISSING:
+        default = f.default_factory()
+
+    # return a schema with the useful info for frontend
+    return {'type': type_name.lower(), 'optional': optional, 'default': default}
+
+
+def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
+    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
+    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
+
+    Args:
+        cfg: The AppConfig object to set attributes on.
+        env_or_toml_dict: The environment variables or a config.toml dict.
+    """
+
+    def get_optional_type(union_type: UnionType) -> Any:
+        """Returns the non-None type from a Union."""
+        types = get_args(union_type)
+        return next((t for t in types if t is not type(None)), None)
+
+    # helper function to set attributes based on env vars
+    def set_attr_from_env(sub_config: Any, prefix=''):
+        """Set attributes of a config dataclass based on environment variables."""
+        for field_name, field_type in sub_config.__annotations__.items():
+            # compute the expected env var name from the prefix and field name
+            # e.g. LLM_BASE_URL
+            env_var_name = (prefix + field_name).upper()
+
+            if is_dataclass(field_type):
+                # nested dataclass
+                nested_sub_config = getattr(sub_config, field_name)
+                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
+            elif env_var_name in env_or_toml_dict:
+                # convert the env var to the correct type and set it
+                value = env_or_toml_dict[env_var_name]
+
+                # skip only unset/empty values (fall back to default); False and 0 are real values
+                if value is None or value == '':
+                    continue
+
+                try:
+                    # if it's an optional type, get the non-None type
+                    if get_origin(field_type) is UnionType:
+                        field_type = get_optional_type(field_type)
+
+                    # Attempt to cast the env var to type hinted in the dataclass
+                    if field_type is bool:
+                        cast_value = str(value).lower() in ['true', '1']
+                    else:
+                        cast_value = field_type(value)
+                    setattr(sub_config, field_name, cast_value)
+                except (ValueError, TypeError):
+                    logger.openhands_logger.error(
+                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
+                    )
+
+    # Start processing from the root of the config object
+    set_attr_from_env(cfg)
+
+    # load default LLM config from env
+    default_llm_config = cfg.get_llm_config()
+    set_attr_from_env(default_llm_config, 'LLM_')
+    # load default agent config from env
+    default_agent_config = cfg.get_agent_config()
+    set_attr_from_env(default_agent_config, 'AGENT_')
+
+
+def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
+    """Load the config from the toml file. Supports both styles of config vars.
+
+    Args:
+        cfg: The AppConfig object to update attributes of.
+        toml_file: The path to the toml file. Defaults to 'config.toml'.
+    """
+    # try to read the config.toml file into the config object
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError:
+        return
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+        return
+
+    # if there was an exception or core is not in the toml, try to use the old-style toml
+    if 'core' not in toml_config:
+        # re-use the env loader to set the config from env-style vars
+        load_from_env(cfg, toml_config)
+        return
+
+    core_config = toml_config['core']
+
+    # load llm configs and agent configs
+    for key, value in toml_config.items():
+        if isinstance(value, dict):
+            try:
+                if key is not None and key.lower() == 'agent':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default agent config from config toml'
+                    )
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    agent_config = AgentConfig(**non_dict_fields)
+                    cfg.set_agent_config(agent_config, 'agent')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as agent config'
+                            )
+                            agent_config = AgentConfig(**nested_value)
+                            cfg.set_agent_config(agent_config, nested_key)
+                elif key is not None and key.lower() == 'llm':
+                    logger.openhands_logger.debug(
+                        'Attempt to load default LLM config from config toml'
+                    )
+                    non_dict_fields = {
+                        k: v for k, v in value.items() if not isinstance(v, dict)
+                    }
+                    llm_config = LLMConfig(**non_dict_fields)
+                    cfg.set_llm_config(llm_config, 'llm')
+                    for nested_key, nested_value in value.items():
+                        if isinstance(nested_value, dict):
+                            logger.openhands_logger.debug(
+                                f'Attempt to load group {nested_key} from config toml as llm config'
+                            )
+                            llm_config = LLMConfig(**nested_value)
+                            cfg.set_llm_config(llm_config, nested_key)
+                elif not key.startswith('sandbox') and key.lower() != 'core':
+                    logger.openhands_logger.warning(
+                        f'Unknown key in {toml_file}: "{key}"'
+                    )
+            except (TypeError, KeyError) as e:
+                logger.openhands_logger.warning(
+                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
+                    exc_info=False,
+                )
+        else:
+            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}"')
+
+    try:
+        # set sandbox config from the toml file
+        sandbox_config = cfg.sandbox
+
+        # migrate old sandbox configs from [core] section to sandbox config
+        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
+        for key in keys_to_migrate:
+            new_key = key.removeprefix('sandbox_')
+            if new_key in sandbox_config.__annotations__:
+                # read the key in sandbox and remove it from core
+                setattr(sandbox_config, new_key, core_config.pop(key))
+            else:
+                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
+
+        # the new style values override the old style values
+        if 'sandbox' in toml_config:
+            sandbox_config = SandboxConfig(**toml_config['sandbox'])
+
+        # update the config object with the new values
+        cfg.sandbox = sandbox_config
+        for key, value in core_config.items():
+            if hasattr(cfg, key):
+                setattr(cfg, key, value)
+            else:
+                logger.openhands_logger.warning(f'Unknown core config key: {key}')
+    except (TypeError, KeyError) as e:
+        logger.openhands_logger.warning(
+            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
+            exc_info=False,
+        )
+
+
+def finalize_config(cfg: AppConfig):
+    """More tweaks to the config after it's been loaded."""
+    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
+    # Set workspace_mount_path if not set by the user
+    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
+        cfg.workspace_mount_path = cfg.workspace_base
+
+    if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
+        # TODO why do we need to check if workspace_mount_path is None?
+        base = cfg.workspace_base or os.getcwd()
+        parts = cfg.workspace_mount_rewrite.split(':')
+        cfg.workspace_mount_path = base.replace(parts[0], parts[1])
+
+    for llm in cfg.llms.values():
+        if llm.embedding_base_url is None:
+            llm.embedding_base_url = llm.base_url
+
+    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
+        logger.openhands_logger.warning(
+            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
+            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
+        )
+
+    # make sure cache dir exists
+    if cfg.cache_dir:
+        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
+
+
+# Utility function for command line --group argument
+def get_llm_config_arg(
+    llm_config_arg: str, toml_file: str = 'config.toml'
+) -> LLMConfig | None:
+    """Get a group of llm settings from the config file.
+
+    A group in config.toml can look like this:
+
+    ```
+    [llm.gpt-3.5-for-eval]
+    model = 'gpt-3.5-turbo'
+    api_key = '...'
+    temperature = 0.5
+    num_retries = 8
+    ...
+    ```
+
+    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. The function will load the LLMConfig object
+    with the settings of this group, from the config file, and set it as the LLMConfig object for the app.
+
+    Note that the group must be under "llm" group, or in other words, the group name must start with "llm.".
+
+    Args:
+        llm_config_arg: The group of llm settings to get from the config.toml file.
+
+    Returns:
+        LLMConfig: The LLMConfig object with the settings from the config file.
+    """
+    # keep only the name, just in case
+    llm_config_arg = llm_config_arg.strip('[]')
+
+    # truncate the prefix, just in case
+    if llm_config_arg.startswith('llm.'):
+        llm_config_arg = llm_config_arg[4:]
+
+    logger.openhands_logger.info(f'Loading llm config from {llm_config_arg}')
+
+    # load the toml file
+    try:
+        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
+            toml_config = toml.load(toml_contents)
+    except FileNotFoundError as e:
+        logger.openhands_logger.error(f'Config file not found: {e}')
+        return None
+    except toml.TomlDecodeError as e:
+        logger.openhands_logger.error(
+            f'Cannot parse llm group from {llm_config_arg}. Exception: {e}'
+        )
+        return None
+
+    # update the llm config with the specified section
+    if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
+        return LLMConfig(**toml_config['llm'][llm_config_arg])
+    logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
+    return None
+
+
+# Command line arguments
+def get_parser() -> argparse.ArgumentParser:
+    """Get the parser for the command line arguments."""
+    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
+    parser.add_argument(
+        '-d',
+        '--directory',
+        type=str,
+        help='The working directory for the agent',
+    )
+    parser.add_argument(
+        '-t',
+        '--task',
+        type=str,
+        default='',
+        help='The task for the agent to perform',
+    )
+    parser.add_argument(
+        '-f',
+        '--file',
+        type=str,
+        help='Path to a file containing the task. Overrides -t if both are provided.',
+    )
+    parser.add_argument(
+        '-c',
+        '--agent-cls',
+        default=_DEFAULT_AGENT,
+        type=str,
+        help='Name of the default agent to use',
+    )
+    parser.add_argument(
+        '-i',
+        '--max-iterations',
+        default=_MAX_ITERATIONS,
+        type=int,
+        help='The maximum number of iterations to run the agent',
+    )
+    parser.add_argument(
+        '-b',
+        '--max-budget-per-task',
+        type=float,
+        help='The maximum budget allowed per task, beyond which the agent will stop.',
+    )
+    # --eval configs are for evaluations only
+    parser.add_argument(
+        '--eval-output-dir',
+        default='evaluation/evaluation_outputs/outputs',
+        type=str,
+        help='The directory to save evaluation output',
+    )
+    parser.add_argument(
+        '--eval-n-limit',
+        default=None,
+        type=int,
+        help='The number of instances to evaluate',
+    )
+    parser.add_argument(
+        '--eval-num-workers',
+        default=4,
+        type=int,
+        help='The number of workers to use for evaluation',
+    )
+    parser.add_argument(
+        '--eval-note',
+        default=None,
+        type=str,
+        help='The note to add to the evaluation directory',
+    )
+    parser.add_argument(
+        '-l',
+        '--llm-config',
+        default=None,
+        type=str,
+        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
+    )
+    parser.add_argument(
+        '-n',
+        '--name',
+        default='default',
+        type=str,
+        help='Name for the session',
+    )
+    parser.add_argument(
+        '--eval-ids',
+        default=None,
+        type=str,
+        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+    )
+    return parser
+
+
+def parse_arguments() -> argparse.Namespace:
+    """Parse the command line arguments."""
+    parser = get_parser()
+    parsed_args, _ = parser.parse_known_args()
+    return parsed_args
+
+
+def load_app_config(set_logging_levels: bool = True) -> AppConfig:
+    """Load the configuration from config.toml, then from environment variables (which take precedence).
+
+    Args:
+        set_logging_levels: Whether to set the global variables for logging levels.
+    """
+    config = AppConfig()
+    load_from_toml(config)
+    load_from_env(config, os.environ)
+    finalize_config(config)
+    if set_logging_levels:
+        logger.DEBUG = config.debug
+        logger.DISABLE_COLOR_PRINTING = config.disable_color
+    return config
@@ -1,104 +0,0 @@
-# Configuration Management in OpenHands
-
-## Overview
-
-OpenHands uses a flexible configuration system that allows settings to be defined through environment variables, TOML files, and command-line arguments. The configuration is managed through a package structure in `openhands/core/config/`.
-
-## Configuration Classes
-
-The main configuration classes are:
-
- `AppConfig`: The root configuration class
- `LLMConfig`: Configuration for the Language Model
- `AgentConfig`: Configuration for the agent
- `SandboxConfig`: Configuration for the sandbox environment
- `SecurityConfig`: Configuration for security settings
-
-These classes are defined as dataclasses, with class attributes holding default values for all fields.
-
-## Loading Configuration from Environment Variables
-
-The `load_from_env` function in the config package is responsible for loading configuration values from environment variables. It recursively processes the configuration classes, mapping environment variable names to class attributes.
-
-### Naming Convention for Environment Variables
-
- Prefix: uppercase name of the configuration class followed by an underscore (e.g., `LLM_`, `AGENT_`)
- Field Names: all uppercase
- Full Variable Name: Prefix + Field Name (e.g., `LLM_API_KEY`, `AGENT_MEMORY_ENABLED`)
-
-### Examples
-
-```bash
-export LLM_API_KEY='your_api_key_here'
-export LLM_MODEL='gpt-4'
-export AGENT_MEMORY_ENABLED='true'
-export SANDBOX_TIMEOUT='300'
-```
-
-## Type Handling
-
-The `load_from_env` function attempts to cast environment variable values to the types specified in the dataclasses. It handles:
-
- Basic types (str, int, bool)
- Optional types (e.g., `str | None`)
- Nested dataclasses
-
-If type casting fails, an error is logged, and the default value is retained.
-
-## Default Values
-
-If an environment variable is not set, the default value specified in the dataclass is used.
-
-## Nested Configurations
-
-The `AppConfig` class contains nested configurations like `LLMConfig` and `AgentConfig`. The `load_from_env` function handles these by recursively processing nested dataclasses with updated prefixes.
-
-## Security Considerations
-
-Be cautious when setting sensitive information like API keys in environment variables. Ensure your environment is secure.
-
-## Usage
-
-The `load_app_config()` function is the recommended way to initialize your configuration. It performs the following steps:
-
-1. Creates an instance of `AppConfig`
-2. Loads settings from the `config.toml` file (if present)
-3. Loads settings from environment variables, overriding TOML settings if applicable
-4. Applies final tweaks and validations to the configuration, falling back to the default values specified in the code
-5. Optionally sets global logging levels based on the configuration
-
-There are also command line args, which may work to override other sources.
-
-Here's an example of how to use `load_app_config()`:
-
-````python
-from openhands.core.config import load_app_config
-
-# Load all configuration settings
-config = load_app_config()
-
-# Now you can access your configuration
-llm_config = config.get_llm_config()
-agent_config = config.get_agent_config()
-sandbox_config = config.sandbox
-
-# Use the configuration in your application
-print(f"Using LLM model: {llm_config.model}")
-print(f"Agent memory enabled: {agent_config.memory_enabled}")
-print(f"Sandbox timeout: {sandbox_config.timeout}")
-````
-
-By using `load_app_config()`, you ensure that all configuration sources are properly loaded and processed, providing a consistent and fully initialized configuration for your application.
-
-## Additional Configuration Methods
-
-While this document focuses on environment variable configuration, OpenHands also supports:
-
- Loading from TOML files
- Parsing command-line arguments
-
-These methods are handled by separate functions in the config package.
-
-## Conclusion
-
-The OpenHands configuration system provides a flexible and type-safe way to manage application settings. By following the naming conventions and utilizing the provided functions, developers can easily customize the behavior of OpenHands components through environment variables and other configuration sources.
@@ -1,39 +0,0 @@
-from openhands.core.config.agent_config import AgentConfig
-from openhands.core.config.app_config import AppConfig
-from openhands.core.config.config_utils import (
-    OH_DEFAULT_AGENT,
-    OH_MAX_ITERATIONS,
-    UndefinedString,
-    get_field_info,
-)
-from openhands.core.config.llm_config import LLMConfig
-from openhands.core.config.sandbox_config import SandboxConfig
-from openhands.core.config.security_config import SecurityConfig
-from openhands.core.config.utils import (
-    finalize_config,
-    get_llm_config_arg,
-    get_parser,
-    load_app_config,
-    load_from_env,
-    load_from_toml,
-    parse_arguments,
-)
-
-__all__ = [
-    'OH_DEFAULT_AGENT',
-    'OH_MAX_ITERATIONS',
-    'UndefinedString',
-    'AgentConfig',
-    'AppConfig',
-    'LLMConfig',
-    'SandboxConfig',
-    'SecurityConfig',
-    'load_app_config',
-    'load_from_env',
-    'load_from_toml',
-    'finalize_config',
-    'get_llm_config_arg',
-    'get_field_info',
-    'get_parser',
-    'parse_arguments',
-]
@@ -1,27 +0,0 @@
-from dataclasses import dataclass, fields
-
-from openhands.core.config.config_utils import get_field_info
-
-
-@dataclass
-class AgentConfig:
-    """Configuration for the agent.
-
-    Attributes:
-        micro_agent_name: The name of the micro agent to use for this agent.
-        memory_enabled: Whether long-term memory (embeddings) is enabled.
-        memory_max_threads: The maximum number of threads indexing at the same time for embeddings.
-        llm_config: The name of the llm config to use. If specified, this will override global llm config.
-    """
-
-    micro_agent_name: str | None = None
-    memory_enabled: bool = False
-    memory_max_threads: int = 2
-    llm_config: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
@@ -1,151 +0,0 @@
-import os
-import uuid
-from dataclasses import dataclass, field, fields, is_dataclass
-from typing import ClassVar
-
-from openhands.core import logger
-from openhands.core.config.agent_config import AgentConfig
-from openhands.core.config.config_utils import (
-    OH_DEFAULT_AGENT,
-    OH_MAX_ITERATIONS,
-    UndefinedString,
-    get_field_info,
-)
-from openhands.core.config.llm_config import LLMConfig
-from openhands.core.config.sandbox_config import SandboxConfig
-from openhands.core.config.security_config import SecurityConfig
-
-
-@dataclass
-class AppConfig:
-    """Configuration for the app.
-
-    Attributes:
-        llms: A dictionary of name -> LLM configuration. Default config is under 'llm' key.
-        agents: A dictionary of name -> Agent configuration. Default config is under 'agent' key.
-        default_agent: The name of the default agent to use.
-        sandbox: The sandbox configuration.
-        runtime: The runtime environment.
-        file_store: The file store to use.
-        file_store_path: The path to the file store.
-        workspace_base: The base path for the workspace. Defaults to ./workspace as an absolute path.
-        workspace_mount_path: The path to mount the workspace. This is set to the workspace base by default.
-        workspace_mount_path_in_sandbox: The path to mount the workspace in the sandbox. Defaults to /workspace.
-        workspace_mount_rewrite: The path to rewrite the workspace mount path to.
-        cache_dir: The path to the cache directory. Defaults to /tmp/cache.
-        run_as_openhands: Whether to run as openhands.
-        max_iterations: The maximum number of iterations.
-        max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
-        e2b_api_key: The E2B API key.
-        disable_color: Whether to disable color. For terminals that don't support color.
-        debug: Whether to enable debugging.
-        enable_cli_session: Whether to enable saving and restoring the session when run from CLI.
-        file_uploads_max_file_size_mb: Maximum file size for uploads in megabytes. 0 means no limit.
-        file_uploads_restrict_file_types: Whether to restrict file types for file uploads. Defaults to False.
-        file_uploads_allowed_extensions: List of allowed file extensions for uploads. ['.*'] means all extensions are allowed.
-    """
-
-    llms: dict[str, LLMConfig] = field(default_factory=dict)
-    agents: dict = field(default_factory=dict)
-    default_agent: str = OH_DEFAULT_AGENT
-    sandbox: SandboxConfig = field(default_factory=SandboxConfig)
-    security: SecurityConfig = field(default_factory=SecurityConfig)
-    runtime: str = 'eventstream'
-    file_store: str = 'memory'
-    file_store_path: str = '/tmp/file_store'
-    # TODO: clean up workspace path after the removal of ServerRuntime
-    workspace_base: str = os.path.join(os.getcwd(), 'workspace')
-    workspace_mount_path: str | None = (
-        UndefinedString.UNDEFINED  # this path should always be set when config is fully loaded
-    )  # when set to None, do not mount the workspace
-    workspace_mount_path_in_sandbox: str = '/workspace'
-    workspace_mount_rewrite: str | None = None
-    cache_dir: str = '/tmp/cache'
-    run_as_openhands: bool = True
-    max_iterations: int = OH_MAX_ITERATIONS
-    max_budget_per_task: float | None = None
-    e2b_api_key: str = ''
-    disable_color: bool = False
-    jwt_secret: str = uuid.uuid4().hex
-    debug: bool = False
-    enable_cli_session: bool = False
-    file_uploads_max_file_size_mb: int = 0
-    file_uploads_restrict_file_types: bool = False
-    file_uploads_allowed_extensions: list[str] = field(default_factory=lambda: ['.*'])
-
-    defaults_dict: ClassVar[dict] = {}
-
-    def get_llm_config(self, name='llm') -> LLMConfig:
-        """Llm is the name for default config (for backward compatibility prior to 0.8)"""
-        if name in self.llms:
-            return self.llms[name]
-        if name is not None and name != 'llm':
-            logger.openhands_logger.warning(
-                f'llm config group {name} not found, using default config'
-            )
-        if 'llm' not in self.llms:
-            self.llms['llm'] = LLMConfig()
-        return self.llms['llm']
-
-    def set_llm_config(self, value: LLMConfig, name='llm'):
-        self.llms[name] = value
-
-    def get_agent_config(self, name='agent') -> AgentConfig:
-        """Agent is the name for default config (for backward compability prior to 0.8)"""
-        if name in self.agents:
-            return self.agents[name]
-        if 'agent' not in self.agents:
-            self.agents['agent'] = AgentConfig()
-        return self.agents['agent']
-
-    def set_agent_config(self, value: AgentConfig, name='agent'):
-        self.agents[name] = value
-
-    def get_agent_to_llm_config_map(self) -> dict[str, LLMConfig]:
-        """Get a map of agent names to llm configs."""
-        return {name: self.get_llm_config_from_agent(name) for name in self.agents}
-
-    def get_llm_config_from_agent(self, name='agent') -> LLMConfig:
-        agent_config: AgentConfig = self.get_agent_config(name)
-        llm_config_name = agent_config.llm_config
-        return self.get_llm_config(llm_config_name)
-
-    def get_agent_configs(self) -> dict[str, AgentConfig]:
-        return self.agents
-
-    def __post_init__(self):
-        """Post-initialization hook, called when the instance is created with only default values."""
-        AppConfig.defaults_dict = self.defaults_to_dict()
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            field_value = getattr(self, f.name)
-
-            # dataclasses compute their defaults themselves
-            if is_dataclass(type(field_value)):
-                result[f.name] = field_value.defaults_to_dict()
-            else:
-                result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in [
-                'e2b_api_key',
-                'github_token',
-                'jwt_secret',
-            ]:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"AppConfig({', '.join(attr_str)}"
-
-    def __repr__(self):
-        return self.__str__()
@@ -1,44 +0,0 @@
-from enum import Enum
-from types import UnionType
-from typing import get_args, get_origin
-
-OH_DEFAULT_AGENT = 'CodeActAgent'
-OH_MAX_ITERATIONS = 100
-
-
-class UndefinedString(str, Enum):
-    UNDEFINED = 'UNDEFINED'
-
-
-def get_field_info(f):
-    """Extract information about a dataclass field: type, optional, and default.
-
-    Args:
-        f: The field to extract information from.
-
-    Returns: A dict with the field's type, whether it's optional, and its default value.
-    """
-    field_type = f.type
-    optional = False
-
-    # for types like str | None, find the non-None type and set optional to True
-    # this is useful for the frontend to know if a field is optional
-    # and to show the correct type in the UI
-    # Note: this only works for UnionTypes with None as one of the types
-    if get_origin(field_type) is UnionType:
-        types = get_args(field_type)
-        non_none_arg = next((t for t in types if t is not type(None)), None)
-        if non_none_arg is not None:
-            field_type = non_none_arg
-            optional = True
-
-    # type name in a pretty format
-    type_name = (
-        field_type.__name__ if hasattr(field_type, '__name__') else str(field_type)
-    )
-
-    # default is always present
-    default = f.default
-
-    # return a schema with the useful info for frontend
-    return {'type': type_name.lower(), 'optional': optional, 'default': default}
@@ -1,109 +0,0 @@
-from dataclasses import dataclass, fields
-
-from openhands.core.config.config_utils import get_field_info
-
-LLM_SENSITIVE_FIELDS = ['api_key', 'aws_access_key_id', 'aws_secret_access_key']
-
-
-@dataclass
-class LLMConfig:
-    """Configuration for the LLM model.
-
-    Attributes:
-        model: The model to use.
-        api_key: The API key to use.
-        base_url: The base URL for the API. This is necessary for local LLMs. It is also used for Azure embeddings.
-        api_version: The version of the API.
-        embedding_model: The embedding model to use.
-        embedding_base_url: The base URL for the embedding API.
-        embedding_deployment_name: The name of the deployment for the embedding API. This is used for Azure OpenAI.
-        aws_access_key_id: The AWS access key ID.
-        aws_secret_access_key: The AWS secret access key.
-        aws_region_name: The AWS region name.
-        num_retries: The number of retries to attempt.
-        retry_multiplier: The multiplier for the exponential backoff.
-        retry_min_wait: The minimum time to wait between retries, in seconds. This is exponential backoff minimum. For models with very low limits, this can be set to 15-20.
-        retry_max_wait: The maximum time to wait between retries, in seconds. This is exponential backoff maximum.
-        timeout: The timeout for the API.
-        max_message_chars: The approximate max number of characters in the content of an event included in the prompt to the LLM. Larger observations are truncated.
-        temperature: The temperature for the API.
-        top_p: The top p for the API.
-        custom_llm_provider: The custom LLM provider to use. This is undocumented in openhands, and normally not used. It is documented on the litellm side.
-        max_input_tokens: The maximum number of input tokens. Note that this is currently unused, and the value at runtime is actually the total tokens in OpenAI (e.g. 128,000 tokens for GPT-4).
-        max_output_tokens: The maximum number of output tokens. This is sent to the LLM.
-        input_cost_per_token: The cost per input token. This will available in logs for the user to check.
-        output_cost_per_token: The cost per output token. This will available in logs for the user to check.
-        ollama_base_url: The base URL for the OLLAMA API.
-        drop_params: Drop any unmapped (unsupported) params without causing an exception.
-        disable_vision: If model is vision capable, this option allows to disable image processing (useful for cost reduction).
-        caching_prompt: Using the prompt caching feature provided by the LLM.
-        log_completions: Whether to log LLM completions to the state.
-    """
-
-    model: str = 'gpt-4o'
-    api_key: str | None = None
-    base_url: str | None = None
-    api_version: str | None = None
-    embedding_model: str = 'local'
-    embedding_base_url: str | None = None
-    embedding_deployment_name: str | None = None
-    aws_access_key_id: str | None = None
-    aws_secret_access_key: str | None = None
-    aws_region_name: str | None = None
-    openrouter_site_url: str = 'https://docs.all-hands.dev/'
-    openrouter_app_name: str = 'OpenHands'
-    num_retries: int = 8
-    retry_multiplier: float = 2
-    retry_min_wait: int = 15
-    retry_max_wait: int = 120
-    timeout: int | None = None
-    max_message_chars: int = 10_000  # maximum number of characters in an observation's content when sent to the llm
-    temperature: float = 0.0
-    top_p: float = 1.0
-    custom_llm_provider: str | None = None
-    max_input_tokens: int | None = None
-    max_output_tokens: int | None = None
-    input_cost_per_token: float | None = None
-    output_cost_per_token: float | None = None
-    ollama_base_url: str | None = None
-    drop_params: bool = True
-    disable_vision: bool | None = None
-    caching_prompt: bool = False
-    log_completions: bool = False
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        result = {}
-        for f in fields(self):
-            result[f.name] = get_field_info(f)
-        return result
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            if attr_name in LLM_SENSITIVE_FIELDS:
-                attr_value = '******' if attr_value else None
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"LLMConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
-
-    def to_safe_dict(self):
-        """Return a dict with the sensitive fields replaced with ******."""
-        ret = self.__dict__.copy()
-        for k, v in ret.items():
-            if k in LLM_SENSITIVE_FIELDS:
-                ret[k] = '******' if v else None
-        return ret
-
-    def set_missing_attributes(self):
-        """Set any missing attributes to their default values."""
-        for field_name, field_obj in self.__dataclass_fields__.items():
-            if not hasattr(self, field_name):
-                setattr(self, field_name, field_obj.default)
@@ -1,66 +0,0 @@
-import os
-from dataclasses import dataclass, field, fields
-
-from openhands.core.config.config_utils import get_field_info
-
-
-@dataclass
-class SandboxConfig:
-    """Configuration for the sandbox.
-
-    Attributes:
-        api_hostname: The hostname for the EventStream Runtime API.
-        base_container_image: The base container image from which to build the runtime image.
-        runtime_container_image: The runtime container image to use.
-        user_id: The user ID for the sandbox.
-        timeout: The timeout for the sandbox.
-        enable_auto_lint: Whether to enable auto-lint.
-        use_host_network: Whether to use the host network.
-        initialize_plugins: Whether to initialize plugins.
-        runtime_extra_deps: The extra dependencies to install in the runtime image (typically used for evaluation).
-            This will be rendered into the end of the Dockerfile that builds the runtime image.
-            It can contain any valid shell commands (e.g., pip install numpy).
-            The path to the interpreter is available as $OH_INTERPRETER_PATH,
-            which can be used to install dependencies for the OH-specific Python interpreter.
-        runtime_startup_env_vars: The environment variables to set at the launch of the runtime.
-            This is a dictionary of key-value pairs.
-            This is useful for setting environment variables that are needed by the runtime.
-            For example, for specifying the base url of website for browsergym evaluation.
-        browsergym_eval_env: The BrowserGym environment to use for evaluation.
-            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
-    """
-
-    api_hostname: str = 'localhost'
-    api_key: str | None = None
-    base_container_image: str = 'nikolaik/python-nodejs:python3.11-nodejs22'  # default to nikolaik/python-nodejs:python3.11-nodejs22 for eventstream runtime
-    runtime_container_image: str | None = None
-    user_id: int = os.getuid() if hasattr(os, 'getuid') else 1000
-    timeout: int = 120
-    enable_auto_lint: bool = (
-        False  # once enabled, OpenHands would lint files after editing
-    )
-    use_host_network: bool = False
-    initialize_plugins: bool = True
-    runtime_extra_deps: str | None = None
-    runtime_startup_env_vars: dict[str, str] = field(default_factory=dict)
-    browsergym_eval_env: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SandboxConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
@@ -1,36 +0,0 @@
-from dataclasses import dataclass, fields
-
-from openhands.core.config.config_utils import get_field_info
-
-
-@dataclass
-class SecurityConfig:
-    """Configuration for security related functionalities.
-
-    Attributes:
-        confirmation_mode: Whether to enable confirmation mode.
-        security_analyzer: The security analyzer to use.
-    """
-
-    confirmation_mode: bool = False
-    security_analyzer: str | None = None
-
-    def defaults_to_dict(self) -> dict:
-        """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
-        dict = {}
-        for f in fields(self):
-            dict[f.name] = get_field_info(f)
-        return dict
-
-    def __str__(self):
-        attr_str = []
-        for f in fields(self):
-            attr_name = f.name
-            attr_value = getattr(self, f.name)
-
-            attr_str.append(f'{attr_name}={repr(attr_value)}')
-
-        return f"SecurityConfig({', '.join(attr_str)})"
-
-    def __repr__(self):
-        return self.__str__()
@@ -1,391 +0,0 @@
-import argparse
-import os
-import pathlib
-import platform
-from dataclasses import is_dataclass
-from types import UnionType
-from typing import Any, MutableMapping, get_args, get_origin
-
-import toml
-from dotenv import load_dotenv
-
-from openhands.core import logger
-from openhands.core.config.agent_config import AgentConfig
-from openhands.core.config.app_config import AppConfig
-from openhands.core.config.config_utils import (
-    OH_DEFAULT_AGENT,
-    OH_MAX_ITERATIONS,
-    UndefinedString,
-)
-from openhands.core.config.llm_config import LLMConfig
-from openhands.core.config.sandbox_config import SandboxConfig
-
-load_dotenv()
-
-
-def load_from_env(cfg: AppConfig, env_or_toml_dict: dict | MutableMapping[str, str]):
-    """Reads the env-style vars and sets config attributes based on env vars or a config.toml dict.
-    Compatibility with vars like LLM_BASE_URL, AGENT_MEMORY_ENABLED, SANDBOX_TIMEOUT and others.
-
-    Args:
-        cfg: The AppConfig object to set attributes on.
-        env_or_toml_dict: The environment variables or a config.toml dict.
-    """
-
-    def get_optional_type(union_type: UnionType) -> Any:
-        """Returns the non-None type from a Union."""
-        types = get_args(union_type)
-        return next((t for t in types if t is not type(None)), None)
-
-    # helper function to set attributes based on env vars
-    def set_attr_from_env(sub_config: Any, prefix=''):
-        """Set attributes of a config dataclass based on environment variables."""
-        for field_name, field_type in sub_config.__annotations__.items():
-            # compute the expected env var name from the prefix and field name
-            # e.g. LLM_BASE_URL
-            env_var_name = (prefix + field_name).upper()
-
-            if is_dataclass(field_type):
-                # nested dataclass
-                nested_sub_config = getattr(sub_config, field_name)
-                set_attr_from_env(nested_sub_config, prefix=field_name + '_')
-            elif env_var_name in env_or_toml_dict:
-                # convert the env var to the correct type and set it
-                value = env_or_toml_dict[env_var_name]
-
-                # skip empty config values (fall back to default)
-                if not value:
-                    continue
-
-                try:
-                    # if it's an optional type, get the non-None type
-                    if get_origin(field_type) is UnionType:
-                        field_type = get_optional_type(field_type)
-
-                    # Attempt to cast the env var to type hinted in the dataclass
-                    if field_type is bool:
-                        cast_value = str(value).lower() in ['true', '1']
-                    else:
-                        cast_value = field_type(value)
-                    setattr(sub_config, field_name, cast_value)
-                except (ValueError, TypeError):
-                    logger.openhands_logger.error(
-                        f'Error setting env var {env_var_name}={value}: check that the value is of the right type'
-                    )
-
-    # Start processing from the root of the config object
-    set_attr_from_env(cfg)
-
-    # load default LLM config from env
-    default_llm_config = cfg.get_llm_config()
-    set_attr_from_env(default_llm_config, 'LLM_')
-    # load default agent config from env
-    default_agent_config = cfg.get_agent_config()
-    set_attr_from_env(default_agent_config, 'AGENT_')
-
-
-def load_from_toml(cfg: AppConfig, toml_file: str = 'config.toml'):
-    """Load the config from the toml file. Supports both styles of config vars.
-
-    Args:
-        cfg: The AppConfig object to update attributes of.
-        toml_file: The path to the toml file. Defaults to 'config.toml'.
-    """
-    # try to read the config.toml file into the config object
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError:
-        return
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-        return
-
-    # if there was an exception or core is not in the toml, try to use the old-style toml
-    if 'core' not in toml_config:
-        # re-use the env loader to set the config from env-style vars
-        load_from_env(cfg, toml_config)
-        return
-
-    core_config = toml_config['core']
-
-    # load llm configs and agent configs
-    for key, value in toml_config.items():
-        if isinstance(value, dict):
-            try:
-                if key is not None and key.lower() == 'agent':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default agent config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    agent_config = AgentConfig(**non_dict_fields)
-                    cfg.set_agent_config(agent_config, 'agent')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as agent config'
-                            )
-                            agent_config = AgentConfig(**nested_value)
-                            cfg.set_agent_config(agent_config, nested_key)
-                elif key is not None and key.lower() == 'llm':
-                    logger.openhands_logger.debug(
-                        'Attempt to load default LLM config from config toml'
-                    )
-                    non_dict_fields = {
-                        k: v for k, v in value.items() if not isinstance(v, dict)
-                    }
-                    llm_config = LLMConfig(**non_dict_fields)
-                    cfg.set_llm_config(llm_config, 'llm')
-                    for nested_key, nested_value in value.items():
-                        if isinstance(nested_value, dict):
-                            logger.openhands_logger.debug(
-                                f'Attempt to load group {nested_key} from config toml as llm config'
-                            )
-                            llm_config = LLMConfig(**nested_value)
-                            cfg.set_llm_config(llm_config, nested_key)
-                elif not key.startswith('sandbox') and key.lower() != 'core':
-                    logger.openhands_logger.warning(
-                        f'Unknown key in {toml_file}: "{key}"'
-                    )
-            except (TypeError, KeyError) as e:
-                logger.openhands_logger.warning(
-                    f'Cannot parse config from toml, toml values have not been applied.\n Error: {e}',
-                    exc_info=False,
-                )
-        else:
-            logger.openhands_logger.warning(f'Unknown key in {toml_file}: "{key}')
-
-    try:
-        # set sandbox config from the toml file
-        sandbox_config = cfg.sandbox
-
-        # migrate old sandbox configs from [core] section to sandbox config
-        keys_to_migrate = [key for key in core_config if key.startswith('sandbox_')]
-        for key in keys_to_migrate:
-            new_key = key.replace('sandbox_', '')
-            if new_key in sandbox_config.__annotations__:
-                # read the key in sandbox and remove it from core
-                setattr(sandbox_config, new_key, core_config.pop(key))
-            else:
-                logger.openhands_logger.warning(f'Unknown sandbox config: {key}')
-
-        # the new style values override the old style values
-        if 'sandbox' in toml_config:
-            sandbox_config = SandboxConfig(**toml_config['sandbox'])
-
-        # update the config object with the new values
-        cfg.sandbox = sandbox_config
-        for key, value in core_config.items():
-            if hasattr(cfg, key):
-                setattr(cfg, key, value)
-            else:
-                logger.openhands_logger.warning(f'Unknown core config key: {key}')
-    except (TypeError, KeyError) as e:
-        logger.openhands_logger.warning(
-            f'Cannot parse config from toml, toml values have not been applied.\nError: {e}',
-            exc_info=False,
-        )
-
-
-def finalize_config(cfg: AppConfig):
-    """More tweaks to the config after it's been loaded."""
-    cfg.workspace_base = os.path.abspath(cfg.workspace_base)
-    # Set workspace_mount_path if not set by the user
-    if cfg.workspace_mount_path is UndefinedString.UNDEFINED:
-        cfg.workspace_mount_path = cfg.workspace_base
-
-    if cfg.workspace_mount_rewrite:  # and not config.workspace_mount_path:
-        # TODO why do we need to check if workspace_mount_path is None?
-        base = cfg.workspace_base or os.getcwd()
-        parts = cfg.workspace_mount_rewrite.split(':')
-        cfg.workspace_mount_path = base.replace(parts[0], parts[1])
-
-    for llm in cfg.llms.values():
-        if llm.embedding_base_url is None:
-            llm.embedding_base_url = llm.base_url
-
-    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
-        logger.openhands_logger.warning(
-            'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
-            'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'
-        )
-
-    # make sure cache dir exists
-    if cfg.cache_dir:
-        pathlib.Path(cfg.cache_dir).mkdir(parents=True, exist_ok=True)
-
-
-# Utility function for command line --group argument
-def get_llm_config_arg(
-    llm_config_arg: str, toml_file: str = 'config.toml'
-) -> LLMConfig | None:
-    """Get a group of llm settings from the config file.
-
-    A group in config.toml can look like this:
-
-    ```
-    [llm.gpt-3.5-for-eval]
-    model = 'gpt-3.5-turbo'
-    api_key = '...'
-    temperature = 0.5
-    num_retries = 8
-    ...
-    ```
-
-    The user-defined group name, like "gpt-3.5-for-eval", is the argument to this function. The function will load the LLMConfig object
-    with the settings of this group, from the config file, and set it as the LLMConfig object for the app.
-
-    Note that the group must be under "llm" group, or in other words, the group name must start with "llm.".
-
-    Args:
-        llm_config_arg: The group of llm settings to get from the config.toml file.
-
-    Returns:
-        LLMConfig: The LLMConfig object with the settings from the config file.
-    """
-    # keep only the name, just in case
-    llm_config_arg = llm_config_arg.strip('[]')
-
-    # truncate the prefix, just in case
-    if llm_config_arg.startswith('llm.'):
-        llm_config_arg = llm_config_arg[4:]
-
-    logger.openhands_logger.info(f'Loading llm config from {llm_config_arg}')
-
-    # load the toml file
-    try:
-        with open(toml_file, 'r', encoding='utf-8') as toml_contents:
-            toml_config = toml.load(toml_contents)
-    except FileNotFoundError as e:
-        logger.openhands_logger.error(f'Config file not found: {e}')
-        return None
-    except toml.TomlDecodeError as e:
-        logger.openhands_logger.error(
-            f'Cannot parse llm group from {llm_config_arg}. Exception: {e}'
-        )
-        return None
-
-    # update the llm config with the specified section
-    if 'llm' in toml_config and llm_config_arg in toml_config['llm']:
-        return LLMConfig(**toml_config['llm'][llm_config_arg])
-    logger.openhands_logger.debug(f'Loading from toml failed for {llm_config_arg}')
-    return None
-
-
-# Command line arguments
-def get_parser() -> argparse.ArgumentParser:
-    """Get the parser for the command line arguments."""
-    parser = argparse.ArgumentParser(description='Run an agent with a specific task')
-    parser.add_argument(
-        '-d',
-        '--directory',
-        type=str,
-        help='The working directory for the agent',
-    )
-    parser.add_argument(
-        '-t',
-        '--task',
-        type=str,
-        default='',
-        help='The task for the agent to perform',
-    )
-    parser.add_argument(
-        '-f',
-        '--file',
-        type=str,
-        help='Path to a file containing the task. Overrides -t if both are provided.',
-    )
-    parser.add_argument(
-        '-c',
-        '--agent-cls',
-        default=OH_DEFAULT_AGENT,
-        type=str,
-        help='Name of the default agent to use',
-    )
-    parser.add_argument(
-        '-i',
-        '--max-iterations',
-        default=OH_MAX_ITERATIONS,
-        type=int,
-        help='The maximum number of iterations to run the agent',
-    )
-    parser.add_argument(
-        '-b',
-        '--max-budget-per-task',
-        type=float,
-        help='The maximum budget allowed per task, beyond which the agent will stop.',
-    )
-    # --eval configs are for evaluations only
-    parser.add_argument(
-        '--eval-output-dir',
-        default='evaluation/evaluation_outputs/outputs',
-        type=str,
-        help='The directory to save evaluation output',
-    )
-    parser.add_argument(
-        '--eval-n-limit',
-        default=None,
-        type=int,
-        help='The number of instances to evaluate',
-    )
-    parser.add_argument(
-        '--eval-num-workers',
-        default=4,
-        type=int,
-        help='The number of workers to use for evaluation',
-    )
-    parser.add_argument(
-        '--eval-note',
-        default=None,
-        type=str,
-        help='The note to add to the evaluation directory',
-    )
-    parser.add_argument(
-        '-l',
-        '--llm-config',
-        default=None,
-        type=str,
-        help='Replace default LLM ([llm] section in config.toml) config with the specified LLM config, e.g. "llama3" for [llm.llama3] section in config.toml',
-    )
-    parser.add_argument(
-        '-n',
-        '--name',
-        default='default',
-        type=str,
-        help='Name for the session',
-    )
-    parser.add_argument(
-        '--eval-ids',
-        default=None,
-        type=str,
-        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
-    )
-    return parser
-
-
-def parse_arguments() -> argparse.Namespace:
-    """Parse the command line arguments."""
-    parser = get_parser()
-    parsed_args, _ = parser.parse_known_args()
-    return parsed_args
-
-
-def load_app_config(set_logging_levels: bool = True) -> AppConfig:
-    """Load the configuration from the config.toml file and environment variables.
-
-    Args:
-        set_logger_levels: Whether to set the global variables for logging levels.
-    """
-    config = AppConfig()
-    load_from_toml(config)
-    load_from_env(config, os.environ)
-    finalize_config(config)
-    if set_logging_levels:
-        logger.DEBUG = config.debug
-        logger.DISABLE_COLOR_PRINTING = config.disable_color
-    return config
@@ -77,10 +77,3 @@ class UserCancelledError(Exception):
 class MicroAgentValidationError(Exception):
    def __init__(self, message='Micro agent validation failed'):
        super().__init__(message)
-
-
-class OperationCancelled(Exception):
-    """Exception raised when an operation is cancelled (e.g. by a keyboard interrupt)."""
-
-    def __init__(self, message='Operation was cancelled'):
-        super().__init__(message)
@@ -55,6 +55,7 @@ def create_runtime(

    config: The app config.
    sid: The session id.
+    runtime_tools_config: (will be deprecated) The runtime tools config.
    """
    # if sid is provided on the command line, use it as the name of the event stream
    # otherwise generate it on the basis of the configured jwt_secret
@@ -143,9 +144,6 @@ async def run_controller(
        headless_mode=headless_mode,
    )

-    if controller is not None:
-        controller.agent_task = asyncio.create_task(controller.start_step_loop())
-
    assert isinstance(task_str, str), f'task_str must be a string, got {type(task_str)}'
    # Logging
    logger.info(
@@ -1,7 +1,10 @@
 from enum import Enum
-from typing import Literal
+from typing import Union

 from pydantic import BaseModel, Field, model_serializer
+from typing_extensions import Literal
+
+from openhands.core.logger import openhands_logger as logger


 class ContentType(Enum):
@@ -57,24 +60,60 @@ class Message(BaseModel):

    @model_serializer
    def serialize_model(self) -> dict:
-        content: list[dict] | str
-        if self.role == 'system':
-            # For system role, concatenate all text content into a single string
-            content = '\n'.join(
-                item.text for item in self.content if isinstance(item, TextContent)
-            )
-        elif self.role == 'assistant' and not self.contains_image:
-            # For assistant role without vision, concatenate all text content into a single string
-            content = '\n'.join(
-                item.text for item in self.content if isinstance(item, TextContent)
-            )
-        else:
-            # For user role or assistant role with vision enabled, serialize each content item
-            content = []
-            for item in self.content:
-                if isinstance(item, TextContent):
-                    content.append(item.model_dump())
-                elif isinstance(item, ImageContent):
-                    content.extend(item.model_dump())
+        content: list[dict[str, str | dict[str, str]]] = []
+
+        for item in self.content:
+            if isinstance(item, TextContent):
+                content.append(item.model_dump())
+            elif isinstance(item, ImageContent):
+                content.extend(item.model_dump())

        return {'content': content, 'role': self.role}
+
+
+def format_messages(
+    messages: Union[Message, list[Message]],
+    with_images: bool,
+    with_prompt_caching: bool,
+) -> list[dict]:
+    if not isinstance(messages, list):
+        messages = [messages]
+
+    if with_images or with_prompt_caching:
+        return [message.model_dump() for message in messages]
+
+    converted_messages = []
+    for message in messages:
+        content_parts = []
+        role = 'user'
+
+        if isinstance(message, str) and message:
+            content_parts.append(message)
+        elif isinstance(message, dict):
+            role = message.get('role', 'user')
+            if 'content' in message and message['content']:
+                content_parts.append(message['content'])
+        elif isinstance(message, Message):
+            role = message.role
+            for content in message.content:
+                if isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, TextContent) and item.text:
+                            content_parts.append(item.text)
+                elif isinstance(content, TextContent) and content.text:
+                    content_parts.append(content.text)
+        else:
+            logger.error(
+                f'>>> `message` is not a string, dict, or Message: {type(message)}'
+            )
+
+        if content_parts:
+            content_str = '\n'.join(content_parts)
+            converted_messages.append(
+                {
+                    'role': role,
+                    'content': content_str,
+                }
+            )
+
+    return converted_messages
@@ -24,10 +24,6 @@ class ActionTypeSchema(BaseModel):
    """Writes the content to a file.
    """

-    EDIT: str = Field(default='edit')
-    """Edits the content of a file.
-    """
-
    RUN: str = Field(default='run')
    """Runs a command.
    """
@@ -10,10 +10,6 @@ class ObservationTypeSchema(BaseModel):

    WRITE: str = Field(default='write')

-    EDIT: str = Field(default='edit')
-    """The edited file
-    """
-
    BROWSE: str = Field(default='browse')
    """The HTML content of a URL
    """
@@ -9,11 +9,7 @@ from openhands.events.action.agent import (
 from openhands.events.action.browse import BrowseInteractiveAction, BrowseURLAction
 from openhands.events.action.commands import CmdRunAction, IPythonRunCellAction
 from openhands.events.action.empty import NullAction
-from openhands.events.action.files import (
-    FileEditAction,
-    FileReadAction,
-    FileWriteAction,
-)
+from openhands.events.action.files import FileReadAction, FileWriteAction
 from openhands.events.action.message import MessageAction
 from openhands.events.action.tasks import AddTaskAction, ModifyTaskAction

@@ -25,7 +21,6 @@ __all__ = [
    'BrowseInteractiveAction',
    'FileReadAction',
    'FileWriteAction',
-    'FileEditAction',
    'AgentFinishAction',
    'AgentRejectAction',
    'AgentDelegateAction',
@@ -39,23 +39,3 @@ class FileWriteAction(Action):
    @property
    def message(self) -> str:
        return f'Writing file: {self.path}'
-
-
-@dataclass
-class FileEditAction(Action):
-    diff_block: str
-    thought: str = ''
-    action: str = ActionType.EDIT
-    runnable: ClassVar[bool] = True
-    security_risk: ActionSecurityRisk | None = None
-
-    def __str__(self) -> str:
-        ret = '**EditFileAction**\n'
-        if self.thought:
-            ret += f'THOUGHT: {self.thought}\n'
-        ret += f'DIFF BLOCK:\n{self.diff_block}\n'
-        return ret
-
-    @property
-    def message(self) -> str:
-        return f'Edit Diff block: {self.diff_block}'
@@ -1,5 +1,5 @@
+import datetime
 from dataclasses import dataclass
-from datetime import datetime
 from enum import Enum


@@ -23,14 +23,10 @@ class Event:
        return -1

    @property
-    def timestamp(self):
-        if hasattr(self, '_timestamp') and isinstance(self._timestamp, str):
-            return self._timestamp
-
-    @timestamp.setter
-    def timestamp(self, value: datetime) -> None:
-        if isinstance(value, datetime):
-            self._timestamp = value.isoformat()
+    def timestamp(self) -> datetime.datetime | None:
+        if hasattr(self, '_timestamp'):
+            return self._timestamp  # type: ignore[attr-defined]
+        return None

    @property
    def source(self) -> EventSource | None:
@@ -7,11 +7,7 @@ from openhands.events.observation.commands import (
 from openhands.events.observation.delegate import AgentDelegateObservation
 from openhands.events.observation.empty import NullObservation
 from openhands.events.observation.error import ErrorObservation
-from openhands.events.observation.files import (
-    FileEditObservation,
-    FileReadObservation,
-    FileWriteObservation,
-)
+from openhands.events.observation.files import FileReadObservation, FileWriteObservation
 from openhands.events.observation.observation import Observation
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
@@ -24,7 +20,6 @@ __all__ = [
    'BrowserOutputObservation',
    'FileReadObservation',
    'FileWriteObservation',
-    'FileEditObservation',
    'ErrorObservation',
    'AgentStateChangedObservation',
    'AgentDelegateObservation',
@@ -26,30 +26,3 @@ class FileWriteObservation(Observation):
    @property
    def message(self) -> str:
        return f'I wrote to the file {self.path}.'
-
-
-@dataclass
-class FileEditObservation(Observation):
-    """This data class represents a file edit operation"""
-
-    path: str
-    search_block: str
-    replace_block: str
-    observation: str = ObservationType.EDIT
-
-    @property
-    def message(self) -> str:
-        if self.search_block:
-            return (
-                f'I updated the file {self.path} by \n'
-                f'replacing:\n {self.search_block}\n'
-                f'with:\n {self.replace_block}\n'
-            )
-        else:
-            return (
-                f'I updated the file {self.path} by \n'
-                f'appending:\n {self.replace_block}\n'
-            )
-
-    def __str__(self) -> str:
-        return f'**FileEditObservation**\n' f'DIFF BLOCK: {self.content}\n'
@@ -12,11 +12,7 @@ from openhands.events.action.commands import (
    IPythonRunCellAction,
 )
 from openhands.events.action.empty import NullAction
-from openhands.events.action.files import (
-    FileEditAction,
-    FileReadAction,
-    FileWriteAction,
-)
+from openhands.events.action.files import FileReadAction, FileWriteAction
 from openhands.events.action.message import MessageAction
 from openhands.events.action.tasks import AddTaskAction, ModifyTaskAction

@@ -28,7 +24,6 @@ actions = (
    BrowseInteractiveAction,
    FileReadAction,
    FileWriteAction,
-    FileEditAction,
    AgentFinishAction,
    AgentRejectAction,
    AgentDelegateAction,
@@ -57,20 +52,10 @@ def action_from_dict(action: dict) -> Action:
            f"'{action['action']=}' is not defined. Available actions: {ACTION_TYPE_TO_CLASS.keys()}"
        )
    args = action.get('args', {})
-    # Remove timestamp from args if present
-    timestamp = args.pop('timestamp', None)
-
    try:
        decoded_action = action_class(**args)
        if 'timeout' in action:
            decoded_action.timeout = action['timeout']
-
-        # Set timestamp if it was provided
-        if timestamp:
-            decoded_action._timestamp = timestamp
-
-    except TypeError as e:
-        raise LLMMalformedActionError(
-            f'action={action} has the wrong arguments: {str(e)}'
-        )
+    except TypeError:
+        raise LLMMalformedActionError(f'action={action} has the wrong arguments')
    return decoded_action
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xingyao Wang	d8314fb854	patch fo reval	2024-10-15 12:45:02 +00:00
Xingyao Wang	b468dddb52	update runtime to be compatible	2024-10-10 18:20:14 +00:00