Compare commits

..

6 Commits

Author SHA1 Message Date
Engel Nyst 12e774917d Update containers/dev/Dockerfile 2024-12-14 12:54:53 +01:00
Engel Nyst b55f893d73 Update containers/dev/Dockerfile 2024-12-14 12:54:29 +01:00
openhands 89add3a3e8 Fix pr #5521: Fix installation size 2024-12-11 13:54:22 +00:00
openhands 871c8f7a7b Fix issue #5359: [Resolver] The LLM uses a non-optimal way to install openhands 2024-12-04 03:58:50 +00:00
Xingyao Wang 9908e1b285 [Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394) 2024-12-04 03:33:43 +00:00
Robert Brennan 793e142c4a Show all actions in the message window (#5190)
Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: amanape <83104063+amanape@users.noreply.github.com>
2024-12-03 18:29:49 -05:00
39 changed files with 557 additions and 209 deletions
+1 -1
View File
@@ -133,7 +133,7 @@ install-python-dependencies:
export HNSWLIB_NO_NATIVE=1; \
poetry run pip install chroma-hnswlib; \
fi
@poetry install --without llama-index
@poetry install --without llama-index,evaluation
@if [ -f "/etc/manjaro-release" ]; then \
echo "$(BLUE)Detected Manjaro Linux. Installing Playwright dependencies...$(RESET)"; \
poetry run pip install playwright; \
+39 -39
View File
@@ -5,24 +5,24 @@ FROM ubuntu:22.04 AS dind
# https://docs.docker.com/engine/install/ubuntu/
RUN apt-get update && apt-get install -y \
ca-certificates \
curl \
&& install -m 0755 -d /etc/apt/keyrings \
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
&& chmod a+r /etc/apt/keyrings/docker.asc \
&& echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
ca-certificates \
curl \
&& install -m 0755 -d /etc/apt/keyrings \
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
&& chmod a+r /etc/apt/keyrings/docker.asc \
&& echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
RUN apt-get update && apt-get install -y \
docker-ce \
docker-ce-cli \
containerd.io \
docker-buildx-plugin \
docker-compose-plugin \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
docker-ce \
docker-ce-cli \
containerd.io \
docker-buildx-plugin \
docker-compose-plugin \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
###
FROM dind AS openhands
@@ -31,25 +31,25 @@ ENV DEBIAN_FRONTEND=noninteractive
#
RUN apt-get update && apt-get install -y \
bash \
bash \
build-essential \
curl \
git \
git-lfs \
software-properties-common \
make \
git \
git-lfs \
software-properties-common \
make \
netcat \
sudo \
wget \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
wget \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
# https://github.com/cli/cli/blob/trunk/docs/install_linux.md
RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \
&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt-get update && apt-get -y install \
&& chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt-get update && apt-get -y install \
gh \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
@@ -92,17 +92,17 @@ EOF
FROM openhands AS dev
RUN apt-get update && apt-get install -y \
dnsutils \
file \
iproute2 \
jq \
lsof \
ripgrep \
silversearcher-ag \
vim \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
dnsutils \
file \
iproute2 \
jq \
lsof \
ripgrep \
silversearcher-ag \
vim \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean \
&& apt-get autoremove -y
WORKDIR /app
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$DATASET" ]; then
echo "Dataset not specified, use default 'things'"
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
exit 1
fi
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
--max-iterations 20 \
--OPENAI_API_KEY $OPENAI_API_KEY \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
--llm-config $MODEL_CONFIG \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION
# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then
@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${DATASET}"
--eval-note ${OPENHANDS_VERSION}_${DATASET}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 5 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
--agent-cls $AGENT \
@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HF SPLIT: $SPLIT"
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
--max-iterations 10 \
--max-chars 10000000 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$LEVELS" ]; then
LEVELS="2023_level1"
echo "Levels not specified, use default $LEVELS"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "LEVELS: $LEVELS"
@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
--level $LEVELS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
if [ -z "$HUBS" ]; then
HUBS="hf,torch,tf"
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
fi
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "HUBS: $HUBS"
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
--hubs $HUBS \
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
DATA_SPLIT="gpqa_diamond"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--data-split $DATA_SPLIT \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
DATASET="ProofWriter"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
--dataset $DATASET \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
--agent-cls $AGENT \
@@ -18,10 +18,10 @@ checkout_eval_branch
# Only 'CodeActAgent' is supported for MINT now
AGENT="CodeActAgent"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
export PYTHONPATH=$(pwd)
@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION"
--eval-note $OPENHANDS_VERSION"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
USE_KNOWLEDGE=false
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
--use_knowledge $USE_KNOWLEDGE \
--max-iterations 30 \
--eval-num-workers $NUM_WORKERS \
--eval-note $AGENT_VERSION" \
--eval-note $OPENHANDS_VERSION" \
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
echo "WOLFRAM_APPID not specified"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "HARDNESS: $HARDNESS"
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
--wolfram_alpha_appid $WOLFRAM_APPID\
--data-split validation \
--eval-num-workers $NUM_WORKERS \
--eval-note ${AGENT_VERSION}_${LEVELS}"
--eval-note ${OPENHANDS_VERSION}_${LEVELS}"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
AGENT="BrowsingAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="$AGENT_VERSION"
EVAL_NOTE="$OPENHANDS_VERSION"
COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
--agent-cls $AGENT \
@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
AGENT="CodeActAgent"
fi
get_agent_version
get_openhands_version
echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE=$AGENT_VERSION
EVAL_NOTE=$OPENHANDS_VERSION
# Default to NOT use unit tests.
if [ -z "$USE_UNIT_TESTS" ]; then
+2 -2
View File
@@ -39,8 +39,8 @@ checkout_original_branch() {
git checkout $current_branch
}
get_agent_version() {
get_openhands_version() {
# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
# We need to track the version of Agent in the evaluation to make sure results are comparable
AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
}
@@ -9,7 +9,7 @@ import { WsClientProviderStatus } from "#/context/ws-client-provider";
import { ChatInterface } from "#/components/features/chat/chat-interface";
// eslint-disable-next-line @typescript-eslint/no-unused-vars
const renderChatInterface = (messages: (Message | ErrorMessage)[]) =>
const renderChatInterface = (messages: (Message)[]) =>
renderWithProviders(<ChatInterface />);
describe("Empty state", () => {
@@ -278,7 +278,7 @@ describe.skip("ChatInterface", () => {
});
it("should render inline errors", () => {
const messages: (Message | ErrorMessage)[] = [
const messages: (Message)[] = [
{
sender: "assistant",
content: "Hello",
@@ -287,9 +287,10 @@ describe.skip("ChatInterface", () => {
pending: true,
},
{
error: true,
id: "",
message: "Something went wrong",
type: "error",
content: "Something went wrong",
sender: "assistant",
timestamp: new Date().toISOString(),
},
];
renderChatInterface(messages);
@@ -47,7 +47,7 @@ export function ChatMessage({
"rounded-xl relative",
"flex flex-col gap-2",
type === "user" && " max-w-[305px] p-4 bg-neutral-700 self-end",
type === "assistant" && "pb-4 max-w-full bg-tranparent",
type === "assistant" && "mt-6 max-w-full bg-tranparent",
)}
>
<CopyToClipboardButton
@@ -1,42 +0,0 @@
import { useState, useEffect } from "react";
import { useTranslation } from "react-i18next";
interface ErrorMessageProps {
id?: string;
message: string;
}
export function ErrorMessage({ id, message }: ErrorMessageProps) {
const { t, i18n } = useTranslation();
const [showDetails, setShowDetails] = useState(true);
const [headline, setHeadline] = useState("");
const [details, setDetails] = useState(message);
useEffect(() => {
if (id && i18n.exists(id)) {
setHeadline(t(id));
setDetails(message);
setShowDetails(false);
}
}, [id, message, i18n.language]);
return (
<div className="flex gap-2 items-center justify-start border-l-2 border-danger pl-2 my-2 py-2">
<div className="text-sm leading-4 flex flex-col gap-2">
{headline && <p className="text-danger font-bold">{headline}</p>}
{headline && (
<button
type="button"
onClick={() => setShowDetails(!showDetails)}
className="cursor-pointer text-left"
>
{showDetails
? t("ERROR_MESSAGE$HIDE_DETAILS")
: t("ERROR_MESSAGE$SHOW_DETAILS")}
</button>
)}
{showDetails && <p className="text-neutral-300">{details}</p>}
</div>
</div>
);
}
@@ -0,0 +1,80 @@
import { useState, useEffect } from "react";
import { useTranslation } from "react-i18next";
import Markdown from "react-markdown";
import remarkGfm from "remark-gfm";
import { code } from "../markdown/code";
import { ol, ul } from "../markdown/list";
import ArrowUp from "#/icons/angle-up-solid.svg?react";
import ArrowDown from "#/icons/angle-down-solid.svg?react";
interface ExpandableMessageProps {
id?: string;
message: string;
type: string;
}
export function ExpandableMessage({
id,
message,
type,
}: ExpandableMessageProps) {
const { t, i18n } = useTranslation();
const [showDetails, setShowDetails] = useState(true);
const [headline, setHeadline] = useState("");
const [details, setDetails] = useState(message);
useEffect(() => {
if (id && i18n.exists(id)) {
setHeadline(t(id));
setDetails(message);
setShowDetails(false);
}
}, [id, message, i18n.language]);
const border = type === "error" ? "border-danger" : "border-neutral-300";
const textColor = type === "error" ? "text-danger" : "text-neutral-300";
let arrowClasses = "h-4 w-4 ml-2 inline";
if (type === "error") {
arrowClasses += " fill-danger";
} else {
arrowClasses += " fill-neutral-300";
}
return (
<div
className={`flex gap-2 items-center justify-start border-l-2 pl-2 my-2 py-2 ${border}`}
>
<div className="text-sm leading-4 flex flex-col gap-2 max-w-full">
{headline && (
<p className={`${textColor} font-bold`}>
{headline}
<button
type="button"
onClick={() => setShowDetails(!showDetails)}
className="cursor-pointer text-left"
>
{showDetails ? (
<ArrowUp className={arrowClasses} />
) : (
<ArrowDown className={arrowClasses} />
)}
</button>
</p>
)}
{showDetails && (
<Markdown
className="text-sm overflow-auto"
components={{
code,
ul,
ol,
}}
remarkPlugins={[remarkGfm]}
>
{details}
</Markdown>
)}
</div>
</div>
);
}
@@ -1,14 +1,10 @@
import { ChatMessage } from "#/components/features/chat/chat-message";
import { ConfirmationButtons } from "#/components/shared/buttons/confirmation-buttons";
import { ImageCarousel } from "../images/image-carousel";
import { ErrorMessage } from "./error-message";
const isErrorMessage = (
message: Message | ErrorMessage,
): message is ErrorMessage => "error" in message;
import { ExpandableMessage } from "./expandable-message";
interface MessagesProps {
messages: (Message | ErrorMessage)[];
messages: Message[];
isAwaitingUserConfirmation: boolean;
}
@@ -16,18 +12,28 @@ export function Messages({
messages,
isAwaitingUserConfirmation,
}: MessagesProps) {
return messages.map((message, index) =>
isErrorMessage(message) ? (
<ErrorMessage key={index} id={message.id} message={message.message} />
) : (
return messages.map((message, index) => {
if (message.type === "error" || message.type === "action") {
console.log("expando", message);
return (
<ExpandableMessage
key={index}
type={message.type}
id={message.translationID}
message={message.content}
/>
);
}
return (
<ChatMessage key={index} type={message.sender} message={message.content}>
{message.imageUrls.length > 0 && (
{message.imageUrls && message.imageUrls.length > 0 && (
<ImageCarousel size="small" images={message.imageUrls} />
)}
{messages.length - 1 === index &&
message.sender === "assistant" &&
isAwaitingUserConfirmation && <ConfirmationButtons />}
</ChatMessage>
),
);
);
});
}
+30 -8
View File
@@ -1782,14 +1782,6 @@
"fr": "Privé",
"tr": "Özel"
},
"ERROR_MESSAGE$SHOW_DETAILS": {
"en": "Show details",
"es": "Mostrar detalles"
},
"ERROR_MESSAGE$HIDE_DETAILS": {
"en": "Hide details",
"es": "Ocultar detalles"
},
"STATUS$STARTING_RUNTIME": {
"en": "Starting Runtime...",
"zh-CN": "启动运行时...",
@@ -2012,5 +2004,35 @@
"PROJECT_MENU_CARD_CONTEXT_MENU$DOWNLOAD_AS_ZIP_LABEL": {
"en": "Download as .zip",
"es": "Descargar como .zip"
},
"ACTION_MESSAGE$RUN": {
"en": "Running a bash command"
},
"ACTION_MESSAGE$RUN_IPYTHON": {
"en": "Running a Jupyter command"
},
"ACTION_MESSAGE$READ": {
"en": "Reading the contents of a file"
},
"ACTION_MESSAGE$WRITE": {
"en": "Writing to a file"
},
"OBSERVATION_MESSAGE$RUN": {
"en": "Ran a bash command"
},
"OBSERVATION_MESSAGE$RUN_IPYTHON": {
"en": "Ran a Jupyter command"
},
"OBSERVATION_MESSAGE$READ": {
"en": "Read the contents of a file"
},
"OBSERVATION_MESSAGE$WRITE": {
"en": "Wrote to a file"
},
"EXPANDABLE_MESSAGE$SHOW_DETAILS": {
"en": "Show details"
},
"EXPANDABLE_MESSAGE$HIDE_DETAILS": {
"en": "Hide details"
}
}
+1
View File
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 374.6c12.5 12.5 32.8 12.5 45.3 0l160-160c12.5-12.5 12.5-32.8 0-45.3s-32.8-12.5-45.3 0L224 306.7 86.6 169.4c-12.5-12.5-32.8-12.5-45.3 0s-12.5 32.8 0 45.3l160 160z"/></svg>

After

Width:  |  Height:  |  Size: 400 B

+1
View File
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--!Font Awesome Free 6.7.1 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free Copyright 2024 Fonticons, Inc.--><path d="M201.4 137.4c12.5-12.5 32.8-12.5 45.3 0l160 160c12.5 12.5 12.5 32.8 0 45.3s-32.8 12.5-45.3 0L224 205.3 86.6 342.6c-12.5 12.5-32.8 12.5-45.3 0s-12.5-32.8 0-45.3l160-160z"/></svg>

After

Width:  |  Height:  |  Size: 400 B

+4 -7
View File
@@ -1,13 +1,10 @@
type Message = {
sender: "user" | "assistant";
content: string;
imageUrls: string[];
timestamp: string;
imageUrls?: string[];
type?: "thought" | "error" | "action";
pending?: boolean;
};
type ErrorMessage = {
error: boolean;
id?: string;
message: string;
translationID?: string;
eventID?: number;
};
+82 -20
View File
@@ -1,5 +1,6 @@
import {
addAssistantMessage,
addAssistantAction,
addUserMessage,
addErrorMessage,
} from "#/state/chat-slice";
@@ -50,29 +51,9 @@ const messageActions = {
pending: false,
}),
);
} else {
store.dispatch(addAssistantMessage(message.args.content));
}
},
[ActionType.FINISH]: (message: ActionMessage) => {
store.dispatch(addAssistantMessage(message.message));
},
[ActionType.REJECT]: (message: ActionMessage) => {
store.dispatch(addAssistantMessage(message.message));
},
[ActionType.DELEGATE]: (message: ActionMessage) => {
store.dispatch(addAssistantMessage(message.message));
},
[ActionType.RUN]: (message: ActionMessage) => {
if (message.args.hidden) return;
if (message.args.thought) {
store.dispatch(addAssistantMessage(message.args.thought));
}
},
[ActionType.RUN_IPYTHON]: (message: ActionMessage) => {
if (message.args.thought) {
store.dispatch(addAssistantMessage(message.args.thought));
}
if (message.args.confirmation_state !== "rejected") {
store.dispatch(appendJupyterInput(message.args.code));
}
@@ -124,6 +105,87 @@ export function handleActionMessage(message: ActionMessage) {
return;
}
if (message.source !== "user" && !message.args?.hidden) {
if (message.args && message.args.thought) {
store.dispatch(addAssistantMessage(message.args.thought));
}
// Convert the message to a properly typed action
const baseAction = {
...message,
source: "agent" as const,
args: {
...message.args,
thought: message.args?.thought || message.message || "",
},
};
// Cast to the appropriate action type based on the action field
switch (message.action) {
case "run":
store.dispatch(
addAssistantAction({
...baseAction,
action: "run" as const,
args: {
command: String(message.args?.command || ""),
confirmation_state: (message.args?.confirmation_state ||
"confirmed") as
| "confirmed"
| "rejected"
| "awaiting_confirmation",
thought: String(message.args?.thought || message.message || ""),
hidden: Boolean(message.args?.hidden),
},
}),
);
break;
case "message":
store.dispatch(
addAssistantAction({
...baseAction,
action: "message" as const,
args: {
content: String(message.args?.content || message.message || ""),
image_urls: Array.isArray(message.args?.image_urls)
? message.args.image_urls
: null,
wait_for_response: Boolean(message.args?.wait_for_response),
},
}),
);
break;
case "run_ipython":
store.dispatch(
addAssistantAction({
...baseAction,
action: "run_ipython" as const,
args: {
code: String(message.args?.code || ""),
confirmation_state: (message.args?.confirmation_state ||
"confirmed") as
| "confirmed"
| "rejected"
| "awaiting_confirmation",
kernel_init_code: String(message.args?.kernel_init_code || ""),
thought: String(message.args?.thought || message.message || ""),
},
}),
);
break;
default:
// For other action types, ensure we have the required thought property
store.dispatch(
addAssistantAction({
...baseAction,
action: "reject" as const,
args: {
thought: String(message.args?.thought || message.message || ""),
},
}),
);
}
}
if (message.action in messageActions) {
const actionFn =
messageActions[message.action as keyof typeof messageActions];
+121 -1
View File
@@ -2,10 +2,14 @@ import { setCurrentAgentState } from "#/state/agent-slice";
import { setUrl, setScreenshotSrc } from "#/state/browser-slice";
import store from "#/store";
import { ObservationMessage } from "#/types/message";
import AgentState from "#/types/agent-state";
import { appendOutput } from "#/state/command-slice";
import { appendJupyterOutput } from "#/state/jupyter-slice";
import ObservationType from "#/types/observation-type";
import { addAssistantMessage } from "#/state/chat-slice";
import {
addAssistantMessage,
addAssistantObservation,
} from "#/state/chat-slice";
export function handleObservationMessage(message: ObservationMessage) {
switch (message.observation) {
@@ -46,4 +50,120 @@ export function handleObservationMessage(message: ObservationMessage) {
store.dispatch(addAssistantMessage(message.message));
break;
}
if (!message.extras?.hidden) {
// Convert the message to the appropriate observation type
const { observation } = message;
const baseObservation = {
...message,
source: "agent" as const,
};
switch (observation) {
case "agent_state_changed":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "agent_state_changed" as const,
extras: {
agent_state: (message.extras.agent_state as AgentState) || "idle",
},
}),
);
break;
case "run":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "run" as const,
extras: {
command: String(message.extras.command || ""),
command_id: Number(message.extras.command_id || 0),
exit_code: Number(message.extras.exit_code || 0),
hidden: Boolean(message.extras.hidden),
},
}),
);
break;
case "run_ipython":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "run_ipython" as const,
extras: {
code: String(message.extras.code || ""),
},
}),
);
break;
case "delegate":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "delegate" as const,
extras: {
outputs:
typeof message.extras.outputs === "object"
? (message.extras.outputs as Record<string, unknown>)
: {},
},
}),
);
break;
case "browse":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "browse" as const,
extras: {
url: String(message.extras.url || ""),
screenshot: String(message.extras.screenshot || ""),
error: Boolean(message.extras.error),
open_page_urls: Array.isArray(message.extras.open_page_urls)
? message.extras.open_page_urls
: [],
active_page_index: Number(message.extras.active_page_index || 0),
dom_object:
typeof message.extras.dom_object === "object"
? (message.extras.dom_object as Record<string, unknown>)
: {},
axtree_object:
typeof message.extras.axtree_object === "object"
? (message.extras.axtree_object as Record<string, unknown>)
: {},
extra_element_properties:
typeof message.extras.extra_element_properties === "object"
? (message.extras.extra_element_properties as Record<
string,
unknown
>)
: {},
last_browser_action: String(
message.extras.last_browser_action || "",
),
last_browser_action_error:
message.extras.last_browser_action_error,
focused_element_bid: String(
message.extras.focused_element_bid || "",
),
},
}),
);
break;
case "error":
store.dispatch(
addAssistantObservation({
...baseObservation,
observation: "error" as const,
source: "user" as const,
extras: {
error_id: message.extras.error_id,
},
}),
);
break;
default:
// For any unhandled observation types, just ignore them
break;
}
}
}
+79 -2
View File
@@ -1,6 +1,13 @@
import { createSlice, PayloadAction } from "@reduxjs/toolkit";
type SliceState = { messages: (Message | ErrorMessage)[] };
import { OpenHandsObservation } from "#/types/core/observations";
import { OpenHandsAction } from "#/types/core/actions";
type SliceState = { messages: Message[] };
const MAX_CONTENT_LENGTH = 1000;
const HANDLED_ACTIONS = ["run", "run_ipython", "write", "read"];
const initialState: SliceState = {
messages: [],
@@ -20,6 +27,7 @@ export const chatSlice = createSlice({
}>,
) {
const message: Message = {
type: "thought",
sender: "user",
content: action.payload.content,
imageUrls: action.payload.imageUrls,
@@ -40,6 +48,7 @@ export const chatSlice = createSlice({
addAssistantMessage(state, action: PayloadAction<string>) {
const message: Message = {
type: "thought",
sender: "assistant",
content: action.payload,
imageUrls: [],
@@ -49,12 +58,78 @@ export const chatSlice = createSlice({
state.messages.push(message);
},
addAssistantAction(state, action: PayloadAction<OpenHandsAction>) {
const actionID = action.payload.action;
if (!HANDLED_ACTIONS.includes(actionID)) {
return;
}
const translationID = `ACTION_MESSAGE$${actionID.toUpperCase()}`;
let text = "";
if (actionID === "run") {
text = `\`${action.payload.args.command}\``;
} else if (actionID === "run_ipython") {
text = `\`\`\`\n${action.payload.args.code}\n\`\`\``;
} else if (actionID === "write") {
let { content } = action.payload.args;
if (content.length > MAX_CONTENT_LENGTH) {
content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
}
text = `${action.payload.args.path}\n${content}`;
} else if (actionID === "read") {
text = action.payload.args.path;
}
const message: Message = {
type: "action",
sender: "assistant",
translationID,
eventID: action.payload.id,
content: text,
imageUrls: [],
timestamp: new Date().toISOString(),
};
state.messages.push(message);
},
addAssistantObservation(
state,
observation: PayloadAction<OpenHandsObservation>,
) {
const observationID = observation.payload.observation;
if (!HANDLED_ACTIONS.includes(observationID)) {
return;
}
const translationID = `OBSERVATION_MESSAGE$${observationID.toUpperCase()}`;
const causeID = observation.payload.cause;
const causeMessage = state.messages.find(
(message) => message.eventID === causeID,
);
if (!causeMessage) {
return;
}
causeMessage.translationID = translationID;
if (observationID === "run" || observationID === "run_ipython") {
let { content } = observation.payload;
if (content.length > MAX_CONTENT_LENGTH) {
content = `${content.slice(0, MAX_CONTENT_LENGTH)}...`;
}
content = `\`\`\`\n${content}\n\`\`\``;
causeMessage.content = content; // Observation content includes the action
}
},
addErrorMessage(
state,
action: PayloadAction<{ id?: string; message: string }>,
) {
const { id, message } = action.payload;
state.messages.push({ id, message, error: true });
console.log("add err message", id, message);
state.messages.push({
translationID: id,
content: message,
type: "error",
sender: "assistant",
timestamp: new Date().toISOString(),
});
},
clearMessages(state) {
@@ -66,6 +141,8 @@ export const chatSlice = createSlice({
export const {
addUserMessage,
addAssistantMessage,
addAssistantAction,
addAssistantObservation,
addErrorMessage,
clearMessages,
} = chatSlice.actions;
+19
View File
@@ -96,6 +96,23 @@ export interface ModifyTaskAction extends OpenHandsActionEvent<"modify_task"> {
};
}
export interface FileReadAction extends OpenHandsActionEvent<"read"> {
source: "agent";
args: {
path: string;
thought: string;
};
}
export interface FileWriteAction extends OpenHandsActionEvent<"write"> {
source: "agent";
args: {
path: string;
content: string;
thought: string;
};
}
export interface RejectAction extends OpenHandsActionEvent<"reject"> {
source: "agent";
args: {
@@ -112,6 +129,8 @@ export type OpenHandsAction =
| DelegateAction
| BrowseAction
| BrowseInteractiveAction
| FileReadAction
| FileWriteAction
| AddTaskAction
| ModifyTaskAction
| RejectAction;
+3 -1
View File
@@ -1,7 +1,9 @@
type OpenHandsEventType =
export type OpenHandsEventType =
| "message"
| "agent_state_changed"
| "run"
| "read"
| "write"
| "run_ipython"
| "delegate"
| "browse"
+6 -1
View File
@@ -1,6 +1,8 @@
export interface ActionMessage {
id: number;
// Either 'agent' or 'user'
source: string;
source: "agent" | "user";
// The action to be taken
action: string;
@@ -19,6 +21,9 @@ export interface ObservationMessage {
// The type of observation
observation: string;
id: number;
cause: number;
// The observed data
content: string;