increase timeout

update litellm to latest main
attempt 128k thinking budget
2026-04-29 03:00:45 -04:00 · 2025-03-06 04:22:24 +00:00 · 2025-03-06 01:44:05 +00:00 · 2025-03-05 19:50:02 +00:00 · 2025-03-04 16:34:05 +00:00 · 2025-03-04 15:12:57 +00:00
7 changed files with 61 additions and 20 deletions
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -247,12 +247,12 @@ def prepare_dataset(
            f'Starting evaluation with skipping first {skip_num} instances ({len(dataset)} instances to run).'
        )
        if eval_n_limit and eval_n_limit > 0:
-            # Use fixed random seed 42 for sampling without replacement
-            dataset = dataset.sample(
-                min(eval_n_limit, len(dataset)), random_state=42, replace=False
-            )
+            # First shuffle the entire dataset with a fixed seed
+            shuffled_dataset = dataset.sample(frac=1.0, random_state=42, replace=False)
+            # Then take the first eval_n_limit rows
+            dataset = shuffled_dataset.iloc[:eval_n_limit]
            logger.info(
-                f'Randomly sampling {eval_n_limit} unique instances with random seed 42.'
+                f'Taking first {eval_n_limit} instances from randomly shuffled dataset (seed 42).'
            )
    elif eval_n_limit and eval_n_limit > 0:
        # Use fixed random seed 42 for sampling without replacement
--- a/openhands/core/config/llm_config.py
+++ b/openhands/core/config/llm_config.py
@@ -65,7 +65,9 @@ class LLMConfig(BaseModel):
    retry_multiplier: float = Field(default=2)
    retry_min_wait: int = Field(default=5)
    retry_max_wait: int = Field(default=30)
-    timeout: int | None = Field(default=None)
+    timeout: int | None = Field(
+        default=1200
+    )  # 20 minutes, extended thinking can take a while
    max_message_chars: int = Field(
        default=30_000
    )  # maximum number of characters in an observation's content when sent to the llm
--- a/openhands/core/message.py
+++ b/openhands/core/message.py
@@ -67,6 +67,10 @@ class Message(BaseModel):
    # force string serializer
    force_string_serializer: bool = False

+    # This is unique to Anthropic's claude-3-7-sonnet
+    # it is required to pass it back to the model to produce the next action
+    thinking_blocks: list[dict[str, str]] | None = None
+
    @property
    def contains_image(self) -> bool:
        return any(isinstance(content, ImageContent) for content in self.content)
@@ -96,6 +100,9 @@ class Message(BaseModel):

    def _list_serializer(self) -> dict:
        content: list[dict] = []
+        if self.thinking_blocks is not None:
+            content.extend(self.thinking_blocks)
+
        role_tool_with_prompt_caching = False
        for item in self.content:
            d = item.model_dump()
--- a/openhands/llm/llm.py
+++ b/openhands/llm/llm.py
@@ -139,8 +139,25 @@ class LLM(RetryMixin, DebugMixin):
        # set up the completion function
        kwargs: dict[str, Any] = {
            'temperature': self.config.temperature,
+            'top_p': self.config.top_p,
            'max_completion_tokens': self.config.max_output_tokens,
+            'extra_headers': {'anthropic-beta': 'output-128k-2025-02-19'},
        }
+
+        # if 'claude-3-7-sonnet' in self.config.model:
+        kwargs['thinking'] = {
+            'type': 'enabled',
+            # 'budget_tokens': 30720,  # 32768 - 2048
+            # 'budget_tokens': 62000, # 64000 - 2000
+            'budget_tokens': 120000,  # 128000 - 8000
+        }
+        # kwargs['max_completion_tokens'] = 32768
+        # kwargs['max_completion_tokens'] = 64000
+        kwargs['max_completion_tokens'] = 128000
+        kwargs.pop('temperature')
+        kwargs.pop('top_p')
+        logger.info(f'Setting thinking for {self.config.model} with kwargs: {kwargs}')
+
        if (
            self.config.model.lower() in REASONING_EFFORT_SUPPORTED_MODELS
            or self.config.model.split('/')[-1] in REASONING_EFFORT_SUPPORTED_MODELS
@@ -164,7 +181,6 @@ class LLM(RetryMixin, DebugMixin):
            api_version=self.config.api_version,
            custom_llm_provider=self.config.custom_llm_provider,
            timeout=self.config.timeout,
-            top_p=self.config.top_p,
            drop_params=self.config.drop_params,
            **kwargs,
        )
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -205,6 +205,9 @@ class ConversationMemory:
                if assistant_msg.content is not None
                else [],
                tool_calls=assistant_msg.tool_calls,
+                thinking_blocks=llm_response.choices[0].message.thinking_blocks
+                if hasattr(llm_response.choices[0].message, 'thinking_blocks')
+                else None,
            )
            return []
        elif isinstance(action, AgentFinishAction):
@@ -236,6 +239,15 @@ class ConversationMemory:
                Message(
                    role=role,  # type: ignore[arg-type]
                    content=[TextContent(text=action.thought)],
+                    thinking_blocks=tool_metadata.model_response.choices[
+                        0
+                    ].message.thinking_blocks
+                    if tool_metadata is not None
+                    and hasattr(
+                        tool_metadata.model_response.choices[0].message,
+                        'thinking_blocks',
+                    )
+                    else None,
                )
            ]
        elif isinstance(action, MessageAction):
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.0.0 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -4185,25 +4185,23 @@ types-tqdm = "*"

 [[package]]
 name = "litellm"
-version = "1.61.19"
+version = "1.62.4"
 description = "Library to easily interface with LLM API providers"
 optional = false
-python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,>=3.8"
+python-versions = ">=3.8.1,<4.0, !=3.9.7"
 groups = ["main"]
-files = [
-    {file = "litellm-1.61.19-py3-none-any.whl", hash = "sha256:dc8e4d820f37f042a9ed9572ad0ece6a6b6790d160fdf4c61f4d1b877fba6f67"},
-    {file = "litellm-1.61.19.tar.gz", hash = "sha256:b6726dc8dece7b1a55252685092a0c8039700472bf72d79e5c81d0f63d145e02"},
-]
+files = []
+develop = false

 [package.dependencies]
 aiohttp = "*"
 click = "*"
 httpx = ">=0.23.0"
 importlib-metadata = ">=6.8.0"
-jinja2 = ">=3.1.2,<4.0.0"
-jsonschema = ">=4.22.0,<5.0.0"
+jinja2 = "^3.1.2"
+jsonschema = "^4.22.0"
 openai = ">=1.61.0"
-pydantic = ">=2.0.0,<3.0.0"
+pydantic = "^2.0.0"
 python-dotenv = ">=0.2.0"
 tiktoken = ">=0.7.0"
 tokenizers = "*"
@@ -4212,6 +4210,12 @@ tokenizers = "*"
 extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "resend (>=0.8.0,<0.9.0)"]
 proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=43.0.1,<44.0.0)", "fastapi (>=0.115.5,<0.116.0)", "fastapi-sso (>=0.16.0,<0.17.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "pynacl (>=1.5.0,<2.0.0)", "python-multipart (>=0.0.18,<0.0.19)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.29.0,<0.30.0)", "uvloop (>=0.21.0,<0.22.0)"]

+[package.source]
+type = "git"
+url = "https://github.com/BerriAI/litellm.git"
+reference = "main"
+resolved_reference = "17efbf0ee94d21d9e8fc8a8ce24b5c854490e0ff"
+
 [[package]]
 name = "llama-cloud"
 version = "0.1.12"
@@ -8938,7 +8942,7 @@ files = [

 [package.dependencies]
 greenlet = [
-    {version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
+    {version = "!=0.4.17", markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"},
    {version = "!=0.4.17", optional = true, markers = "python_version < \"3.14\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\") or extra == \"asyncio\""},
 ]
 typing-extensions = ">=4.6.0"
@@ -10855,4 +10859,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.12"
-content-hash = "83da0b681253a79417c9842862cdd102c1ab6e8770d9dd9e0c42bc7994be2cd0"
+content-hash = "2a2c34de062d9292dcbf1e4d2af9cda764b666b654a6d62e7423b01f1966ea61"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@ packages = [
 python = "^3.12"
 datasets = "*"
 pandas = "*"
-litellm = "^1.60.0"
+litellm = { git = "https://github.com/BerriAI/litellm.git", branch = "main" }
 google-generativeai = "*" # To use litellm with Gemini Pro API
 google-api-python-client = "*" # For Google Sheets API
 google-auth-httplib2 = "*" # For Google Sheets authentication
Author	SHA1	Message	Date
Xingyao Wang	c79350fa67	increase timeout	2025-03-06 04:22:24 +00:00
Xingyao Wang	0177cb3d21	update litellm to latest main	2025-03-06 01:44:05 +00:00
Xingyao Wang	764cd72e45	attempt 128k thinking budget	2025-03-05 19:50:02 +00:00
Xingyao Wang	f0339e24cc	improve dataset shuffled	2025-03-04 16:34:05 +00:00
Xingyao Wang	4a07029318	remove the annoying print	2025-03-04 15:12:57 +00:00
Xingyao Wang	b1398f2d03	remove the annoying print	2025-03-04 15:12:50 +00:00
Xingyao Wang	be4dec238f	pass thinking block to the next request	2025-03-04 14:27:40 +00:00
Xingyao Wang	9abeeb8008	bump litellm ver	2025-03-04 13:48:42 +00:00
Xingyao Wang	af9e1896e9	bump litellm version	2025-03-03 18:28:09 +00:00