fix: handle token limits and estimate token count for llm calls (#9880)

👋 Hi there! This PR was automatically generated by Autofix 🤖 This fix was triggered by Toran Bruce Richards. Fixes [AUTOGPT-SERVER-1ZY](https://sentry.io/organizations/significant-gravitas/issues/6386687527/). The issue was that: `llm_call` calculates `max_tokens` without considering `input_tokens`, causing OpenRouter API errors when the context window is exceeded. - Implements a function `estimate_token_count` to estimate the number of tokens in a list of messages. - Calculates available tokens based on the context window, estimated input tokens, and user-defined max tokens. - Adjusts `max_tokens` for LLM calls to prevent exceeding context window limits. - Reduces `max_tokens` by 15% and retries if a token limit error is encountered during LLM calls. If you have any questions or feedback for the Sentry team about this fix, please email [autofix@sentry.io](mailto:autofix@sentry.io) with the Run ID: 32838. --------- Co-authored-by: sentry-autofix[bot] <157164994+sentry-autofix[bot]@users.noreply.github.com> Co-authored-by: Krzysztof Czerwinski <kpczerwinski@gmail.com>
2026-04-08 03:00:28 -04:00 · 2025-04-25 14:45:47 +01:00
parent ef022720d5
commit 9715ea5313
1 changed files with 26 additions and 1 deletions
--- a/autogpt_platform/backend/backend/blocks/llm.py
+++ b/autogpt_platform/backend/backend/blocks/llm.py
@@ -288,6 +288,13 @@ def convert_openai_tool_fmt_to_anthropic(
    return anthropic_tools


+def estimate_token_count(prompt_messages: list[dict]) -> int:
+    char_count = sum(len(str(msg.get("content", ""))) for msg in prompt_messages)
+    message_overhead = len(prompt_messages) * 4
+    estimated_tokens = (char_count // 4) + message_overhead
+    return int(estimated_tokens * 1.2)
+
+
 def llm_call(
    credentials: APIKeyCredentials,
    llm_model: LlmModel,
@@ -319,7 +326,14 @@ def llm_call(
            - completion_tokens: The number of tokens used in the completion.
    """
    provider = llm_model.metadata.provider
-    max_tokens = max_tokens or llm_model.max_output_tokens or 4096
+
+    # Calculate available tokens based on context window and input length
+    estimated_input_tokens = estimate_token_count(prompt)
+    context_window = llm_model.context_window
+    model_max_output = llm_model.max_output_tokens or 4096
+    user_max = max_tokens or model_max_output
+    available_tokens = max(context_window - estimated_input_tokens, 0)
+    max_tokens = max(min(available_tokens, model_max_output, user_max), 0)

    if provider == "openai":
        tools_param = tools if tools else openai.NOT_GIVEN
@@ -475,6 +489,7 @@ def llm_call(
            model=llm_model.value,
            prompt=f"{sys_messages}\n\n{usr_messages}",
            stream=False,
+            options={"num_ctx": max_tokens},
        )
        return LLMResponse(
            raw_response=response.get("response") or "",
@@ -773,6 +788,16 @@ class AIStructuredResponseGeneratorBlock(AIBlockBase):
                prompt.append({"role": "user", "content": retry_prompt})
            except Exception as e:
                logger.exception(f"Error calling LLM: {e}")
+                if (
+                    "maximum context length" in str(e).lower()
+                    or "token limit" in str(e).lower()
+                ):
+                    if input_data.max_tokens is None:
+                        input_data.max_tokens = llm_model.max_output_tokens or 4096
+                    input_data.max_tokens = int(input_data.max_tokens * 0.85)
+                    logger.debug(
+                        f"Reducing max_tokens to {input_data.max_tokens} for next attempt"
+                    )
                retry_prompt = f"Error calling LLM: {e}"
            finally:
                self.merge_stats(