Lorenze/improve tool response pt2 (#4297)

* no need for post-tool reflection on native tools

* refactor: update prompt generation to prevent thought leakage

- Modified the prompt structure to ensure agents without tools use a simplified format, avoiding ReAct instructions.
- Introduced a new 'task_no_tools' slice for agents lacking tools, ensuring clean output without Thought: prefixes.
- Enhanced test coverage to verify that prompts do not encourage thought leakage, ensuring outputs remain focused and direct.
- Added integration tests to validate that real LLM calls produce clean outputs without internal reasoning artifacts.

* don't forget the cassettes
Lorenze Jay authored on 2026-01-28 16:53:19 -08:00; committed by GitHub
parent a731efac8d → commit 2d05e59223
6 changed files with 476 additions and 18 deletions


@@ -819,15 +819,6 @@ class AgentExecutor(Flow[AgentReActState], CrewAgentExecutorMixin):
             self.state.is_finished = True
             return "tool_result_is_final"
-        # Add reflection prompt once after all tools in the batch
-        reasoning_prompt = self._i18n.slice("post_tool_reasoning")
-        reasoning_message: LLMMessage = {
-            "role": "user",
-            "content": reasoning_prompt,
-        }
-        self.state.messages.append(reasoning_message)
         return "native_tool_completed"

     def _extract_tool_name(self, tool_call: Any) -> str:
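For context, a minimal self-contained sketch of the behavior change (hypothetical names, not the actual AgentExecutor API; the reflection text is copied from the "post_tool_reasoning" slice in the translations diff below). Before this commit the executor appended an extra "reflect on the tool result" user turn after each batch of native tool calls; now it routes straight back to the LLM with the raw tool results:

    from typing import TypedDict

    class LLMMessage(TypedDict):
        role: str
        content: str

    def complete_native_batch(messages: list[LLMMessage], inject_reflection: bool) -> str:
        """Return the router label emitted after a batch of native tool calls."""
        if inject_reflection:  # old behavior, removed by this commit
            messages.append({
                "role": "user",
                "content": "Analyze the tool result. If requirements are met, provide the Final Answer.",
            })
        return "native_tool_completed"

    history: list[LLMMessage] = []
    assert complete_native_batch(history, inject_reflection=False) == "native_tool_completed"
    assert history == []  # no extra reflection turn is appended anymore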


@@ -10,9 +10,10 @@
"memory": "\n\n# Useful context: \n{memory}",
"role_playing": "You are {role}. {backstory}\nYour personal goal is: {goal}",
"tools": "\nYou ONLY have access to the following tools, and should NEVER make up tools that are not listed here:\n\n{tools}\n\nIMPORTANT: Use the following format in your response:\n\n```\nThought: you should always think about what to do\nAction: the action to take, only one name of [{tool_names}], just the name, exactly as it's written.\nAction Input: the input to the action, just a simple JSON object, enclosed in curly braces, using \" to wrap keys and values.\nObservation: the result of the action\n```\n\nOnce all necessary information is gathered, return the following format:\n\n```\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n```",
"no_tools": "\nTo give my best complete final answer to the task respond using the exact following format:\n\nThought: I now can give a great answer\nFinal Answer: Your final answer must be the great and the most complete as possible, it must be outcome described.\n\nI MUST use these formats, my job depends on it!",
"native_tools": "\nUse available tools to gather information and complete your task.",
"native_task": "\nCurrent Task: {input}\n\nThis is VERY important to you, your job depends on it!",
"no_tools": "",
"task_no_tools": "\nCurrent Task: {input}\n\nProvide your complete response:",
"native_tools": "",
"native_task": "\nCurrent Task: {input}",
"post_tool_reasoning": "Analyze the tool result. If requirements are met, provide the Final Answer. Otherwise, call the next tool. Deliver only the answer without meta-commentary.",
"format": "Decide if you need a tool or can provide the final answer. Use one at a time.\nTo use a tool, use:\nThought: [reasoning]\nAction: [name from {tool_names}]\nAction Input: [JSON object]\n\nTo provide the final answer, use:\nThought: [reasoning]\nFinal Answer: [complete response]",
"final_answer_format": "If you don't need to use any more tools, you must give your best complete final answer, make sure it satisfies the expected criteria, use the EXACT format below:\n\n```\nThought: I now can give a great answer\nFinal Answer: my best complete final answer to the task.\n\n```",


@@ -23,7 +23,13 @@ class SystemPromptResult(StandardPromptResult):
 COMPONENTS = Literal[
-    "role_playing", "tools", "no_tools", "native_tools", "task", "native_task"
+    "role_playing",
+    "tools",
+    "no_tools",
+    "native_tools",
+    "task",
+    "native_task",
+    "task_no_tools",
 ]
@@ -74,11 +80,14 @@ class Prompts(BaseModel):
             slices.append("no_tools")
         system: str = self._build_prompt(slices)
-        # Use native_task for native tool calling (no "Thought:" prompt)
-        # Use task for ReAct pattern (includes "Thought:" prompt)
-        task_slice: COMPONENTS = (
-            "native_task" if self.use_native_tool_calling else "task"
-        )
+        # Determine which task slice to use:
+        task_slice: COMPONENTS
+        if self.use_native_tool_calling:
+            task_slice = "native_task"
+        elif self.has_tools:
+            task_slice = "task"
+        else:
+            task_slice = "task_no_tools"
         slices.append(task_slice)
         if (
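The three-way selection above can be summarized standalone (an illustrative re-implementation of just the branching, not the Prompts class itself):

    def select_task_slice(use_native_tool_calling: bool, has_tools: bool) -> str:
        if use_native_tool_calling:
            return "native_task"  # native tool calling: no "Thought:" scaffolding
        if has_tools:
            return "task"  # ReAct loop keeps the Thought/Action/Observation format
        return "task_no_tools"  # no tools: plain task prompt, nothing to leak

    assert select_task_slice(True, True) == "native_task"
    assert select_task_slice(True, False) == "native_task"
    assert select_task_slice(False, True) == "task"
    assert select_task_slice(False, False) == "task_no_tools"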


@@ -0,0 +1,112 @@
interactions:
- request:
body: '{"messages":[{"role":"system","content":"You are Language Detector. You
are an expert linguist who can identify languages.\nYour personal goal is: Detect
the language of text"},{"role":"user","content":"\nCurrent Task: What language
is this text written in: ''Hello, how are you?''\n\nThis is the expected criteria
for your final answer: The detected language (e.g., English, Spanish, etc.)\nyou
MUST return the actual complete content as the final answer, not a summary.\n\nProvide
your complete response:"}],"model":"gpt-4o-mini"}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
authorization:
- AUTHORIZATION-XXX
connection:
- keep-alive
content-length:
- '530'
content-type:
- application/json
host:
- api.openai.com
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 1.83.0
x-stainless-read-timeout:
- X-STAINLESS-READ-TIMEOUT-XXX
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.13.3
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: "{\n \"id\": \"chatcmpl-D39bkotgEapBcz1sSIXvhPhK9G7FD\",\n \"object\":
\"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"English\",\n \"refusal\": null,\n
\ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\":
\"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 101,\n \"completion_tokens\":
1,\n \"total_tokens\": 102,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n"
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Wed, 28 Jan 2026 23:51:28 GMT
Server:
- cloudflare
Set-Cookie:
- SET-COOKIE-XXX
Strict-Transport-Security:
- STS-XXX
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
access-control-expose-headers:
- ACCESS-CONTROL-XXX
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- OPENAI-ORG-XXX
openai-processing-ms:
- '279'
openai-project:
- OPENAI-PROJECT-XXX
openai-version:
- '2020-10-01'
x-openai-proxy-wasm:
- v0.1
x-ratelimit-limit-requests:
- X-RATELIMIT-LIMIT-REQUESTS-XXX
x-ratelimit-limit-tokens:
- X-RATELIMIT-LIMIT-TOKENS-XXX
x-ratelimit-remaining-requests:
- X-RATELIMIT-REMAINING-REQUESTS-XXX
x-ratelimit-remaining-tokens:
- X-RATELIMIT-REMAINING-TOKENS-XXX
x-ratelimit-reset-requests:
- X-RATELIMIT-RESET-REQUESTS-XXX
x-ratelimit-reset-tokens:
- X-RATELIMIT-RESET-TOKENS-XXX
x-request-id:
- X-REQUEST-ID-XXX
status:
code: 200
message: OK
version: 1


@@ -0,0 +1,111 @@
interactions:
- request:
body: '{"messages":[{"role":"system","content":"You are Classifier. You classify
text sentiment accurately.\nYour personal goal is: Classify text sentiment"},{"role":"user","content":"\nCurrent
Task: Classify the sentiment of: ''I love this product!''\n\nThis is the expected
criteria for your final answer: One word: positive, negative, or neutral\nyou
MUST return the actual complete content as the final answer, not a summary.\n\nProvide
your complete response:"}],"model":"gpt-4o-mini"}'
headers:
User-Agent:
- X-USER-AGENT-XXX
accept:
- application/json
accept-encoding:
- ACCEPT-ENCODING-XXX
authorization:
- AUTHORIZATION-XXX
connection:
- keep-alive
content-length:
- '481'
content-type:
- application/json
host:
- api.openai.com
x-stainless-arch:
- X-STAINLESS-ARCH-XXX
x-stainless-async:
- 'false'
x-stainless-lang:
- python
x-stainless-os:
- X-STAINLESS-OS-XXX
x-stainless-package-version:
- 1.83.0
x-stainless-read-timeout:
- X-STAINLESS-READ-TIMEOUT-XXX
x-stainless-retry-count:
- '0'
x-stainless-runtime:
- CPython
x-stainless-runtime-version:
- 3.13.3
method: POST
uri: https://api.openai.com/v1/chat/completions
response:
body:
string: "{\n \"id\": \"chatcmpl-D39bkVPelOZanWIMBoIyzsuj072sM\",\n \"object\":
\"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
\"assistant\",\n \"content\": \"positive\",\n \"refusal\": null,\n
\ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\":
\"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 89,\n \"completion_tokens\":
1,\n \"total_tokens\": 90,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
\"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n"
headers:
CF-RAY:
- CF-RAY-XXX
Connection:
- keep-alive
Content-Type:
- application/json
Date:
- Wed, 28 Jan 2026 23:51:29 GMT
Server:
- cloudflare
Set-Cookie:
- SET-COOKIE-XXX
Strict-Transport-Security:
- STS-XXX
Transfer-Encoding:
- chunked
X-Content-Type-Options:
- X-CONTENT-TYPE-XXX
access-control-expose-headers:
- ACCESS-CONTROL-XXX
alt-svc:
- h3=":443"; ma=86400
cf-cache-status:
- DYNAMIC
openai-organization:
- OPENAI-ORG-XXX
openai-processing-ms:
- '323'
openai-project:
- OPENAI-PROJECT-XXX
openai-version:
- '2020-10-01'
x-openai-proxy-wasm:
- v0.1
x-ratelimit-limit-requests:
- X-RATELIMIT-LIMIT-REQUESTS-XXX
x-ratelimit-limit-tokens:
- X-RATELIMIT-LIMIT-TOKENS-XXX
x-ratelimit-remaining-requests:
- X-RATELIMIT-REMAINING-REQUESTS-XXX
x-ratelimit-remaining-tokens:
- X-RATELIMIT-REMAINING-TOKENS-XXX
x-ratelimit-reset-requests:
- X-RATELIMIT-RESET-REQUESTS-XXX
x-ratelimit-reset-tokens:
- X-RATELIMIT-RESET-TOKENS-XXX
x-request-id:
- X-REQUEST-ID-XXX
status:
code: 200
message: OK
version: 1


@@ -0,0 +1,234 @@
"""Tests for prompt generation to prevent thought leakage.
These tests verify that:
1. Agents without tools don't get ReAct format instructions
2. The generated prompts don't encourage "Thought:" prefixes that leak into output
3. Real LLM calls produce clean output without internal reasoning
"""
from __future__ import annotations
from unittest.mock import MagicMock
import pytest
from crewai import Agent, Crew, Task
from crewai.llm import LLM
from crewai.utilities.prompts import Prompts
class TestNoToolsPromptGeneration:
"""Tests for prompt generation when agent has no tools."""
def test_no_tools_uses_task_no_tools_slice(self) -> None:
"""Test that agents without tools use task_no_tools slice instead of task."""
mock_agent = MagicMock()
mock_agent.role = "Test Agent"
mock_agent.goal = "Test goal"
mock_agent.backstory = "Test backstory"
prompts = Prompts(
has_tools=False,
use_native_tool_calling=False,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
# Verify it's a SystemPromptResult with system and user keys
assert "system" in result
assert "user" in result
assert "prompt" in result
# The user prompt should NOT contain "Thought:" (ReAct format)
assert "Thought:" not in result["user"]
# The user prompt should NOT mention tools
assert "use the tools available" not in result["user"]
assert "tools available" not in result["user"].lower()
# The system prompt should NOT contain ReAct format instructions
assert "Thought:" not in result["system"]
assert "Final Answer:" not in result["system"]
def test_no_tools_prompt_is_simple(self) -> None:
"""Test that no-tools prompt is simple and direct."""
mock_agent = MagicMock()
mock_agent.role = "Language Detector"
mock_agent.goal = "Detect language"
mock_agent.backstory = "Expert linguist"
prompts = Prompts(
has_tools=False,
use_native_tool_calling=False,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
# Should contain the role playing info
assert "Language Detector" in result["system"]
# User prompt should be simple with just the task
assert "Current Task:" in result["user"]
assert "Provide your complete response:" in result["user"]
def test_with_tools_uses_task_slice_with_react(self) -> None:
"""Test that agents WITH tools use the task slice (ReAct format)."""
mock_agent = MagicMock()
mock_agent.role = "Test Agent"
mock_agent.goal = "Test goal"
mock_agent.backstory = "Test backstory"
prompts = Prompts(
has_tools=True,
use_native_tool_calling=False,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
# With tools and ReAct, the prompt SHOULD contain Thought:
assert "Thought:" in result["user"]
def test_native_tools_uses_native_task_slice(self) -> None:
"""Test that native tool calling uses native_task slice."""
mock_agent = MagicMock()
mock_agent.role = "Test Agent"
mock_agent.goal = "Test goal"
mock_agent.backstory = "Test backstory"
prompts = Prompts(
has_tools=True,
use_native_tool_calling=True,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
# Native tool calling should NOT have Thought: in user prompt
assert "Thought:" not in result["user"]
# Should NOT have emotional manipulation
assert "your job depends on it" not in result["user"]
class TestNoThoughtLeakagePatterns:
"""Tests to verify prompts don't encourage thought leakage."""
def test_no_job_depends_on_it_in_no_tools(self) -> None:
"""Test that 'your job depends on it' is not in no-tools prompts."""
mock_agent = MagicMock()
mock_agent.role = "Test"
mock_agent.goal = "Test"
mock_agent.backstory = "Test"
prompts = Prompts(
has_tools=False,
use_native_tool_calling=False,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
full_prompt = result["prompt"]
assert "your job depends on it" not in full_prompt.lower()
assert "i must use these formats" not in full_prompt.lower()
def test_no_job_depends_on_it_in_native_task(self) -> None:
"""Test that 'your job depends on it' is not in native task prompts."""
mock_agent = MagicMock()
mock_agent.role = "Test"
mock_agent.goal = "Test"
mock_agent.backstory = "Test"
prompts = Prompts(
has_tools=True,
use_native_tool_calling=True,
use_system_prompt=True,
agent=mock_agent,
)
result = prompts.task_execution()
full_prompt = result["prompt"]
assert "your job depends on it" not in full_prompt.lower()
class TestRealLLMNoThoughtLeakage:
"""Integration tests with real LLM calls to verify no thought leakage."""
@pytest.mark.vcr()
def test_agent_without_tools_no_thought_in_output(self) -> None:
"""Test that agent without tools produces clean output without 'Thought:' prefix."""
agent = Agent(
role="Language Detector",
goal="Detect the language of text",
backstory="You are an expert linguist who can identify languages.",
tools=[], # No tools
llm=LLM(model="gpt-4o-mini"),
verbose=False,
)
task = Task(
description="What language is this text written in: 'Hello, how are you?'",
expected_output="The detected language (e.g., English, Spanish, etc.)",
agent=agent,
)
crew = Crew(agents=[agent], tasks=[task])
result = crew.kickoff()
assert result is not None
assert result.raw is not None
# The output should NOT start with "Thought:" or contain ReAct artifacts
output = str(result.raw)
assert not output.strip().startswith("Thought:")
assert "Final Answer:" not in output
assert "I now can give a great answer" not in output
# Should contain an actual answer about the language
assert any(
lang in output.lower()
for lang in ["english", "en", "language"]
)
@pytest.mark.vcr()
def test_simple_task_clean_output(self) -> None:
"""Test that a simple task produces clean output without internal reasoning."""
agent = Agent(
role="Classifier",
goal="Classify text sentiment",
backstory="You classify text sentiment accurately.",
tools=[],
llm=LLM(model="gpt-4o-mini"),
verbose=False,
)
task = Task(
description="Classify the sentiment of: 'I love this product!'",
expected_output="One word: positive, negative, or neutral",
agent=agent,
)
crew = Crew(agents=[agent], tasks=[task])
result = crew.kickoff()
assert result is not None
output = str(result.raw).strip().lower()
# Output should be clean - just the classification
assert not output.startswith("thought:")
assert "final answer:" not in output
# Should contain the actual classification
assert any(
sentiment in output
for sentiment in ["positive", "negative", "neutral"]
)