mirror of
https://github.com/crewAIInc/crewAI.git
synced 2026-04-04 03:00:48 -04:00
Lorenze/improve tool response pt2 (#4297)
* no need for post-tool reflection on native tools * refactor: update prompt generation to prevent thought leakage - Modified the prompt structure to ensure agents without tools use a simplified format, avoiding ReAct instructions. - Introduced a new 'task_no_tools' slice for agents lacking tools, ensuring clean output without "Thought:" prefixes. - Enhanced test coverage to verify that prompts do not encourage thought leakage, ensuring outputs remain focused and direct. - Added integration tests to validate that real LLM calls produce clean outputs without internal reasoning artifacts. * don't forget the cassettes
This commit is contained in:
@@ -819,15 +819,6 @@ class AgentExecutor(Flow[AgentReActState], CrewAgentExecutorMixin):
|
||||
self.state.is_finished = True
|
||||
return "tool_result_is_final"
|
||||
|
||||
# Add reflection prompt once after all tools in the batch
|
||||
reasoning_prompt = self._i18n.slice("post_tool_reasoning")
|
||||
|
||||
reasoning_message: LLMMessage = {
|
||||
"role": "user",
|
||||
"content": reasoning_prompt,
|
||||
}
|
||||
self.state.messages.append(reasoning_message)
|
||||
|
||||
return "native_tool_completed"
|
||||
|
||||
def _extract_tool_name(self, tool_call: Any) -> str:
|
||||
|
||||
@@ -10,9 +10,10 @@
|
||||
"memory": "\n\n# Useful context: \n{memory}",
|
||||
"role_playing": "You are {role}. {backstory}\nYour personal goal is: {goal}",
|
||||
"tools": "\nYou ONLY have access to the following tools, and should NEVER make up tools that are not listed here:\n\n{tools}\n\nIMPORTANT: Use the following format in your response:\n\n```\nThought: you should always think about what to do\nAction: the action to take, only one name of [{tool_names}], just the name, exactly as it's written.\nAction Input: the input to the action, just a simple JSON object, enclosed in curly braces, using \" to wrap keys and values.\nObservation: the result of the action\n```\n\nOnce all necessary information is gathered, return the following format:\n\n```\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n```",
|
||||
"no_tools": "\nTo give my best complete final answer to the task respond using the exact following format:\n\nThought: I now can give a great answer\nFinal Answer: Your final answer must be the great and the most complete as possible, it must be outcome described.\n\nI MUST use these formats, my job depends on it!",
|
||||
"native_tools": "\nUse available tools to gather information and complete your task.",
|
||||
"native_task": "\nCurrent Task: {input}\n\nThis is VERY important to you, your job depends on it!",
|
||||
"no_tools": "",
|
||||
"task_no_tools": "\nCurrent Task: {input}\n\nProvide your complete response:",
|
||||
"native_tools": "",
|
||||
"native_task": "\nCurrent Task: {input}",
|
||||
"post_tool_reasoning": "Analyze the tool result. If requirements are met, provide the Final Answer. Otherwise, call the next tool. Deliver only the answer without meta-commentary.",
|
||||
"format": "Decide if you need a tool or can provide the final answer. Use one at a time.\nTo use a tool, use:\nThought: [reasoning]\nAction: [name from {tool_names}]\nAction Input: [JSON object]\n\nTo provide the final answer, use:\nThought: [reasoning]\nFinal Answer: [complete response]",
|
||||
"final_answer_format": "If you don't need to use any more tools, you must give your best complete final answer, make sure it satisfies the expected criteria, use the EXACT format below:\n\n```\nThought: I now can give a great answer\nFinal Answer: my best complete final answer to the task.\n\n```",
|
||||
|
||||
@@ -23,7 +23,13 @@ class SystemPromptResult(StandardPromptResult):
|
||||
|
||||
|
||||
COMPONENTS = Literal[
|
||||
"role_playing", "tools", "no_tools", "native_tools", "task", "native_task"
|
||||
"role_playing",
|
||||
"tools",
|
||||
"no_tools",
|
||||
"native_tools",
|
||||
"task",
|
||||
"native_task",
|
||||
"task_no_tools",
|
||||
]
|
||||
|
||||
|
||||
@@ -74,11 +80,14 @@ class Prompts(BaseModel):
|
||||
slices.append("no_tools")
|
||||
system: str = self._build_prompt(slices)
|
||||
|
||||
# Use native_task for native tool calling (no "Thought:" prompt)
|
||||
# Use task for ReAct pattern (includes "Thought:" prompt)
|
||||
task_slice: COMPONENTS = (
|
||||
"native_task" if self.use_native_tool_calling else "task"
|
||||
)
|
||||
# Determine which task slice to use:
|
||||
task_slice: COMPONENTS
|
||||
if self.use_native_tool_calling:
|
||||
task_slice = "native_task"
|
||||
elif self.has_tools:
|
||||
task_slice = "task"
|
||||
else:
|
||||
task_slice = "task_no_tools"
|
||||
slices.append(task_slice)
|
||||
|
||||
if (
|
||||
|
||||
@@ -0,0 +1,112 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages":[{"role":"system","content":"You are Language Detector. You
|
||||
are an expert linguist who can identify languages.\nYour personal goal is: Detect
|
||||
the language of text"},{"role":"user","content":"\nCurrent Task: What language
|
||||
is this text written in: ''Hello, how are you?''\n\nThis is the expected criteria
|
||||
for your final answer: The detected language (e.g., English, Spanish, etc.)\nyou
|
||||
MUST return the actual complete content as the final answer, not a summary.\n\nProvide
|
||||
your complete response:"}],"model":"gpt-4o-mini"}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- ACCEPT-ENCODING-XXX
|
||||
authorization:
|
||||
- AUTHORIZATION-XXX
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '530'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
x-stainless-arch:
|
||||
- X-STAINLESS-ARCH-XXX
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- X-STAINLESS-OS-XXX
|
||||
x-stainless-package-version:
|
||||
- 1.83.0
|
||||
x-stainless-read-timeout:
|
||||
- X-STAINLESS-READ-TIMEOUT-XXX
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.13.3
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: "{\n \"id\": \"chatcmpl-D39bkotgEapBcz1sSIXvhPhK9G7FD\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"English\",\n \"refusal\": null,\n
|
||||
\ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\":
|
||||
\"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 101,\n \"completion_tokens\":
|
||||
1,\n \"total_tokens\": 102,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n"
|
||||
headers:
|
||||
CF-RAY:
|
||||
- CF-RAY-XXX
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Wed, 28 Jan 2026 23:51:28 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- SET-COOKIE-XXX
|
||||
Strict-Transport-Security:
|
||||
- STS-XXX
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- X-CONTENT-TYPE-XXX
|
||||
access-control-expose-headers:
|
||||
- ACCESS-CONTROL-XXX
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- OPENAI-ORG-XXX
|
||||
openai-processing-ms:
|
||||
- '279'
|
||||
openai-project:
|
||||
- OPENAI-PROJECT-XXX
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
x-openai-proxy-wasm:
|
||||
- v0.1
|
||||
x-ratelimit-limit-requests:
|
||||
- X-RATELIMIT-LIMIT-REQUESTS-XXX
|
||||
x-ratelimit-limit-tokens:
|
||||
- X-RATELIMIT-LIMIT-TOKENS-XXX
|
||||
x-ratelimit-remaining-requests:
|
||||
- X-RATELIMIT-REMAINING-REQUESTS-XXX
|
||||
x-ratelimit-remaining-tokens:
|
||||
- X-RATELIMIT-REMAINING-TOKENS-XXX
|
||||
x-ratelimit-reset-requests:
|
||||
- X-RATELIMIT-RESET-REQUESTS-XXX
|
||||
x-ratelimit-reset-tokens:
|
||||
- X-RATELIMIT-RESET-TOKENS-XXX
|
||||
x-request-id:
|
||||
- X-REQUEST-ID-XXX
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
@@ -0,0 +1,111 @@
|
||||
interactions:
|
||||
- request:
|
||||
body: '{"messages":[{"role":"system","content":"You are Classifier. You classify
|
||||
text sentiment accurately.\nYour personal goal is: Classify text sentiment"},{"role":"user","content":"\nCurrent
|
||||
Task: Classify the sentiment of: ''I love this product!''\n\nThis is the expected
|
||||
criteria for your final answer: One word: positive, negative, or neutral\nyou
|
||||
MUST return the actual complete content as the final answer, not a summary.\n\nProvide
|
||||
your complete response:"}],"model":"gpt-4o-mini"}'
|
||||
headers:
|
||||
User-Agent:
|
||||
- X-USER-AGENT-XXX
|
||||
accept:
|
||||
- application/json
|
||||
accept-encoding:
|
||||
- ACCEPT-ENCODING-XXX
|
||||
authorization:
|
||||
- AUTHORIZATION-XXX
|
||||
connection:
|
||||
- keep-alive
|
||||
content-length:
|
||||
- '481'
|
||||
content-type:
|
||||
- application/json
|
||||
host:
|
||||
- api.openai.com
|
||||
x-stainless-arch:
|
||||
- X-STAINLESS-ARCH-XXX
|
||||
x-stainless-async:
|
||||
- 'false'
|
||||
x-stainless-lang:
|
||||
- python
|
||||
x-stainless-os:
|
||||
- X-STAINLESS-OS-XXX
|
||||
x-stainless-package-version:
|
||||
- 1.83.0
|
||||
x-stainless-read-timeout:
|
||||
- X-STAINLESS-READ-TIMEOUT-XXX
|
||||
x-stainless-retry-count:
|
||||
- '0'
|
||||
x-stainless-runtime:
|
||||
- CPython
|
||||
x-stainless-runtime-version:
|
||||
- 3.13.3
|
||||
method: POST
|
||||
uri: https://api.openai.com/v1/chat/completions
|
||||
response:
|
||||
body:
|
||||
string: "{\n \"id\": \"chatcmpl-D39bkVPelOZanWIMBoIyzsuj072sM\",\n \"object\":
|
||||
\"chat.completion\",\n \"created\": 1769644288,\n \"model\": \"gpt-4o-mini-2024-07-18\",\n
|
||||
\ \"choices\": [\n {\n \"index\": 0,\n \"message\": {\n \"role\":
|
||||
\"assistant\",\n \"content\": \"positive\",\n \"refusal\": null,\n
|
||||
\ \"annotations\": []\n },\n \"logprobs\": null,\n \"finish_reason\":
|
||||
\"stop\"\n }\n ],\n \"usage\": {\n \"prompt_tokens\": 89,\n \"completion_tokens\":
|
||||
1,\n \"total_tokens\": 90,\n \"prompt_tokens_details\": {\n \"cached_tokens\":
|
||||
0,\n \"audio_tokens\": 0\n },\n \"completion_tokens_details\":
|
||||
{\n \"reasoning_tokens\": 0,\n \"audio_tokens\": 0,\n \"accepted_prediction_tokens\":
|
||||
0,\n \"rejected_prediction_tokens\": 0\n }\n },\n \"service_tier\":
|
||||
\"default\",\n \"system_fingerprint\": \"fp_3683ee3deb\"\n}\n"
|
||||
headers:
|
||||
CF-RAY:
|
||||
- CF-RAY-XXX
|
||||
Connection:
|
||||
- keep-alive
|
||||
Content-Type:
|
||||
- application/json
|
||||
Date:
|
||||
- Wed, 28 Jan 2026 23:51:29 GMT
|
||||
Server:
|
||||
- cloudflare
|
||||
Set-Cookie:
|
||||
- SET-COOKIE-XXX
|
||||
Strict-Transport-Security:
|
||||
- STS-XXX
|
||||
Transfer-Encoding:
|
||||
- chunked
|
||||
X-Content-Type-Options:
|
||||
- X-CONTENT-TYPE-XXX
|
||||
access-control-expose-headers:
|
||||
- ACCESS-CONTROL-XXX
|
||||
alt-svc:
|
||||
- h3=":443"; ma=86400
|
||||
cf-cache-status:
|
||||
- DYNAMIC
|
||||
openai-organization:
|
||||
- OPENAI-ORG-XXX
|
||||
openai-processing-ms:
|
||||
- '323'
|
||||
openai-project:
|
||||
- OPENAI-PROJECT-XXX
|
||||
openai-version:
|
||||
- '2020-10-01'
|
||||
x-openai-proxy-wasm:
|
||||
- v0.1
|
||||
x-ratelimit-limit-requests:
|
||||
- X-RATELIMIT-LIMIT-REQUESTS-XXX
|
||||
x-ratelimit-limit-tokens:
|
||||
- X-RATELIMIT-LIMIT-TOKENS-XXX
|
||||
x-ratelimit-remaining-requests:
|
||||
- X-RATELIMIT-REMAINING-REQUESTS-XXX
|
||||
x-ratelimit-remaining-tokens:
|
||||
- X-RATELIMIT-REMAINING-TOKENS-XXX
|
||||
x-ratelimit-reset-requests:
|
||||
- X-RATELIMIT-RESET-REQUESTS-XXX
|
||||
x-ratelimit-reset-tokens:
|
||||
- X-RATELIMIT-RESET-TOKENS-XXX
|
||||
x-request-id:
|
||||
- X-REQUEST-ID-XXX
|
||||
status:
|
||||
code: 200
|
||||
message: OK
|
||||
version: 1
|
||||
234
lib/crewai/tests/utilities/test_prompts_no_thought_leakage.py
Normal file
234
lib/crewai/tests/utilities/test_prompts_no_thought_leakage.py
Normal file
@@ -0,0 +1,234 @@
|
||||
"""Tests for prompt generation to prevent thought leakage.
|
||||
|
||||
These tests verify that:
|
||||
1. Agents without tools don't get ReAct format instructions
|
||||
2. The generated prompts don't encourage "Thought:" prefixes that leak into output
|
||||
3. Real LLM calls produce clean output without internal reasoning
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from crewai import Agent, Crew, Task
|
||||
from crewai.llm import LLM
|
||||
from crewai.utilities.prompts import Prompts
|
||||
|
||||
|
||||
class TestNoToolsPromptGeneration:
    """Prompt-generation checks for tool-less, ReAct, and native-tool agents."""

    @staticmethod
    def _task_execution_result(
        *,
        has_tools: bool,
        use_native_tool_calling: bool,
        role: str = "Test Agent",
        goal: str = "Test goal",
        backstory: str = "Test backstory",
    ):
        """Build ``Prompts`` around a mocked agent and return ``task_execution()``.

        The agent is a ``MagicMock`` carrying only ``role``, ``goal`` and
        ``backstory``; ``use_system_prompt`` is always ``True``.
        """
        agent_stub = MagicMock()
        agent_stub.role = role
        agent_stub.goal = goal
        agent_stub.backstory = backstory

        builder = Prompts(
            has_tools=has_tools,
            use_native_tool_calling=use_native_tool_calling,
            use_system_prompt=True,
            agent=agent_stub,
        )
        return builder.task_execution()

    def test_no_tools_uses_task_no_tools_slice(self) -> None:
        """An agent without tools must not receive ReAct-style instructions."""
        generated = self._task_execution_result(
            has_tools=False,
            use_native_tool_calling=False,
        )

        # The result is expected to expose the system, user and combined prompt.
        for expected_key in ("system", "user", "prompt"):
            assert expected_key in generated

        user_text = generated["user"]
        system_text = generated["system"]

        # No ReAct marker in the user prompt.
        assert "Thought:" not in user_text

        # No reference to tools in the user prompt.
        assert "use the tools available" not in user_text
        assert "tools available" not in user_text.lower()

        # No ReAct format instructions in the system prompt either.
        assert "Thought:" not in system_text
        assert "Final Answer:" not in system_text

    def test_no_tools_prompt_is_simple(self) -> None:
        """The tool-less prompt carries the role and a plain task request."""
        generated = self._task_execution_result(
            has_tools=False,
            use_native_tool_calling=False,
            role="Language Detector",
            goal="Detect language",
            backstory="Expert linguist",
        )

        # Role-playing information lands in the system prompt.
        assert "Language Detector" in generated["system"]

        # The user prompt is just the task plus the closing instruction.
        user_text = generated["user"]
        assert "Current Task:" in user_text
        assert "Provide your complete response:" in user_text

    def test_with_tools_uses_task_slice_with_react(self) -> None:
        """An agent with (non-native) tools keeps the ReAct ``Thought:`` cue."""
        generated = self._task_execution_result(
            has_tools=True,
            use_native_tool_calling=False,
        )

        assert "Thought:" in generated["user"]

    def test_native_tools_uses_native_task_slice(self) -> None:
        """Native tool calling yields a user prompt without ReAct or pressure text."""
        generated = self._task_execution_result(
            has_tools=True,
            use_native_tool_calling=True,
        )
        user_text = generated["user"]

        # Native tool calling does not use the "Thought:" prefix.
        assert "Thought:" not in user_text

        # The emotional-pressure sentence must be gone as well.
        assert "your job depends on it" not in user_text
|
||||
|
||||
|
||||
class TestNoThoughtLeakagePatterns:
    """Checks that generated prompts omit phrases that encourage thought leakage."""

    @staticmethod
    def _full_prompt_lowercased(
        *, has_tools: bool, use_native_tool_calling: bool
    ) -> str:
        """Return the lower-cased combined prompt for a mocked ``"Test"`` agent."""
        agent_stub = MagicMock()
        agent_stub.role = "Test"
        agent_stub.goal = "Test"
        agent_stub.backstory = "Test"

        generated = Prompts(
            has_tools=has_tools,
            use_native_tool_calling=use_native_tool_calling,
            use_system_prompt=True,
            agent=agent_stub,
        ).task_execution()
        return generated["prompt"].lower()

    def test_no_job_depends_on_it_in_no_tools(self) -> None:
        """The tool-less prompt contains neither pressure phrase."""
        prompt_text = self._full_prompt_lowercased(
            has_tools=False,
            use_native_tool_calling=False,
        )

        assert "your job depends on it" not in prompt_text
        assert "i must use these formats" not in prompt_text

    def test_no_job_depends_on_it_in_native_task(self) -> None:
        """The native-tool prompt does not contain 'your job depends on it'."""
        prompt_text = self._full_prompt_lowercased(
            has_tools=True,
            use_native_tool_calling=True,
        )

        assert "your job depends on it" not in prompt_text
|
||||
|
||||
|
||||
class TestRealLLMNoThoughtLeakage:
    """Integration tests with real LLM calls to verify no thought leakage.

    Each test is marked with ``pytest.mark.vcr`` and runs a one-agent,
    one-task crew with an empty tool list, then inspects the raw output
    for ReAct artifacts ("Thought:", "Final Answer:").
    """

    @pytest.mark.vcr()
    def test_agent_without_tools_no_thought_in_output(self) -> None:
        """An agent without tools returns the answer without a 'Thought:' prefix."""
        agent = Agent(
            role="Language Detector",
            goal="Detect the language of text",
            backstory="You are an expert linguist who can identify languages.",
            tools=[],  # No tools
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
        )

        task = Task(
            description="What language is this text written in: 'Hello, how are you?'",
            expected_output="The detected language (e.g., English, Spanish, etc.)",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        assert result.raw is not None

        # The output should NOT start with "Thought:" or contain ReAct artifacts.
        output = str(result.raw)
        assert not output.strip().startswith("Thought:")
        assert "Final Answer:" not in output
        assert "I now can give a great answer" not in output

        # Require the actual language name. Substring checks for "en" or
        # "language" would also accept wrong or evasive replies (e.g. "French",
        # "I cannot tell the language"), making the assertion vacuous.
        assert "english" in output.lower()

    @pytest.mark.vcr()
    def test_simple_task_clean_output(self) -> None:
        """A simple classification task yields output without internal reasoning."""
        agent = Agent(
            role="Classifier",
            goal="Classify text sentiment",
            backstory="You classify text sentiment accurately.",
            tools=[],
            llm=LLM(model="gpt-4o-mini"),
            verbose=False,
        )

        task = Task(
            description="Classify the sentiment of: 'I love this product!'",
            expected_output="One word: positive, negative, or neutral",
            agent=agent,
        )

        crew = Crew(agents=[agent], tasks=[task])
        result = crew.kickoff()

        assert result is not None
        # Guard against str(None) silently becoming the text "None".
        assert result.raw is not None
        output = str(result.raw).strip().lower()

        # Output should be clean - just the classification.
        assert not output.startswith("thought:")
        assert "final answer:" not in output

        # Should contain the actual classification.
        assert any(
            sentiment in output
            for sentiment in ("positive", "negative", "neutral")
        )
|
||||
Reference in New Issue
Block a user