mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-02-24 03:00:28 -05:00
Compare commits
3 Commits
fix/agent-
...
fix/micros
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
11cfd8756c | ||
|
|
e40c8c70ce | ||
|
|
9cdcd6793f |
@@ -192,3 +192,8 @@ POSTHOG_HOST=https://eu.i.posthog.com
|
||||
|
||||
# Other Services
|
||||
AUTOMOD_API_KEY=
|
||||
|
||||
# Agent Generator Service
|
||||
# The Agent Generator microservice handles AI-powered agent creation from natural language
|
||||
AGENTGENERATOR_HOST=localhost
|
||||
AGENTGENERATOR_PORT=8009
|
||||
|
||||
@@ -27,7 +27,6 @@ class ChatConfig(BaseSettings):
|
||||
session_ttl: int = Field(default=43200, description="Session TTL in seconds")
|
||||
|
||||
# Streaming Configuration
|
||||
stream_timeout: int = Field(default=300, description="Stream timeout in seconds")
|
||||
max_retries: int = Field(
|
||||
default=3,
|
||||
description="Max retries for fallback path (SDK handles retries internally)",
|
||||
@@ -39,8 +38,10 @@ class ChatConfig(BaseSettings):
|
||||
|
||||
# Long-running operation configuration
|
||||
long_running_operation_ttl: int = Field(
|
||||
default=600,
|
||||
description="TTL in seconds for long-running operation tracking in Redis (safety net if pod dies)",
|
||||
default=3600,
|
||||
description="TTL in seconds for long-running operation deduplication lock "
|
||||
"(1 hour, matches stream_ttl). Prevents duplicate operations if pod dies. "
|
||||
"For longer operations, the stream_registry heartbeat keeps them alive.",
|
||||
)
|
||||
|
||||
# Stream registry configuration for SSE reconnection
|
||||
@@ -48,6 +49,11 @@ class ChatConfig(BaseSettings):
|
||||
default=3600,
|
||||
description="TTL in seconds for stream data in Redis (1 hour)",
|
||||
)
|
||||
stream_lock_ttl: int = Field(
|
||||
default=120,
|
||||
description="TTL in seconds for stream lock (2 minutes). Short timeout allows "
|
||||
"reconnection after refresh/crash without long waits.",
|
||||
)
|
||||
stream_max_length: int = Field(
|
||||
default=10000,
|
||||
description="Maximum number of messages to store per stream",
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any, cast
|
||||
from typing import Any
|
||||
|
||||
from prisma.errors import UniqueViolationError
|
||||
from prisma.models import ChatMessage as PrismaChatMessage
|
||||
from prisma.models import ChatSession as PrismaChatSession
|
||||
from prisma.types import (
|
||||
@@ -92,10 +93,9 @@ async def add_chat_message(
|
||||
function_call: dict[str, Any] | None = None,
|
||||
) -> ChatMessage:
|
||||
"""Add a message to a chat session."""
|
||||
# Build input dict dynamically rather than using ChatMessageCreateInput directly
|
||||
# because Prisma's TypedDict validation rejects optional fields set to None.
|
||||
# We only include fields that have values, then cast at the end.
|
||||
data: dict[str, Any] = {
|
||||
# Build ChatMessageCreateInput with only non-None values
|
||||
# (Prisma TypedDict rejects optional fields set to None)
|
||||
data: ChatMessageCreateInput = {
|
||||
"Session": {"connect": {"id": session_id}},
|
||||
"role": role,
|
||||
"sequence": sequence,
|
||||
@@ -123,7 +123,7 @@ async def add_chat_message(
|
||||
where={"id": session_id},
|
||||
data={"updatedAt": datetime.now(UTC)},
|
||||
),
|
||||
PrismaChatMessage.prisma().create(data=cast(ChatMessageCreateInput, data)),
|
||||
PrismaChatMessage.prisma().create(data=data),
|
||||
)
|
||||
return ChatMessage.from_db(message)
|
||||
|
||||
@@ -132,58 +132,93 @@ async def add_chat_messages_batch(
|
||||
session_id: str,
|
||||
messages: list[dict[str, Any]],
|
||||
start_sequence: int,
|
||||
) -> list[ChatMessage]:
|
||||
) -> int:
|
||||
"""Add multiple messages to a chat session in a batch.
|
||||
|
||||
Uses a transaction for atomicity - if any message creation fails,
|
||||
the entire batch is rolled back.
|
||||
Uses collision detection with retry: tries to create messages starting
|
||||
at start_sequence. If a unique constraint violation occurs (e.g., the
|
||||
streaming loop and long-running callback race), queries the latest
|
||||
sequence and retries with the correct offset. This avoids unnecessary
|
||||
upserts and DB queries in the common case (no collision).
|
||||
|
||||
Returns:
|
||||
Next sequence number for the next message to be inserted. This equals
|
||||
start_sequence + len(messages) and allows callers to update their
|
||||
counters even when collision detection adjusts start_sequence.
|
||||
"""
|
||||
if not messages:
|
||||
return []
|
||||
# No messages to add - return current count
|
||||
return start_sequence
|
||||
|
||||
created_messages = []
|
||||
max_retries = 5
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
# Single timestamp for all messages and session update
|
||||
now = datetime.now(UTC)
|
||||
|
||||
async with db.transaction() as tx:
|
||||
for i, msg in enumerate(messages):
|
||||
# Build input dict dynamically rather than using ChatMessageCreateInput
|
||||
# directly because Prisma's TypedDict validation rejects optional fields
|
||||
# set to None. We only include fields that have values, then cast.
|
||||
data: dict[str, Any] = {
|
||||
"Session": {"connect": {"id": session_id}},
|
||||
"role": msg["role"],
|
||||
"sequence": start_sequence + i,
|
||||
}
|
||||
async with db.transaction() as tx:
|
||||
# Build all message data
|
||||
messages_data = []
|
||||
for i, msg in enumerate(messages):
|
||||
# Build ChatMessageCreateInput with only non-None values
|
||||
# (Prisma TypedDict rejects optional fields set to None)
|
||||
# Note: create_many doesn't support nested creates, use sessionId directly
|
||||
data: ChatMessageCreateInput = {
|
||||
"sessionId": session_id,
|
||||
"role": msg["role"],
|
||||
"sequence": start_sequence + i,
|
||||
"createdAt": now,
|
||||
}
|
||||
|
||||
# Add optional string fields
|
||||
if msg.get("content") is not None:
|
||||
data["content"] = msg["content"]
|
||||
if msg.get("name") is not None:
|
||||
data["name"] = msg["name"]
|
||||
if msg.get("tool_call_id") is not None:
|
||||
data["toolCallId"] = msg["tool_call_id"]
|
||||
if msg.get("refusal") is not None:
|
||||
data["refusal"] = msg["refusal"]
|
||||
# Add optional string fields
|
||||
if msg.get("content") is not None:
|
||||
data["content"] = msg["content"]
|
||||
if msg.get("name") is not None:
|
||||
data["name"] = msg["name"]
|
||||
if msg.get("tool_call_id") is not None:
|
||||
data["toolCallId"] = msg["tool_call_id"]
|
||||
if msg.get("refusal") is not None:
|
||||
data["refusal"] = msg["refusal"]
|
||||
|
||||
# Add optional JSON fields only when they have values
|
||||
if msg.get("tool_calls") is not None:
|
||||
data["toolCalls"] = SafeJson(msg["tool_calls"])
|
||||
if msg.get("function_call") is not None:
|
||||
data["functionCall"] = SafeJson(msg["function_call"])
|
||||
# Add optional JSON fields only when they have values
|
||||
if msg.get("tool_calls") is not None:
|
||||
data["toolCalls"] = SafeJson(msg["tool_calls"])
|
||||
if msg.get("function_call") is not None:
|
||||
data["functionCall"] = SafeJson(msg["function_call"])
|
||||
|
||||
created = await PrismaChatMessage.prisma(tx).create(
|
||||
data=cast(ChatMessageCreateInput, data)
|
||||
)
|
||||
created_messages.append(created)
|
||||
messages_data.append(data)
|
||||
|
||||
# Update session's updatedAt timestamp within the same transaction.
|
||||
# Note: Token usage (total_prompt_tokens, total_completion_tokens) is updated
|
||||
# separately via update_chat_session() after streaming completes.
|
||||
await PrismaChatSession.prisma(tx).update(
|
||||
where={"id": session_id},
|
||||
data={"updatedAt": datetime.now(UTC)},
|
||||
)
|
||||
# Run create_many and session update in parallel within transaction
|
||||
# Both use the same timestamp for consistency
|
||||
await asyncio.gather(
|
||||
PrismaChatMessage.prisma(tx).create_many(data=messages_data),
|
||||
PrismaChatSession.prisma(tx).update(
|
||||
where={"id": session_id},
|
||||
data={"updatedAt": now},
|
||||
),
|
||||
)
|
||||
|
||||
return [ChatMessage.from_db(m) for m in created_messages]
|
||||
# Return next sequence number for counter sync
|
||||
return start_sequence + len(messages)
|
||||
|
||||
except UniqueViolationError:
|
||||
if attempt < max_retries - 1:
|
||||
# Collision detected - query MAX(sequence)+1 and retry with correct offset
|
||||
logger.info(
|
||||
f"Collision detected for session {session_id} at sequence "
|
||||
f"{start_sequence}, querying DB for latest sequence"
|
||||
)
|
||||
start_sequence = await get_next_sequence(session_id)
|
||||
logger.info(
|
||||
f"Retrying batch insert with start_sequence={start_sequence}"
|
||||
)
|
||||
continue
|
||||
else:
|
||||
# Max retries exceeded - propagate error
|
||||
raise
|
||||
|
||||
# Should never reach here due to raise in exception handler
|
||||
raise RuntimeError(f"Failed to insert messages after {max_retries} attempts")
|
||||
|
||||
|
||||
async def get_user_chat_sessions(
|
||||
@@ -237,10 +272,20 @@ async def delete_chat_session(session_id: str, user_id: str | None = None) -> bo
|
||||
return False
|
||||
|
||||
|
||||
async def get_chat_session_message_count(session_id: str) -> int:
|
||||
"""Get the number of messages in a chat session."""
|
||||
count = await PrismaChatMessage.prisma().count(where={"sessionId": session_id})
|
||||
return count
|
||||
async def get_next_sequence(session_id: str) -> int:
|
||||
"""Get the next sequence number for a new message in this session.
|
||||
|
||||
Uses MAX(sequence) + 1 for robustness. Returns 0 if no messages exist.
|
||||
More robust than COUNT(*) because it's immune to deleted messages.
|
||||
|
||||
Optimized to select only the sequence column using raw SQL.
|
||||
The unique index on (sessionId, sequence) makes this query fast.
|
||||
"""
|
||||
results = await db.query_raw_with_schema(
|
||||
'SELECT "sequence" FROM {schema_prefix}"ChatMessage" WHERE "sessionId" = $1 ORDER BY "sequence" DESC LIMIT 1',
|
||||
session_id,
|
||||
)
|
||||
return 0 if not results else results[0]["sequence"] + 1
|
||||
|
||||
|
||||
async def update_tool_message_content(
|
||||
|
||||
@@ -266,7 +266,11 @@ class CoPilotProcessor:
|
||||
|
||||
except asyncio.CancelledError:
|
||||
log.info("Task cancelled")
|
||||
await stream_registry.mark_task_completed(entry.task_id, status="failed")
|
||||
await stream_registry.mark_task_completed(
|
||||
entry.task_id,
|
||||
status="failed",
|
||||
error_message="Task was cancelled",
|
||||
)
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -434,8 +434,6 @@ async def _get_session_from_db(session_id: str) -> ChatSession | None:
|
||||
|
||||
async def upsert_chat_session(
|
||||
session: ChatSession,
|
||||
*,
|
||||
existing_message_count: int | None = None,
|
||||
) -> ChatSession:
|
||||
"""Update a chat session in both cache and database.
|
||||
|
||||
@@ -443,12 +441,6 @@ async def upsert_chat_session(
|
||||
operations (e.g., background title update and main stream handler)
|
||||
attempt to upsert the same session simultaneously.
|
||||
|
||||
Args:
|
||||
existing_message_count: If provided, skip the DB query to count
|
||||
existing messages. The caller is responsible for tracking this
|
||||
accurately. Useful for incremental saves in a streaming loop
|
||||
where the caller already knows how many messages are persisted.
|
||||
|
||||
Raises:
|
||||
DatabaseError: If the database write fails. The cache is still updated
|
||||
as a best-effort optimization, but the error is propagated to ensure
|
||||
@@ -459,11 +451,8 @@ async def upsert_chat_session(
|
||||
lock = await _get_session_lock(session.session_id)
|
||||
|
||||
async with lock:
|
||||
# Get existing message count from DB for incremental saves
|
||||
if existing_message_count is None:
|
||||
existing_message_count = await chat_db().get_chat_session_message_count(
|
||||
session.session_id
|
||||
)
|
||||
# Always query DB for existing message count to ensure consistency
|
||||
existing_message_count = await chat_db().get_next_sequence(session.session_id)
|
||||
|
||||
db_error: Exception | None = None
|
||||
|
||||
@@ -587,9 +576,7 @@ async def append_and_save_message(session_id: str, message: ChatMessage) -> Chat
|
||||
raise ValueError(f"Session {session_id} not found")
|
||||
|
||||
session.messages.append(message)
|
||||
existing_message_count = await chat_db().get_chat_session_message_count(
|
||||
session_id
|
||||
)
|
||||
existing_message_count = await chat_db().get_next_sequence(session_id)
|
||||
|
||||
try:
|
||||
await _save_session_to_db(session, existing_message_count)
|
||||
|
||||
@@ -331,3 +331,96 @@ def test_to_openai_messages_merges_split_assistants():
|
||||
tc_list = merged.get("tool_calls")
|
||||
assert tc_list is not None and len(list(tc_list)) == 1
|
||||
assert list(tc_list)[0]["id"] == "tc1"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Concurrent save collision detection #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
@pytest.mark.asyncio(loop_scope="session")
|
||||
async def test_concurrent_saves_collision_detection(setup_test_user, test_user_id):
|
||||
"""Test that concurrent saves from streaming loop and callback handle collisions correctly.
|
||||
|
||||
Simulates the race condition where:
|
||||
1. Streaming loop starts with saved_msg_count=5
|
||||
2. Long-running callback appends message #5 and saves
|
||||
3. Streaming loop tries to save with stale count=5
|
||||
|
||||
The collision detection should handle this gracefully.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
# Create a session with initial messages
|
||||
session = ChatSession.new(user_id=test_user_id)
|
||||
for i in range(3):
|
||||
session.messages.append(
|
||||
ChatMessage(
|
||||
role="user" if i % 2 == 0 else "assistant", content=f"Message {i}"
|
||||
)
|
||||
)
|
||||
|
||||
# Save initial messages
|
||||
session = await upsert_chat_session(session)
|
||||
|
||||
# Simulate streaming loop and callback saving concurrently
|
||||
async def streaming_loop_save():
|
||||
"""Simulates streaming loop saving messages."""
|
||||
# Add 2 messages
|
||||
session.messages.append(ChatMessage(role="user", content="Streaming message 1"))
|
||||
session.messages.append(
|
||||
ChatMessage(role="assistant", content="Streaming message 2")
|
||||
)
|
||||
|
||||
# Wait a bit to let callback potentially save first
|
||||
await asyncio.sleep(0.01)
|
||||
|
||||
# Save (will query DB for existing count)
|
||||
return await upsert_chat_session(session)
|
||||
|
||||
async def callback_save():
|
||||
"""Simulates long-running callback saving a message."""
|
||||
# Add 1 message
|
||||
session.messages.append(
|
||||
ChatMessage(role="tool", content="Callback result", tool_call_id="tc1")
|
||||
)
|
||||
|
||||
# Save immediately (will query DB for existing count)
|
||||
return await upsert_chat_session(session)
|
||||
|
||||
# Run both saves concurrently - one will hit collision detection
|
||||
results = await asyncio.gather(streaming_loop_save(), callback_save())
|
||||
|
||||
# Both should succeed
|
||||
assert all(r is not None for r in results)
|
||||
|
||||
# Reload session from DB to verify
|
||||
from backend.data.redis_client import get_redis_async
|
||||
|
||||
redis_key = f"chat:session:{session.session_id}"
|
||||
async_redis = await get_redis_async()
|
||||
await async_redis.delete(redis_key) # Clear cache to force DB load
|
||||
|
||||
loaded_session = await get_chat_session(session.session_id, test_user_id)
|
||||
assert loaded_session is not None
|
||||
|
||||
# Should have all 6 messages (3 initial + 2 streaming + 1 callback)
|
||||
assert len(loaded_session.messages) == 6
|
||||
|
||||
# Verify no duplicate sequences
|
||||
sequences = []
|
||||
for i, msg in enumerate(loaded_session.messages):
|
||||
# Messages should have sequential sequence numbers starting from 0
|
||||
sequences.append(i)
|
||||
|
||||
# All sequences should be unique and sequential
|
||||
assert sequences == list(range(6))
|
||||
|
||||
# Verify message content is preserved
|
||||
contents = [m.content for m in loaded_session.messages]
|
||||
assert "Message 0" in contents
|
||||
assert "Message 1" in contents
|
||||
assert "Message 2" in contents
|
||||
assert "Streaming message 1" in contents
|
||||
assert "Streaming message 2" in contents
|
||||
assert "Callback result" in contents
|
||||
|
||||
@@ -7,8 +7,10 @@ import os
|
||||
import uuid
|
||||
from collections.abc import AsyncGenerator
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
from typing import Any, cast
|
||||
|
||||
from backend.data.redis_client import get_redis_async
|
||||
from backend.executor.cluster_lock import AsyncClusterLock
|
||||
from backend.util.exceptions import NotFoundError
|
||||
|
||||
from .. import stream_registry
|
||||
@@ -61,6 +63,7 @@ from .transcript import (
|
||||
logger = logging.getLogger(__name__)
|
||||
config = ChatConfig()
|
||||
|
||||
|
||||
# Set to hold background tasks to prevent garbage collection
|
||||
_background_tasks: set[asyncio.Task[Any]] = set()
|
||||
|
||||
@@ -132,8 +135,12 @@ is delivered to the user via a background stream.
|
||||
All tasks must run in the foreground.
|
||||
"""
|
||||
|
||||
STREAM_LOCK_PREFIX = "copilot:stream:lock:"
|
||||
|
||||
def _build_long_running_callback(user_id: str | None) -> LongRunningCallback:
|
||||
|
||||
def _build_long_running_callback(
|
||||
user_id: str | None,
|
||||
) -> LongRunningCallback:
|
||||
"""Build a callback that delegates long-running tools to the non-SDK infrastructure.
|
||||
|
||||
Long-running tools (create_agent, edit_agent, etc.) are delegated to the
|
||||
@@ -142,6 +149,9 @@ def _build_long_running_callback(user_id: str | None) -> LongRunningCallback:
|
||||
page refreshes / pod restarts, and the frontend shows the proper loading
|
||||
widget with progress updates.
|
||||
|
||||
Args:
|
||||
user_id: User ID for the session
|
||||
|
||||
The returned callback matches the ``LongRunningCallback`` signature:
|
||||
``(tool_name, args, session) -> MCP response dict``.
|
||||
"""
|
||||
@@ -207,7 +217,8 @@ def _build_long_running_callback(user_id: str | None) -> LongRunningCallback:
|
||||
tool_call_id=tool_call_id,
|
||||
)
|
||||
session.messages.append(pending_message)
|
||||
await upsert_chat_session(session)
|
||||
# Collision detection happens in add_chat_messages_batch (db.py)
|
||||
session = await upsert_chat_session(session)
|
||||
|
||||
# --- Spawn background task (reuses non-SDK infrastructure) ---
|
||||
bg_task = asyncio.create_task(
|
||||
@@ -527,6 +538,9 @@ async def stream_chat_completion_sdk(
|
||||
f"Session {session_id} not found. Please create a new session first."
|
||||
)
|
||||
|
||||
# Type narrowing: session is guaranteed ChatSession after the check above
|
||||
session = cast(ChatSession, session)
|
||||
|
||||
# Append the new message to the session if it's not already there
|
||||
new_message_role = "user" if is_user_message else "assistant"
|
||||
if message and (
|
||||
@@ -564,6 +578,29 @@ async def stream_chat_completion_sdk(
|
||||
system_prompt += _SDK_TOOL_SUPPLEMENT
|
||||
message_id = str(uuid.uuid4())
|
||||
task_id = str(uuid.uuid4())
|
||||
stream_id = task_id # Use task_id as unique stream identifier
|
||||
|
||||
# Acquire stream lock to prevent concurrent streams to the same session
|
||||
lock = AsyncClusterLock(
|
||||
redis=await get_redis_async(),
|
||||
key=f"{STREAM_LOCK_PREFIX}{session_id}",
|
||||
owner_id=stream_id,
|
||||
timeout=config.stream_lock_ttl,
|
||||
)
|
||||
|
||||
lock_owner = await lock.try_acquire()
|
||||
if lock_owner != stream_id:
|
||||
# Another stream is active
|
||||
logger.warning(
|
||||
f"[SDK] Session {session_id} already has an active stream: {lock_owner}"
|
||||
)
|
||||
yield StreamError(
|
||||
errorText="Another stream is already active for this session. "
|
||||
"Please wait or stop it.",
|
||||
code="stream_already_active",
|
||||
)
|
||||
yield StreamFinish()
|
||||
return
|
||||
|
||||
yield StreamStart(messageId=message_id, taskId=task_id)
|
||||
|
||||
@@ -715,9 +752,6 @@ async def stream_chat_completion_sdk(
|
||||
accumulated_tool_calls: list[dict[str, Any]] = []
|
||||
has_appended_assistant = False
|
||||
has_tool_results = False
|
||||
# Track persisted message count to skip DB count queries
|
||||
# on incremental saves. Initial save happened at line 545.
|
||||
saved_msg_count = len(session.messages)
|
||||
|
||||
# Use an explicit async iterator with non-cancelling heartbeats.
|
||||
# CRITICAL: we must NOT cancel __anext__() mid-flight — doing so
|
||||
@@ -744,6 +778,8 @@ async def stream_chat_completion_sdk(
|
||||
|
||||
if not done:
|
||||
# Timeout — emit heartbeat but keep the task alive
|
||||
# Also refresh lock TTL to keep it alive
|
||||
await lock.refresh()
|
||||
yield StreamHeartbeat()
|
||||
continue
|
||||
|
||||
@@ -893,13 +929,10 @@ async def stream_chat_completion_sdk(
|
||||
has_appended_assistant = True
|
||||
# Save before tool execution starts so the
|
||||
# pending tool call is visible on refresh /
|
||||
# other devices.
|
||||
# other devices. Collision detection happens
|
||||
# in add_chat_messages_batch (db.py).
|
||||
try:
|
||||
await upsert_chat_session(
|
||||
session,
|
||||
existing_message_count=saved_msg_count,
|
||||
)
|
||||
saved_msg_count = len(session.messages)
|
||||
session = await upsert_chat_session(session)
|
||||
except Exception as save_err:
|
||||
logger.warning(
|
||||
"[SDK] [%s] Incremental save " "failed: %s",
|
||||
@@ -922,12 +955,9 @@ async def stream_chat_completion_sdk(
|
||||
has_tool_results = True
|
||||
# Save after tool completes so the result is
|
||||
# visible on refresh / other devices.
|
||||
# Collision detection happens in add_chat_messages_batch (db.py).
|
||||
try:
|
||||
await upsert_chat_session(
|
||||
session,
|
||||
existing_message_count=saved_msg_count,
|
||||
)
|
||||
saved_msg_count = len(session.messages)
|
||||
session = await upsert_chat_session(session)
|
||||
except Exception as save_err:
|
||||
logger.warning(
|
||||
"[SDK] [%s] Incremental save " "failed: %s",
|
||||
@@ -1059,7 +1089,7 @@ async def stream_chat_completion_sdk(
|
||||
"to use the OpenAI-compatible fallback."
|
||||
)
|
||||
|
||||
await asyncio.shield(upsert_chat_session(session))
|
||||
session = cast(ChatSession, await asyncio.shield(upsert_chat_session(session)))
|
||||
logger.info(
|
||||
"[SDK] [%s] Session saved with %d messages",
|
||||
session_id[:12],
|
||||
@@ -1076,10 +1106,11 @@ async def stream_chat_completion_sdk(
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"[SDK] Error: {e}", exc_info=True)
|
||||
try:
|
||||
await asyncio.shield(upsert_chat_session(session))
|
||||
except Exception as save_err:
|
||||
logger.error(f"[SDK] Failed to save session on error: {save_err}")
|
||||
if session:
|
||||
try:
|
||||
await asyncio.shield(upsert_chat_session(session))
|
||||
except Exception as save_err:
|
||||
logger.error(f"[SDK] Failed to save session on error: {save_err}")
|
||||
yield StreamError(
|
||||
errorText="An error occurred. Please try again.",
|
||||
code="sdk_error",
|
||||
@@ -1101,7 +1132,7 @@ async def stream_chat_completion_sdk(
|
||||
if not raw_transcript and use_resume and resume_file:
|
||||
raw_transcript = read_transcript_file(resume_file)
|
||||
|
||||
if raw_transcript:
|
||||
if raw_transcript and session is not None:
|
||||
await asyncio.shield(
|
||||
_try_upload_transcript(
|
||||
user_id,
|
||||
@@ -1121,6 +1152,9 @@ async def stream_chat_completion_sdk(
|
||||
if sdk_cwd:
|
||||
_cleanup_sdk_tool_results(sdk_cwd)
|
||||
|
||||
# Release stream lock to allow new streams for this session
|
||||
await lock.release()
|
||||
|
||||
|
||||
async def _try_upload_transcript(
|
||||
user_id: str,
|
||||
|
||||
@@ -352,7 +352,8 @@ async def assign_user_to_session(
|
||||
if not session:
|
||||
raise NotFoundError(f"Session {session_id} not found")
|
||||
session.user_id = user_id
|
||||
return await upsert_chat_session(session)
|
||||
session = await upsert_chat_session(session)
|
||||
return session
|
||||
|
||||
|
||||
async def stream_chat_completion(
|
||||
@@ -1563,7 +1564,11 @@ async def _yield_tool_call(
|
||||
await _mark_operation_completed(tool_call_id)
|
||||
# Mark stream registry task as failed if it was created
|
||||
try:
|
||||
await stream_registry.mark_task_completed(task_id, status="failed")
|
||||
await stream_registry.mark_task_completed(
|
||||
task_id,
|
||||
status="failed",
|
||||
error_message=f"Failed to setup tool {tool_name}: {e}",
|
||||
)
|
||||
except Exception as mark_err:
|
||||
logger.warning(f"Failed to mark task {task_id} as failed: {mark_err}")
|
||||
logger.error(
|
||||
@@ -1731,7 +1736,11 @@ async def _execute_long_running_tool_with_streaming(
|
||||
session = await get_chat_session(session_id, user_id)
|
||||
if not session:
|
||||
logger.error(f"Session {session_id} not found for background tool")
|
||||
await stream_registry.mark_task_completed(task_id, status="failed")
|
||||
await stream_registry.mark_task_completed(
|
||||
task_id,
|
||||
status="failed",
|
||||
error_message=f"Session {session_id} not found",
|
||||
)
|
||||
return
|
||||
|
||||
# Pass operation_id and task_id to the tool for async processing
|
||||
|
||||
@@ -644,6 +644,8 @@ async def _stream_listener(
|
||||
async def mark_task_completed(
|
||||
task_id: str,
|
||||
status: Literal["completed", "failed"] = "completed",
|
||||
*,
|
||||
error_message: str | None = None,
|
||||
) -> bool:
|
||||
"""Mark a task as completed and publish finish event.
|
||||
|
||||
@@ -654,6 +656,10 @@ async def mark_task_completed(
|
||||
Args:
|
||||
task_id: Task ID to mark as completed
|
||||
status: Final status ("completed" or "failed")
|
||||
error_message: If provided and status="failed", publish a StreamError
|
||||
before StreamFinish so connected clients see why the task ended.
|
||||
If not provided, no StreamError is published (caller should publish
|
||||
manually if needed to avoid duplicates).
|
||||
|
||||
Returns:
|
||||
True if task was newly marked completed, False if already completed/failed
|
||||
@@ -669,6 +675,17 @@ async def mark_task_completed(
|
||||
logger.debug(f"Task {task_id} already completed/failed, skipping")
|
||||
return False
|
||||
|
||||
# Publish error event before finish so connected clients know WHY the
|
||||
# task ended. Only publish if caller provided an explicit error message
|
||||
# to avoid duplicates with code paths that manually publish StreamError.
|
||||
# This is best-effort — if it fails, the StreamFinish still ensures
|
||||
# listeners clean up.
|
||||
if status == "failed" and error_message:
|
||||
try:
|
||||
await publish_chunk(task_id, StreamError(errorText=error_message))
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to publish error event for task {task_id}: {e}")
|
||||
|
||||
# THEN publish finish event (best-effort - listeners can detect via status polling)
|
||||
try:
|
||||
await publish_chunk(task_id, StreamFinish())
|
||||
@@ -821,27 +838,6 @@ async def get_active_task_for_session(
|
||||
if task_user_id and user_id != task_user_id:
|
||||
continue
|
||||
|
||||
# Auto-expire stale tasks that exceeded stream_timeout
|
||||
created_at_str = meta.get("created_at", "")
|
||||
if created_at_str:
|
||||
try:
|
||||
created_at = datetime.fromisoformat(created_at_str)
|
||||
age_seconds = (
|
||||
datetime.now(timezone.utc) - created_at
|
||||
).total_seconds()
|
||||
if age_seconds > config.stream_timeout:
|
||||
logger.warning(
|
||||
f"[TASK_LOOKUP] Auto-expiring stale task {task_id[:8]}... "
|
||||
f"(age={age_seconds:.0f}s > timeout={config.stream_timeout}s)"
|
||||
)
|
||||
await mark_task_completed(task_id, "failed")
|
||||
continue
|
||||
except (ValueError, TypeError) as exc:
|
||||
logger.warning(
|
||||
f"[TASK_LOOKUP] Failed to parse created_at "
|
||||
f"for task {task_id[:8]}...: {exc}"
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"[TASK_LOOKUP] Found running task {task_id[:8]}... for session {session_id[:8]}..."
|
||||
)
|
||||
|
||||
@@ -303,7 +303,7 @@ class DatabaseManager(AppService):
|
||||
get_user_chat_sessions = _(chat_db.get_user_chat_sessions)
|
||||
get_user_session_count = _(chat_db.get_user_session_count)
|
||||
delete_chat_session = _(chat_db.delete_chat_session)
|
||||
get_chat_session_message_count = _(chat_db.get_chat_session_message_count)
|
||||
get_next_sequence = _(chat_db.get_next_sequence)
|
||||
update_tool_message_content = _(chat_db.update_tool_message_content)
|
||||
|
||||
|
||||
@@ -473,5 +473,5 @@ class DatabaseManagerAsyncClient(AppServiceClient):
|
||||
get_user_chat_sessions = d.get_user_chat_sessions
|
||||
get_user_session_count = d.get_user_session_count
|
||||
delete_chat_session = d.delete_chat_session
|
||||
get_chat_session_message_count = d.get_chat_session_message_count
|
||||
get_next_sequence = d.get_next_sequence
|
||||
update_tool_message_content = d.update_tool_message_content
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Redis-based distributed locking for cluster coordination."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
@@ -7,6 +8,7 @@ from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from redis import Redis
|
||||
from redis.asyncio import Redis as AsyncRedis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -126,3 +128,124 @@ class ClusterLock:
|
||||
|
||||
with self._refresh_lock:
|
||||
self._last_refresh = 0.0
|
||||
|
||||
|
||||
class AsyncClusterLock:
|
||||
"""Async Redis-based distributed lock for preventing duplicate execution."""
|
||||
|
||||
def __init__(
|
||||
self, redis: "AsyncRedis", key: str, owner_id: str, timeout: int = 300
|
||||
):
|
||||
self.redis = redis
|
||||
self.key = key
|
||||
self.owner_id = owner_id
|
||||
self.timeout = timeout
|
||||
self._last_refresh = 0.0
|
||||
self._refresh_lock = asyncio.Lock()
|
||||
|
||||
async def try_acquire(self) -> str | None:
|
||||
"""Try to acquire the lock.
|
||||
|
||||
Returns:
|
||||
- owner_id (self.owner_id) if successfully acquired
|
||||
- different owner_id if someone else holds the lock
|
||||
- None if Redis is unavailable or other error
|
||||
"""
|
||||
try:
|
||||
success = await self.redis.set(
|
||||
self.key, self.owner_id, nx=True, ex=self.timeout
|
||||
)
|
||||
if success:
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = time.time()
|
||||
return self.owner_id # Successfully acquired
|
||||
|
||||
# Failed to acquire, get current owner
|
||||
current_value = await self.redis.get(self.key)
|
||||
if current_value:
|
||||
current_owner = (
|
||||
current_value.decode("utf-8")
|
||||
if isinstance(current_value, bytes)
|
||||
else str(current_value)
|
||||
)
|
||||
return current_owner
|
||||
|
||||
# Key doesn't exist but we failed to set it - race condition or Redis issue
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AsyncClusterLock.try_acquire failed for key {self.key}: {e}")
|
||||
return None
|
||||
|
||||
async def refresh(self) -> bool:
|
||||
"""Refresh lock TTL if we still own it.
|
||||
|
||||
Rate limited to at most once every timeout/10 seconds (minimum 1 second).
|
||||
During rate limiting, still verifies lock existence but skips TTL extension.
|
||||
Setting _last_refresh to 0 bypasses rate limiting for testing.
|
||||
|
||||
Async-safe: uses asyncio.Lock to protect _last_refresh access.
|
||||
"""
|
||||
# Calculate refresh interval: max(timeout // 10, 1)
|
||||
refresh_interval = max(self.timeout // 10, 1)
|
||||
current_time = time.time()
|
||||
|
||||
# Check if we're within the rate limit period (async-safe read)
|
||||
# _last_refresh == 0 forces a refresh (bypasses rate limiting for testing)
|
||||
async with self._refresh_lock:
|
||||
last_refresh = self._last_refresh
|
||||
is_rate_limited = (
|
||||
last_refresh > 0 and (current_time - last_refresh) < refresh_interval
|
||||
)
|
||||
|
||||
try:
|
||||
# Always verify lock existence, even during rate limiting
|
||||
current_value = await self.redis.get(self.key)
|
||||
if not current_value:
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = 0
|
||||
return False
|
||||
|
||||
stored_owner = (
|
||||
current_value.decode("utf-8")
|
||||
if isinstance(current_value, bytes)
|
||||
else str(current_value)
|
||||
)
|
||||
if stored_owner != self.owner_id:
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = 0
|
||||
return False
|
||||
|
||||
# If rate limited, return True but don't update TTL or timestamp
|
||||
if is_rate_limited:
|
||||
return True
|
||||
|
||||
# Perform actual refresh
|
||||
if await self.redis.expire(self.key, self.timeout):
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = current_time
|
||||
return True
|
||||
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = 0
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AsyncClusterLock.refresh failed for key {self.key}: {e}")
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = 0
|
||||
return False
|
||||
|
||||
async def release(self):
|
||||
"""Release the lock."""
|
||||
async with self._refresh_lock:
|
||||
if self._last_refresh == 0:
|
||||
return
|
||||
|
||||
try:
|
||||
await self.redis.delete(self.key)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
async with self._refresh_lock:
|
||||
self._last_refresh = 0.0
|
||||
|
||||
@@ -364,11 +364,11 @@ class Config(UpdateTrackingModel["Config"], BaseSettings):
|
||||
)
|
||||
|
||||
agentgenerator_host: str = Field(
|
||||
default="",
|
||||
description="The host for the Agent Generator service (empty to use built-in)",
|
||||
default="localhost",
|
||||
description="The host for the Agent Generator service",
|
||||
)
|
||||
agentgenerator_port: int = Field(
|
||||
default=8000,
|
||||
default=8009,
|
||||
description="The port for the Agent Generator service",
|
||||
)
|
||||
agentgenerator_timeout: int = Field(
|
||||
|
||||
Reference in New Issue
Block a user