Compare commits

...

5 Commits

Author SHA1 Message Date
Ubbe
efe2f18719 Merge branch 'dev' into fix/spinner-height-copilot 2026-02-18 20:44:42 +08:00
Lluis Agusti
23eda10cb1 chore: format 2026-02-18 20:34:48 +08:00
Lluis Agusti
7bfc5140b0 fix: make spinner centred when loading 2026-02-18 20:29:27 +08:00
Otto
15bcdae4e8 fix(backend/copilot): Clean up GCSWorkspaceStorage per worker (#12153)
The copilot executor runs each worker in its own thread with a dedicated
event loop (`asyncio.new_event_loop()`). `aiohttp.ClientSession` is
bound to the event loop where it was created — using it from a different
loop causes `asyncio.timeout()` to fail with:

```
RuntimeError: Timeout context manager should be used inside a task
```

This was the root cause of transcript upload failures tracked in
SECRT-2009 and [Sentry
#7272473694](https://significant-gravitas.sentry.io/issues/7272473694/).

### Fix

**One `GCSWorkspaceStorage` instance per event loop** instead of a
single shared global.

- `get_workspace_storage()` now returns a per-loop GCS instance (keyed
by `id(asyncio.get_running_loop())`). Local storage remains shared since
it has no async I/O.
- `shutdown_workspace_storage()` closes the instance for the **current**
loop only, so `session.close()` always runs on the loop that created the
session.
- `CoPilotProcessor.cleanup()` shuts down workspace storage on the
worker's own loop, then stops the loop.
- Manager cleanup submits `cleanup_worker` to each thread pool worker
before shutting down the executor — replacing the old approach of
creating a temporary event loop that couldn't close cross-loop sessions.

### Changes

| File | Change |
|------|--------|
| `util/workspace_storage.py` | `GCSWorkspaceStorage` back to simple
single-session class; `get_workspace_storage()` returns per-loop GCS
instance; `shutdown_workspace_storage()` scoped to current loop |
| `copilot/executor/processor.py` | Added `CoPilotProcessor.cleanup()`
and `cleanup_worker()` |
| `copilot/executor/manager.py` | Calls `cleanup_worker` on each thread
pool worker during shutdown |

Fixes SECRT-2009

---------

Co-authored-by: Reinier van der Leer <pwuts@agpt.co>
2026-02-18 11:17:39 +00:00
Otto
e9ba7e51db fix(copilot): Route workspace through db_accessors, fix transcript upload (#12148)
## Summary

Fixes two bugs in the copilot executor:

### SECRT-2008: WorkspaceManager bypasses db_accessors
`backend/util/workspace.py` imported 6 workspace functions directly from
`backend/data/workspace.py`, which call `prisma()` directly. In the
copilot executor (no Prisma connection), these fail.

**Fix:** Replace direct imports with `workspace_db()` from
`db_accessors`, routing through the database_manager HTTP client when
Prisma is unavailable. Also:
- Register all 6 workspace functions in `DatabaseManager` and
`DatabaseManagerAsyncClient`
- Add `UniqueViolationError` to the service `EXCEPTION_MAPPING` so it's
properly re-raised over HTTP (needed for race-condition retry logic)

### SECRT-2009: Transcript upload asyncio.timeout error
`asyncio.create_task()` at line 696 of `service.py` creates an orphaned
background task in the executor's thread event loop.
`gcloud-aio-storage`'s `asyncio.timeout()` fails in this context.

**Fix:** Replace `create_task` with direct `await`. The upload runs
after streaming completes (all chunks already yielded), so no
user-facing latency impact. The function already has internal try/except
error handling.
2026-02-17 22:24:19 +00:00
9 changed files with 183 additions and 90 deletions

View File

@@ -164,21 +164,23 @@ class CoPilotExecutor(AppProcess):
self._cancel_thread, self.cancel_client, "[cleanup][cancel]"
)
# Shutdown executor
# Clean up worker threads (closes per-loop workspace storage sessions)
if self._executor:
from .processor import cleanup_worker
logger.info(f"[cleanup {pid}] Cleaning up workers...")
futures = []
for _ in range(self._executor._max_workers):
futures.append(self._executor.submit(cleanup_worker))
for f in futures:
try:
f.result(timeout=10)
except Exception as e:
logger.warning(f"[cleanup {pid}] Worker cleanup error: {e}")
logger.info(f"[cleanup {pid}] Shutting down executor...")
self._executor.shutdown(wait=False)
# Close async resources (workspace storage aiohttp session, etc.)
try:
from backend.util.workspace_storage import shutdown_workspace_storage
loop = asyncio.new_event_loop()
loop.run_until_complete(shutdown_workspace_storage())
loop.close()
except Exception as e:
logger.warning(f"[cleanup {pid}] Error closing workspace storage: {e}")
# Release any remaining locks
for task_id, lock in list(self._task_locks.items()):
try:

View File

@@ -60,6 +60,18 @@ def init_worker():
_tls.processor.on_executor_start()
def cleanup_worker():
"""Clean up the processor for the current worker thread.
Should be called before the worker thread's event loop is destroyed so
that event-loop-bound resources (e.g. ``aiohttp.ClientSession``) are
closed on the correct loop.
"""
processor: CoPilotProcessor | None = getattr(_tls, "processor", None)
if processor is not None:
processor.cleanup()
# ============ Processor Class ============ #
@@ -98,6 +110,28 @@ class CoPilotProcessor:
logger.info(f"[CoPilotExecutor] Worker {self.tid} started")
def cleanup(self):
"""Clean up event-loop-bound resources before the loop is destroyed.
Shuts down the workspace storage instance that belongs to this
worker's event loop, ensuring ``aiohttp.ClientSession.close()``
runs on the same loop that created the session.
"""
from backend.util.workspace_storage import shutdown_workspace_storage
try:
future = asyncio.run_coroutine_threadsafe(
shutdown_workspace_storage(), self.execution_loop
)
future.result(timeout=5)
except Exception as e:
logger.warning(f"[CoPilotExecutor] Worker {self.tid} cleanup error: {e}")
# Stop the event loop
self.execution_loop.call_soon_threadsafe(self.execution_loop.stop)
self.execution_thread.join(timeout=5)
logger.info(f"[CoPilotExecutor] Worker {self.tid} cleaned up")
@error_logged(swallow=False)
def execute(
self,

View File

@@ -693,11 +693,15 @@ async def stream_chat_completion_sdk(
await asyncio.sleep(0.5)
raw_transcript = read_transcript_file(captured_transcript.path)
if raw_transcript:
task = asyncio.create_task(
_upload_transcript_bg(user_id, session_id, raw_transcript)
)
_background_tasks.add(task)
task.add_done_callback(_background_tasks.discard)
try:
async with asyncio.timeout(30):
await _upload_transcript_bg(
user_id, session_id, raw_transcript
)
except asyncio.TimeoutError:
logger.warning(
f"[SDK] Transcript upload timed out for {session_id}"
)
else:
logger.debug("[SDK] Stop hook fired but transcript not usable")

View File

@@ -93,7 +93,15 @@ from backend.data.user import (
get_user_notification_preference,
update_user_integrations,
)
from backend.data.workspace import get_or_create_workspace
from backend.data.workspace import (
count_workspace_files,
create_workspace_file,
get_or_create_workspace,
get_workspace_file,
get_workspace_file_by_path,
list_workspace_files,
soft_delete_workspace_file,
)
from backend.util.service import (
AppService,
AppServiceClient,
@@ -274,7 +282,13 @@ class DatabaseManager(AppService):
get_user_execution_summary_data = _(get_user_execution_summary_data)
# ============ Workspace ============ #
count_workspace_files = _(count_workspace_files)
create_workspace_file = _(create_workspace_file)
get_or_create_workspace = _(get_or_create_workspace)
get_workspace_file = _(get_workspace_file)
get_workspace_file_by_path = _(get_workspace_file_by_path)
list_workspace_files = _(list_workspace_files)
soft_delete_workspace_file = _(soft_delete_workspace_file)
# ============ Understanding ============ #
get_business_understanding = _(get_business_understanding)
@@ -438,7 +452,13 @@ class DatabaseManagerAsyncClient(AppServiceClient):
get_user_execution_summary_data = d.get_user_execution_summary_data
# ============ Workspace ============ #
count_workspace_files = d.count_workspace_files
create_workspace_file = d.create_workspace_file
get_or_create_workspace = d.get_or_create_workspace
get_workspace_file = d.get_workspace_file
get_workspace_file_by_path = d.get_workspace_file_by_path
list_workspace_files = d.list_workspace_files
soft_delete_workspace_file = d.soft_delete_workspace_file
# ============ Understanding ============ #
get_business_understanding = d.get_business_understanding

View File

@@ -164,21 +164,23 @@ async def create_workspace_file(
async def get_workspace_file(
file_id: str,
workspace_id: Optional[str] = None,
workspace_id: str,
) -> Optional[WorkspaceFile]:
"""
Get a workspace file by ID.
Args:
file_id: The file ID
workspace_id: Optional workspace ID for validation
workspace_id: Workspace ID for scoping (required)
Returns:
WorkspaceFile instance or None
"""
where_clause: dict = {"id": file_id, "isDeleted": False}
if workspace_id:
where_clause["workspaceId"] = workspace_id
where_clause: UserWorkspaceFileWhereInput = {
"id": file_id,
"isDeleted": False,
"workspaceId": workspace_id,
}
file = await UserWorkspaceFile.prisma().find_first(where=where_clause)
return WorkspaceFile.from_db(file) if file else None
@@ -268,7 +270,7 @@ async def count_workspace_files(
Returns:
Number of files
"""
where_clause: dict = {"workspaceId": workspace_id}
where_clause: UserWorkspaceFileWhereInput = {"workspaceId": workspace_id}
if not include_deleted:
where_clause["isDeleted"] = False
@@ -283,7 +285,7 @@ async def count_workspace_files(
async def soft_delete_workspace_file(
file_id: str,
workspace_id: Optional[str] = None,
workspace_id: str,
) -> Optional[WorkspaceFile]:
"""
Soft-delete a workspace file.
@@ -293,7 +295,7 @@ async def soft_delete_workspace_file(
Args:
file_id: The file ID
workspace_id: Optional workspace ID for validation
workspace_id: Workspace ID for scoping (required)
Returns:
Updated WorkspaceFile instance or None if not found

View File

@@ -28,7 +28,7 @@ from typing import (
import httpx
import uvicorn
from fastapi import FastAPI, Request, responses
from prisma.errors import DataError
from prisma.errors import DataError, UniqueViolationError
from pydantic import BaseModel, TypeAdapter, create_model
import backend.util.exceptions as exceptions
@@ -201,6 +201,7 @@ EXCEPTION_MAPPING = {
UnhealthyServiceError,
HTTPClientError,
HTTPServerError,
UniqueViolationError,
*[
ErrorType
for _, ErrorType in inspect.getmembers(exceptions)
@@ -416,6 +417,9 @@ class AppService(BaseAppService, ABC):
self.fastapi_app.add_exception_handler(
DataError, self._handle_internal_http_error(400)
)
self.fastapi_app.add_exception_handler(
UniqueViolationError, self._handle_internal_http_error(400)
)
self.fastapi_app.add_exception_handler(
Exception, self._handle_internal_http_error(500)
)
@@ -478,6 +482,7 @@ def get_service_client(
# Don't retry these specific exceptions that won't be fixed by retrying
ValueError, # Invalid input/parameters
DataError, # Prisma data integrity errors (foreign key, unique constraints)
UniqueViolationError, # Unique constraint violations
KeyError, # Missing required data
TypeError, # Wrong data types
AttributeError, # Missing attributes

View File

@@ -12,15 +12,8 @@ from typing import Optional
from prisma.errors import UniqueViolationError
from backend.data.workspace import (
WorkspaceFile,
count_workspace_files,
create_workspace_file,
get_workspace_file,
get_workspace_file_by_path,
list_workspace_files,
soft_delete_workspace_file,
)
from backend.data.db_accessors import workspace_db
from backend.data.workspace import WorkspaceFile
from backend.util.settings import Config
from backend.util.virus_scanner import scan_content_safe
from backend.util.workspace_storage import compute_file_checksum, get_workspace_storage
@@ -125,8 +118,9 @@ class WorkspaceManager:
Raises:
FileNotFoundError: If file doesn't exist
"""
db = workspace_db()
resolved_path = self._resolve_path(path)
file = await get_workspace_file_by_path(self.workspace_id, resolved_path)
file = await db.get_workspace_file_by_path(self.workspace_id, resolved_path)
if file is None:
raise FileNotFoundError(f"File not found at path: {resolved_path}")
@@ -146,7 +140,8 @@ class WorkspaceManager:
Raises:
FileNotFoundError: If file doesn't exist
"""
file = await get_workspace_file(file_id, self.workspace_id)
db = workspace_db()
file = await db.get_workspace_file(file_id, self.workspace_id)
if file is None:
raise FileNotFoundError(f"File not found: {file_id}")
@@ -204,8 +199,10 @@ class WorkspaceManager:
# For overwrite=True, we let the write proceed and handle via UniqueViolationError
# This ensures the new file is written to storage BEFORE the old one is deleted,
# preventing data loss if the new write fails
db = workspace_db()
if not overwrite:
existing = await get_workspace_file_by_path(self.workspace_id, path)
existing = await db.get_workspace_file_by_path(self.workspace_id, path)
if existing is not None:
raise ValueError(f"File already exists at path: {path}")
@@ -232,7 +229,7 @@ class WorkspaceManager:
# Create database record - handle race condition where another request
# created a file at the same path between our check and create
try:
file = await create_workspace_file(
file = await db.create_workspace_file(
workspace_id=self.workspace_id,
file_id=file_id,
name=filename,
@@ -246,12 +243,12 @@ class WorkspaceManager:
# Race condition: another request created a file at this path
if overwrite:
# Re-fetch and delete the conflicting file, then retry
existing = await get_workspace_file_by_path(self.workspace_id, path)
existing = await db.get_workspace_file_by_path(self.workspace_id, path)
if existing:
await self.delete_file(existing.id)
# Retry the create - if this also fails, clean up storage file
try:
file = await create_workspace_file(
file = await db.create_workspace_file(
workspace_id=self.workspace_id,
file_id=file_id,
name=filename,
@@ -314,8 +311,9 @@ class WorkspaceManager:
List of WorkspaceFile instances
"""
effective_path = self._get_effective_path(path, include_all_sessions)
db = workspace_db()
return await list_workspace_files(
return await db.list_workspace_files(
workspace_id=self.workspace_id,
path_prefix=effective_path,
limit=limit,
@@ -332,7 +330,8 @@ class WorkspaceManager:
Returns:
True if deleted, False if not found
"""
file = await get_workspace_file(file_id, self.workspace_id)
db = workspace_db()
file = await db.get_workspace_file(file_id, self.workspace_id)
if file is None:
return False
@@ -345,7 +344,7 @@ class WorkspaceManager:
# Continue with database soft-delete even if storage delete fails
# Soft-delete database record
result = await soft_delete_workspace_file(file_id, self.workspace_id)
result = await db.soft_delete_workspace_file(file_id, self.workspace_id)
return result is not None
async def get_download_url(self, file_id: str, expires_in: int = 3600) -> str:
@@ -362,7 +361,8 @@ class WorkspaceManager:
Raises:
FileNotFoundError: If file doesn't exist
"""
file = await get_workspace_file(file_id, self.workspace_id)
db = workspace_db()
file = await db.get_workspace_file(file_id, self.workspace_id)
if file is None:
raise FileNotFoundError(f"File not found: {file_id}")
@@ -379,7 +379,8 @@ class WorkspaceManager:
Returns:
WorkspaceFile instance or None
"""
return await get_workspace_file(file_id, self.workspace_id)
db = workspace_db()
return await db.get_workspace_file(file_id, self.workspace_id)
async def get_file_info_by_path(self, path: str) -> Optional[WorkspaceFile]:
"""
@@ -394,8 +395,9 @@ class WorkspaceManager:
Returns:
WorkspaceFile instance or None
"""
db = workspace_db()
resolved_path = self._resolve_path(path)
return await get_workspace_file_by_path(self.workspace_id, resolved_path)
return await db.get_workspace_file_by_path(self.workspace_id, resolved_path)
async def get_file_count(
self,
@@ -417,7 +419,8 @@ class WorkspaceManager:
Number of files
"""
effective_path = self._get_effective_path(path, include_all_sessions)
db = workspace_db()
return await count_workspace_files(
return await db.count_workspace_files(
self.workspace_id, path_prefix=effective_path
)

View File

@@ -93,7 +93,14 @@ class WorkspaceStorageBackend(ABC):
class GCSWorkspaceStorage(WorkspaceStorageBackend):
"""Google Cloud Storage implementation for workspace storage."""
"""Google Cloud Storage implementation for workspace storage.
Each instance owns a single ``aiohttp.ClientSession`` and GCS async
client. Because ``ClientSession`` is bound to the event loop on which it
was created, callers that run on separate loops (e.g. copilot executor
worker threads) **must** obtain their own ``GCSWorkspaceStorage`` instance
via :func:`get_workspace_storage` which is event-loop-aware.
"""
def __init__(self, bucket_name: str):
self.bucket_name = bucket_name
@@ -337,60 +344,73 @@ class LocalWorkspaceStorage(WorkspaceStorageBackend):
raise ValueError(f"Invalid storage path format: {storage_path}")
# Global storage backend instance
_workspace_storage: Optional[WorkspaceStorageBackend] = None
# ---------------------------------------------------------------------------
# Storage instance management
# ---------------------------------------------------------------------------
# ``aiohttp.ClientSession`` is bound to the event loop where it is created.
# The copilot executor runs each worker in its own thread with a dedicated
# event loop, so a single global ``GCSWorkspaceStorage`` instance would break.
#
# For **local storage** a single shared instance is fine (no async I/O).
# For **GCS storage** we keep one instance *per event loop* so every loop
# gets its own ``ClientSession``.
# ---------------------------------------------------------------------------
_local_storage: Optional[LocalWorkspaceStorage] = None
_gcs_storages: dict[int, GCSWorkspaceStorage] = {}
_storage_lock = asyncio.Lock()
async def get_workspace_storage() -> WorkspaceStorageBackend:
"""Return a workspace storage backend for the **current** event loop.
* Local storage → single shared instance (no event-loop affinity).
* GCS storage → one instance per event loop to avoid cross-loop
``aiohttp`` errors.
"""
Get the workspace storage backend instance.
global _local_storage
Uses GCS if media_gcs_bucket_name is configured, otherwise uses local storage.
"""
global _workspace_storage
config = Config()
if _workspace_storage is None:
async with _storage_lock:
if _workspace_storage is None:
config = Config()
# --- Local storage (shared) ---
if not config.media_gcs_bucket_name:
if _local_storage is None:
storage_dir = (
config.workspace_storage_dir if config.workspace_storage_dir else None
)
logger.info(f"Using local workspace storage: {storage_dir or 'default'}")
_local_storage = LocalWorkspaceStorage(storage_dir)
return _local_storage
if config.media_gcs_bucket_name:
logger.info(
f"Using GCS workspace storage: {config.media_gcs_bucket_name}"
)
_workspace_storage = GCSWorkspaceStorage(
config.media_gcs_bucket_name
)
else:
storage_dir = (
config.workspace_storage_dir
if config.workspace_storage_dir
else None
)
logger.info(
f"Using local workspace storage: {storage_dir or 'default'}"
)
_workspace_storage = LocalWorkspaceStorage(storage_dir)
return _workspace_storage
# --- GCS storage (per event loop) ---
loop_id = id(asyncio.get_running_loop())
if loop_id not in _gcs_storages:
logger.info(
f"Creating GCS workspace storage for loop {loop_id}: "
f"{config.media_gcs_bucket_name}"
)
_gcs_storages[loop_id] = GCSWorkspaceStorage(config.media_gcs_bucket_name)
return _gcs_storages[loop_id]
async def shutdown_workspace_storage() -> None:
"""
Properly shutdown the global workspace storage backend.
"""Shut down workspace storage for the **current** event loop.
Closes aiohttp sessions and other resources for GCS backend.
Should be called during application shutdown.
Closes the ``aiohttp`` session owned by the current loop's GCS instance.
Each worker thread should call this on its own loop before the loop is
destroyed. The REST API lifespan hook calls it for the main server loop.
"""
global _workspace_storage
global _local_storage
if _workspace_storage is not None:
async with _storage_lock:
if _workspace_storage is not None:
if isinstance(_workspace_storage, GCSWorkspaceStorage):
await _workspace_storage.close()
_workspace_storage = None
loop_id = id(asyncio.get_running_loop())
storage = _gcs_storages.pop(loop_id, None)
if storage is not None:
await storage.close()
# Clear local storage only when the last GCS instance is gone
# (i.e. full shutdown, not just a single worker stopping).
if not _gcs_storages:
_local_storage = None
def compute_file_checksum(content: bytes) -> str:

View File

@@ -169,7 +169,10 @@ export const ChatMessagesContainer = ({
<ConversationContent className="flex flex-1 flex-col gap-6 px-3 py-6">
{headerSlot}
{isLoading && messages.length === 0 && (
<div className="flex min-h-full flex-1 items-center justify-center">
<div
className="flex flex-1 items-center justify-center"
style={{ minHeight: "calc(100vh - 12rem)" }}
>
<LoadingSpinner className="text-neutral-600" />
</div>
)}