diff --git a/autogpt_platform/backend/backend/copilot/sdk/cli_openrouter_compat_test.py b/autogpt_platform/backend/backend/copilot/sdk/cli_openrouter_compat_test.py
index 56b8bc2dd6..ffc3590e20 100644
--- a/autogpt_platform/backend/backend/copilot/sdk/cli_openrouter_compat_test.py
+++ b/autogpt_platform/backend/backend/copilot/sdk/cli_openrouter_compat_test.py
@@ -57,6 +57,7 @@ import asyncio
 import json
 import logging
 import os
+import re
 import subprocess
 from pathlib import Path
 from typing import Any
@@ -71,12 +72,21 @@ logger = logging.getLogger(__name__)
 # Forbidden patterns we scan for in captured request bodies
 # ---------------------------------------------------------------------------
 
-# Substring of the `tool_reference` content block that breaks OpenRouter's
-# stricter Zod validation in tool_result.content. PR #12294 root-cause.
-_FORBIDDEN_TOOL_REFERENCE = '"type": "tool_reference"'
+# Match the `tool_reference` content block that breaks OpenRouter's stricter
+# Zod validation in tool_result.content. PR #12294 root-cause.
+#
+# We use a whitespace-tolerant regex rather than a literal substring because
+# the Claude Code CLI is Node.js and `JSON.stringify` without an indent
+# argument emits no whitespace between the key, colon, and value
+# (`{"type":"tool_reference"}`), while a Python serializer would emit
+# `{"type": "tool_reference"}`. A naive substring with one specific spacing
+# would silently miss the real regression.
+_FORBIDDEN_TOOL_REFERENCE_RE = re.compile(r'"type"\s*:\s*"tool_reference"')
 
 # Beta string OpenRouter rejects in upstream issue #789. Can appear in
-# either `betas` arrays or the `anthropic-beta` header value.
+# either `betas` arrays or the `anthropic-beta` header value. This is a
+# unique opaque token (no JSON punctuation around it that could vary), so
+# a plain substring match is robust.
 _FORBIDDEN_CONTEXT_MANAGEMENT_BETA = "context-management-2025-06-27"
 
 
@@ -90,7 +100,7 @@ def _scan_request_for_forbidden_patterns(
     OpenRouter-incompatible features.
     """
     findings: list[str] = []
-    if _FORBIDDEN_TOOL_REFERENCE in body_text:
+    if _FORBIDDEN_TOOL_REFERENCE_RE.search(body_text):
         findings.append(
             "`tool_reference` content block in request body — "
             "PR #12294 / CLI 2.1.69 regression"
@@ -188,6 +198,7 @@ async def _start_fake_anthropic_server(
     answered with a valid streaming response.  Returns ``(runner, port)``
     so the caller can ``await runner.cleanup()`` when finished.
     """
+    import socket
 
     async def messages_handler(request: web.Request) -> web.StreamResponse:
         body = await request.text()
@@ -213,22 +224,28 @@ async def _start_fake_anthropic_server(
         await response.write_eof()
         return response
 
+    async def fallback_handler(_request: web.Request) -> web.Response:
+        # OAuth/profile endpoints the CLI may probe — answer 404 so it
+        # falls through quickly without retrying.
+        return web.Response(status=404)
+
     app = web.Application()
     app.router.add_post("/v1/messages", messages_handler)
-    # OAuth/profile endpoints the CLI may probe — answer 404 so it falls
-    # through quickly without retrying.
-    app.router.add_route("*", "/{tail:.*}", lambda _r: web.Response(status=404))
+    app.router.add_route("*", "/{tail:.*}", fallback_handler)
+
+    # Bind an ephemeral port ourselves so we can read it back via the
+    # public ``getsockname`` API rather than reaching into ``site._server``
+    # private aiohttp internals.
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    sock.bind(("127.0.0.1", 0))
+    port: int = sock.getsockname()[1]
 
     runner = web.AppRunner(app)
     await runner.setup()
-    site = web.TCPSite(runner, "127.0.0.1", 0)
+    site = web.SockSite(runner, sock)
     await site.start()
 
-    server = site._server
-    assert server is not None
-    sockets = getattr(server, "sockets", None)
-    assert sockets is not None
-    port: int = sockets[0].getsockname()[1]
     return runner, port
 
 
@@ -335,6 +352,14 @@ async def _run_cli_against_fake_server(
             proc.kill()
         except ProcessLookupError:
             pass
+        # Reap the process to avoid leaving a zombie + open pipe FDs.
+        # Without this the asyncio transport keeps the stdout/stderr
+        # pipes alive until the loop exits, and in CI loops where this
+        # test runs many times the file-descriptor count creeps up.
+        try:
+            await asyncio.wait_for(proc.wait(), timeout=5.0)
+        except (asyncio.TimeoutError, TimeoutError):
+            pass
         stdout_bytes, stderr_bytes = b"", b""
 
     return (
@@ -534,13 +559,14 @@ class TestResolveCliPath:
     def test_returns_none_when_env_var_points_to_missing_file(self, monkeypatch):
         monkeypatch.delenv("CHAT_CLAUDE_AGENT_CLI_PATH", raising=False)
         monkeypatch.setenv("CLAUDE_AGENT_CLI_PATH", "/nonexistent/path/to/claude")
-        # Should fall through to the bundled binary OR return None,
-        # but never raise.
+        # When the override is set but the file is missing, the resolver
+        # returns ``None`` outright — it does NOT silently fall through to
+        # the bundled binary, because doing so would defeat the purpose of
+        # the override (the operator explicitly asked for a specific path).
+        # The strict ``is None`` assertion catches any future regression
+        # that swaps this fail-loud behaviour for a silent fallback.
         resolved = _resolve_cli_path()
-        # We can't assert exact value (depends on whether the bundled
-        # CLI is installed in the test env) but the function must not
-        # raise — the caller is supposed to handle None gracefully.
-        assert resolved is None or resolved.is_file()
+        assert resolved is None
 
     def test_falls_back_to_bundled_when_env_var_unset(self, monkeypatch):
         monkeypatch.delenv("CLAUDE_AGENT_CLI_PATH", raising=False)