Revert "fix(backend/util): rewrite SafeJson to prevent Invalid \escape errors (#11187)"

This reverts commit e62a56e8ba.
This commit is contained in:
Bentlybro
2025-10-17 08:31:23 +01:00
parent e62a56e8ba
commit 8258338caf
4 changed files with 27 additions and 127 deletions

View File

@@ -2,7 +2,7 @@ from typing import Any
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
from backend.data.model import SchemaField
from backend.util.json import loads
from backend.util.json import json
class StepThroughItemsBlock(Block):
@@ -68,7 +68,7 @@ class StepThroughItemsBlock(Block):
raise ValueError(
f"Input too large: {len(data)} bytes > {MAX_ITEM_SIZE} bytes"
)
items = loads(data)
items = json.loads(data)
else:
items = data

View File

@@ -1,3 +1,4 @@
import json
import re
from typing import Any, Type, TypeGuard, TypeVar, overload
@@ -13,6 +14,13 @@ from .type import type_match
# Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D)
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
# Comprehensive regex to remove all PostgreSQL-incompatible control character sequences in JSON
# Handles both Unicode escapes (\\u0000-\\u0008, \\u000B-\\u000C, \\u000E-\\u001F, \\u007F)
# and JSON single-char escapes (\\b, \\f) while preserving legitimate file paths
POSTGRES_JSON_ESCAPES = re.compile(
r"\\u000[0-8]|\\u000[bB]|\\u000[cC]|\\u00[0-1][0-9a-fA-F]|\\u007[fF]|(?<!\\)\\[bf](?!\\)"
)
def to_dict(data) -> dict:
if isinstance(data, BaseModel):
@@ -122,64 +130,24 @@ def convert_pydantic_to_json(output_data: Any) -> Any:
return output_data
def _sanitize_value(value: Any) -> Any:
    """
    Strip PostgreSQL-incompatible control characters from every string
    reachable inside *value*.

    Walks the structure recursively:
      * str   -> control characters removed via POSTGRES_CONTROL_CHARS
      * dict  -> both keys and values sanitized
      * list  -> each element sanitized (returns a new list)
      * tuple -> each element sanitized (returns a new tuple)
      * anything else (int, float, bool, None, ...) -> returned unchanged

    Args:
        value: Arbitrary value to sanitize.

    Returns:
        A sanitized copy of *value*; non-string scalars pass through as-is.
    """
    # Guard-clause dispatch on type; recursion bottoms out at scalars.
    if isinstance(value, str):
        return POSTGRES_CONTROL_CHARS.sub("", value)
    if isinstance(value, dict):
        cleaned: dict = {}
        for key, item in value.items():
            cleaned[_sanitize_value(key)] = _sanitize_value(item)
        return cleaned
    if isinstance(value, list):
        return [_sanitize_value(entry) for entry in value]
    if isinstance(value, tuple):
        return tuple(_sanitize_value(entry) for entry in value)
    return value
def SafeJson(data: Any) -> Json:
# NOTE(review): this span is a unified diff with the +/- markers stripped — it
# interleaves the PRE-revert implementation (sanitize the Python structure via
# _sanitize_value, returning at the first `return Json(...)` below) with the
# POST-revert implementation (serialize to a JSON string first, then scrub
# escape sequences with regexes, returning at the last line). It is NOT valid
# standalone Python (two conflicting code paths, indentation lost in
# extraction). Reconstruct the intended side from commit 8258338caf before
# editing — do not treat this text as runnable.
"""
Safely serialize data and return Prisma's Json type.
Sanitizes control characters to prevent PostgreSQL 22P05 errors.
This function:
1. Converts Pydantic models to dicts
2. Recursively removes PostgreSQL-incompatible control characters from strings
3. Returns a Prisma Json object safe for database storage
Args:
data: Input data to sanitize and convert to Json
Returns:
Prisma Json object with control characters removed
Examples:
>>> SafeJson({"text": "Hello\\x00World"})  # null char removed
>>> SafeJson({"path": "C:\\\\temp"})  # backslashes preserved
>>> SafeJson({"data": "Text\\\\u0000here"})  # literal backslash-u preserved
Sanitizes null bytes to prevent PostgreSQL 22P05 errors.
"""
# Convert Pydantic models to dict first
if isinstance(data, BaseModel):
# PRE-revert branch (presumably the removed side of the diff):
data = data.model_dump(exclude_none=True)
# POST-revert branch (presumably the added side): serialize the model
# directly; fallback=lambda v: None replaces unserializable values.
json_string = data.model_dump_json(
warnings="error",
exclude_none=True,
fallback=lambda v: None,
)
else:
json_string = dumps(data, default=lambda v: None)
# Return as Prisma Json type
# PRE-revert return path: sanitize the in-memory structure recursively.
return Json(_sanitize_value(data))
# POST-revert return path (unreachable as rendered — diff artifact):
# Remove PostgreSQL-incompatible control characters in JSON string
# Single comprehensive regex handles all control character sequences
sanitized_json = POSTGRES_JSON_ESCAPES.sub("", json_string)
# Remove any remaining raw control characters (fallback safety net)
sanitized_json = POSTGRES_CONTROL_CHARS.sub("", sanitized_json)
return Json(json.loads(sanitized_json))

View File

@@ -13,7 +13,7 @@ import idna
from aiohttp import FormData, abc
from tenacity import retry, retry_if_result, wait_exponential_jitter
from backend.util.json import loads
from backend.util.json import json
# Retry status codes for which we will automatically retry the request
THROTTLE_RETRY_STATUS_CODES: set[int] = {429, 500, 502, 503, 504, 408}
@@ -259,7 +259,7 @@ class Response:
"""
Parse the body as JSON and return the resulting Python object.
"""
return loads(
return json.loads(
self.content.decode(encoding or "utf-8", errors="replace"), **kwargs
)

View File

@@ -411,71 +411,3 @@ class TestSafeJson:
assert "C:\\temp\\file" in str(file_path_with_null)
assert ".txt" in str(file_path_with_null)
assert "\x00" not in str(file_path_with_null) # Null removed from path
def test_invalid_escape_error_prevention(self):
    """Test that SafeJson prevents 'Invalid \\escape' errors that occurred in upsert_execution_output."""
    # Reproduces the failure mode behind:
    #   POST /upsert_execution_output failed: Invalid \escape: line 1 column 36404 (char 36403)
    # by mixing raw control characters, Windows paths, literal \uXXXX text,
    # and a ~40 KB payload like the one in the original error report.
    problematic_output_data = {
        "web_content": "Article text\x00with null\x01and control\x08chars\x0C\x1F\x7F",
        "file_path": "C:\\Users\\test\\file\x00.txt",
        "json_like_string": '{"text": "data\x00\x08\x1F"}',
        "escaped_sequences": "Text with \\u0000 and \\u0008 sequences",
        "mixed_content": "Normal text\tproperly\nformatted\rwith\x00invalid\x08chars\x1Fmixed",
        "large_text": "A" * 35000
        + "\x00\x08\x1F"
        + "B" * 5000,  # Large text like in the error
    }

    # Serialization itself must not raise any JSON parsing errors.
    result = SafeJson(problematic_output_data)
    assert isinstance(result, Json)

    # The wrapped value must be a plain dict safe for PostgreSQL storage.
    result_data = cast(dict[str, Any], result.data)
    assert isinstance(result_data, dict)

    web_content = str(result_data.get("web_content", ""))
    file_path = str(result_data.get("file_path", ""))
    large_text = str(result_data.get("large_text", ""))

    # Control characters are stripped from the web content...
    for forbidden in ("\x00", "\x01", "\x08", "\x0C", "\x1F", "\x7F"):
        assert forbidden not in web_content

    # ...while the legitimate text around them survives.
    for expected in ("Article text", "with null", "and control", "chars"):
        assert expected in web_content

    # File paths keep their backslashes, losing only the null byte.
    assert "C:\\Users\\test\\file" in file_path
    assert ".txt" in file_path
    assert "\x00" not in file_path

    # Large payloads (the char-36403 scenario) keep their bulk content.
    assert len(large_text) > 35000  # Content preserved
    assert "A" * 1000 in large_text  # A's preserved
    assert "B" * 1000 in large_text  # B's preserved
    for forbidden in ("\x00", "\x08", "\x1F"):
        assert forbidden not in large_text  # Control chars removed

    # Most importantly: the result round-trips through json without errors —
    # this is exactly what failed with the old approach.
    import json

    json_string = json.dumps(result.data)  # Should not raise "Invalid \escape"
    assert len(json_string) > 0
    parsed_back = json.loads(json_string)
    assert isinstance(parsed_back, dict)