mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
Revert "fix(backend/util): rewrite SafeJson to prevent Invalid \escape errors (#11187)"
This reverts commit e62a56e8ba.
This commit is contained in:
@@ -2,7 +2,7 @@ from typing import Any
|
||||
|
||||
from backend.data.block import Block, BlockCategory, BlockOutput, BlockSchema
|
||||
from backend.data.model import SchemaField
|
||||
from backend.util.json import loads
|
||||
from backend.util.json import json
|
||||
|
||||
|
||||
class StepThroughItemsBlock(Block):
|
||||
@@ -68,7 +68,7 @@ class StepThroughItemsBlock(Block):
|
||||
raise ValueError(
|
||||
f"Input too large: {len(data)} bytes > {MAX_ITEM_SIZE} bytes"
|
||||
)
|
||||
items = loads(data)
|
||||
items = json.loads(data)
|
||||
else:
|
||||
items = data
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Type, TypeGuard, TypeVar, overload
|
||||
|
||||
@@ -13,6 +14,13 @@ from .type import type_match
|
||||
# Raw control characters that PostgreSQL rejects in stored text/JSON:
# \u0000-\u0008, \u000B-\u000C, \u000E-\u001F and \u007F.
# Tab (\u0009), newline (\u000A) and carriage return (\u000D) are kept.
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")

# The same characters as they appear *escaped* inside serialized JSON text:
#   - Unicode escapes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F
#   - the single-character escapes \b and \f (look-arounds keep file paths such
#     as C:\\books intact, where the backslash itself is escaped)
# The escapes \u0009, \u000A and \u000D are deliberately NOT matched so that tab,
# newline and carriage return survive, mirroring POSTGRES_CONTROL_CHARS above.
# NOTE(review): the \uXXXX alternatives carry no look-behind, so an escaped
# backslash followed by the text "u0000" would presumably still be matched -
# verify against callers before relying on that case.
POSTGRES_JSON_ESCAPES = re.compile(
    r"\\u000[0-8bBcCeEfF]|\\u001[0-9a-fA-F]|\\u007[fF]|(?<!\\)\\[bf](?!\\)"
)
|
||||
|
||||
|
||||
def to_dict(data) -> dict:
|
||||
if isinstance(data, BaseModel):
|
||||
@@ -122,64 +130,24 @@ def convert_pydantic_to_json(output_data: Any) -> Any:
|
||||
return output_data
|
||||
|
||||
|
||||
def _sanitize_value(value: Any) -> Any:
    """
    Strip PostgreSQL-incompatible control characters from a (possibly nested) value.

    Strings are cleaned with POSTGRES_CONTROL_CHARS. Dicts, lists and tuples are
    rebuilt with every key/element cleaned recursively. Any other type is handed
    back untouched.

    Args:
        value: The value to sanitize

    Returns:
        A sanitized copy for str/dict/list/tuple inputs; the original object otherwise
    """
    if isinstance(value, str):
        return POSTGRES_CONTROL_CHARS.sub("", value)

    if isinstance(value, dict):
        cleaned = {}
        for raw_key, raw_item in value.items():
            # Keys are sanitized too, so a key made only of control chars becomes "".
            clean_key = _sanitize_value(raw_key)
            cleaned[clean_key] = _sanitize_value(raw_item)
        return cleaned

    if isinstance(value, list):
        return list(map(_sanitize_value, value))

    if isinstance(value, tuple):
        return tuple(map(_sanitize_value, value))

    # int, float, bool, None and anything else pass through unchanged.
    return value
|
||||
|
||||
|
||||
def SafeJson(data: Any) -> Json:
    """
    Safely convert data to Prisma's Json type.
    Sanitizes control characters to prevent PostgreSQL 22P05 errors.

    This function:
    1. Converts Pydantic models to dicts (fields set to None are excluded)
    2. Recursively removes PostgreSQL-incompatible control characters from strings
    3. Returns a Prisma Json object safe for database storage

    The data is sanitized as a Python structure rather than as serialized JSON
    text, so backslashes inside strings are never rewritten.

    Args:
        data: Input data to sanitize and convert to Json

    Returns:
        Prisma Json object with control characters removed

    Examples:
        >>> SafeJson({"text": "Hello\\x00World"})  # null char removed
        >>> SafeJson({"path": "C:\\\\temp"})  # backslashes preserved
        >>> SafeJson({"data": "Text\\\\u0000here"})  # literal backslash-u preserved
    """
    # Convert Pydantic models to dict first; the sanitizer only walks plain containers.
    if isinstance(data, BaseModel):
        data = data.model_dump(exclude_none=True)

    # Return as Prisma Json type
    return Json(_sanitize_value(data))
|
||||
|
||||
@@ -13,7 +13,7 @@ import idna
|
||||
from aiohttp import FormData, abc
|
||||
from tenacity import retry, retry_if_result, wait_exponential_jitter
|
||||
|
||||
from backend.util.json import loads
|
||||
from backend.util.json import json
|
||||
|
||||
# Retry status codes for which we will automatically retry the request
|
||||
THROTTLE_RETRY_STATUS_CODES: set[int] = {429, 500, 502, 503, 504, 408}
|
||||
@@ -259,7 +259,7 @@ class Response:
|
||||
"""
|
||||
Parse the body as JSON and return the resulting Python object.
|
||||
"""
|
||||
return loads(
|
||||
return json.loads(
|
||||
self.content.decode(encoding or "utf-8", errors="replace"), **kwargs
|
||||
)
|
||||
|
||||
|
||||
@@ -411,71 +411,3 @@ class TestSafeJson:
|
||||
assert "C:\\temp\\file" in str(file_path_with_null)
|
||||
assert ".txt" in str(file_path_with_null)
|
||||
assert "\x00" not in str(file_path_with_null) # Null removed from path
|
||||
|
||||
def test_invalid_escape_error_prevention(self):
    """Regression test: SafeJson must not produce 'Invalid \\escape' failures.

    Mirrors the upsert_execution_output scenario, where a large payload holding
    control characters failed with:
    Invalid \\escape: line 1 column 36404 (char 36403)
    """
    import json

    # Payload mixing raw control characters, Windows paths and literal escapes.
    problematic_output_data = {
        "web_content": "Article text\x00with null\x01and control\x08chars\x0C\x1F\x7F",
        "file_path": "C:\\Users\\test\\file\x00.txt",
        "json_like_string": '{"text": "data\x00\x08\x1F"}',
        "escaped_sequences": "Text with \\u0000 and \\u0008 sequences",
        "mixed_content": "Normal text\tproperly\nformatted\rwith\x00invalid\x08chars\x1Fmixed",
        # Large text like in the error
        "large_text": "A" * 35000 + "\x00\x08\x1F" + "B" * 5000,
    }

    # Conversion itself must not raise.
    result = SafeJson(problematic_output_data)
    assert isinstance(result, Json)

    result_data = cast(dict[str, Any], result.data)
    assert isinstance(result_data, dict)

    web_content = str(result_data.get("web_content", ""))
    file_path = str(result_data.get("file_path", ""))
    large_text = str(result_data.get("large_text", ""))

    # Control characters are gone from the web content...
    for forbidden in ("\x00", "\x01", "\x08", "\x0C", "\x1F", "\x7F"):
        assert forbidden not in web_content

    # ...while the readable text around them is intact.
    for expected in ("Article text", "with null", "and control", "chars"):
        assert expected in web_content

    # File path: backslashes kept, null byte dropped.
    assert "C:\\Users\\test\\file" in file_path
    assert ".txt" in file_path
    assert "\x00" not in file_path

    # Large text (the scenario from the error at char 36403).
    assert len(large_text) > 35000
    assert "A" * 1000 in large_text
    assert "B" * 1000 in large_text
    for forbidden in ("\x00", "\x08", "\x1F"):
        assert forbidden not in large_text

    # The sanitized data must round-trip through the json module without errors.
    json_string = json.dumps(result.data)
    assert len(json_string) > 0

    parsed_back = json.loads(json_string)
    assert isinstance(parsed_back, dict)
|
||||
|
||||
Reference in New Issue
Block a user