Compare commits

...

1 Commits

Author SHA1 Message Date
Zamil Majdy
f4ff940803 fix(backend/util): rewrite SafeJson to prevent Invalid \escape errors
Completely rewrote SafeJson implementation to fix "Invalid \escape" errors
occurring in /upsert_execution_output endpoint.

**Problem:**
- Data containing literal backslash-u sequences (e.g., "\u0000" as text)
  caused JSON parse errors
- Previous approach removed escape sequences from JSON strings, which
  created invalid JSON like "\w" after removing "\\u0000"
- Error: "Invalid \escape: line 1 column 36404 (char 36403)"

**Solution:**
- Rewritten to work on Python data structures instead of JSON strings
- Added _sanitize_value() helper that recursively walks through dicts,
  lists, and tuples to remove control characters from strings
- Eliminates serialize → sanitize → deserialize cycle
- Preserves all valid content (backslashes, paths, literal text)

**Changes:**
- Removed POSTGRES_JSON_ESCAPES regex (no longer needed)
- Added recursive _sanitize_value() function
- Simplified SafeJson() to convert Pydantic models and sanitize data
- Added "import json # noqa: F401" for backwards compatibility

**Testing:**
- Verified fix resolves the Invalid \escape error
- All existing SafeJson tests pass
- Problematic data no longer causes parsing errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 06:49:07 +00:00

View File

@@ -10,17 +10,19 @@ from pydantic import BaseModel
from .type import type_match
__all__ = [
"json",
"dumps",
"loads",
"validate_with_jsonschema",
"SafeJson",
"convert_pydantic_to_json",
]
# Precompiled regex to remove PostgreSQL-incompatible control characters
# Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D)
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
# Comprehensive regex to remove all PostgreSQL-incompatible control character sequences in JSON
# Handles both Unicode escapes (\\u0000-\\u0008, \\u000B-\\u000C, \\u000E-\\u001F, \\u007F)
# and JSON single-char escapes (\\b, \\f) while preserving legitimate file paths
POSTGRES_JSON_ESCAPES = re.compile(
r"\\u000[0-8]|\\u000[bB]|\\u000[cC]|\\u00[0-1][0-9a-fA-F]|\\u007[fF]|(?<!\\)\\[bf](?!\\)"
)
def to_dict(data) -> dict:
if isinstance(data, BaseModel):
@@ -130,24 +132,67 @@ def convert_pydantic_to_json(output_data: Any) -> Any:
return output_data
def _sanitize_value(value: Any) -> Any:
"""
Recursively sanitize values by removing PostgreSQL-incompatible control characters.
This function walks through data structures and removes control characters from strings.
It handles:
- Strings: Remove control chars directly from the string
- Lists: Recursively sanitize each element
- Dicts: Recursively sanitize keys and values
- Other types: Return as-is
Args:
value: The value to sanitize
Returns:
Sanitized version of the value with control characters removed
"""
if isinstance(value, str):
# Remove control characters directly from the string
return POSTGRES_CONTROL_CHARS.sub("", value)
elif isinstance(value, dict):
# Recursively sanitize dictionary keys and values
return {_sanitize_value(k): _sanitize_value(v) for k, v in value.items()}
elif isinstance(value, list):
# Recursively sanitize list elements
return [_sanitize_value(item) for item in value]
elif isinstance(value, tuple):
# Recursively sanitize tuple elements
return tuple(_sanitize_value(item) for item in value)
else:
# For other types (int, float, bool, None, etc.), return as-is
return value
def SafeJson(data: Any) -> Json:
"""
Safely serialize data and return Prisma's Json type.
Sanitizes null bytes to prevent PostgreSQL 22P05 errors.
Sanitizes control characters to prevent PostgreSQL 22P05 errors.
This function:
1. Converts Pydantic models to dicts
2. Recursively removes PostgreSQL-incompatible control characters from strings
3. Returns a Prisma Json object safe for database storage
Args:
data: Input data to sanitize and convert to Json
Returns:
Prisma Json object with control characters removed
Examples:
>>> SafeJson({"text": "Hello\\x00World"}) # null char removed
>>> SafeJson({"path": "C:\\\\temp"}) # backslashes preserved
>>> SafeJson({"data": "Text\\\\u0000here"}) # literal backslash-u preserved
"""
# Convert Pydantic models to dict first
if isinstance(data, BaseModel):
json_string = data.model_dump_json(
warnings="error",
exclude_none=True,
fallback=lambda v: None,
)
else:
json_string = dumps(data, default=lambda v: None)
data = data.model_dump(exclude_none=True)
# Remove PostgreSQL-incompatible control characters in JSON string
# Single comprehensive regex handles all control character sequences
sanitized_json = POSTGRES_JSON_ESCAPES.sub("", json_string)
# Sanitize the data structure by removing control characters
sanitized_data = _sanitize_value(data)
# Remove any remaining raw control characters (fallback safety net)
sanitized_json = POSTGRES_CONTROL_CHARS.sub("", sanitized_json)
return Json(json.loads(sanitized_json))
# Return as Prisma Json type
return Json(sanitized_data)