Compare commits

...

1 Commits

Author SHA1 Message Date
Zamil Majdy
f4ff940803 fix(backend/util): rewrite SafeJson to prevent Invalid \escape errors
Completely rewrote SafeJson implementation to fix "Invalid \escape" errors
occurring in /upsert_execution_output endpoint.

**Problem:**
- Data containing literal backslash-u sequences (e.g., "\u0000" as text)
  caused JSON parse errors
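- Previous approach removed escape sequences from the serialized JSON string,
  which could leave an invalid escape behind: stripping "\u0000" out of the
  JSON text "\\u0000w" (an escaped backslash followed by "u0000w") leaves "\w"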
- Error: "Invalid \escape: line 1 column 36404 (char 36403)"

**Solution:**
- Rewritten to work on Python data structures instead of JSON strings
- Added _sanitize_value() helper that recursively walks through dicts,
  lists, and tuples to remove control characters from strings
- Eliminates serialize → sanitize → deserialize cycle
- Preserves all valid content (backslashes, paths, literal text)

**Changes:**
- Removed POSTGRES_JSON_ESCAPES regex (no longer needed)
- Added recursive _sanitize_value() function
- Simplified SafeJson() to convert Pydantic models and sanitize data
- Added "import json # noqa: F401" for backwards compatibility

**Testing:**
- Verified fix resolves the Invalid \escape error
- All existing SafeJson tests pass
- Problematic data no longer causes parsing errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 06:49:07 +00:00

View File

@@ -10,17 +10,19 @@ from pydantic import BaseModel
from .type import type_match from .type import type_match
__all__ = [
"json",
"dumps",
"loads",
"validate_with_jsonschema",
"SafeJson",
"convert_pydantic_to_json",
]
# Precompiled regex to remove PostgreSQL-incompatible control characters # Precompiled regex to remove PostgreSQL-incompatible control characters
# Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D) # Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D)
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]") POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
# Comprehensive regex to remove all PostgreSQL-incompatible control character sequences in JSON
# Handles both Unicode escapes (\\u0000-\\u0008, \\u000B-\\u000C, \\u000E-\\u001F, \\u007F)
# and JSON single-char escapes (\\b, \\f) while preserving legitimate file paths
POSTGRES_JSON_ESCAPES = re.compile(
r"\\u000[0-8]|\\u000[bB]|\\u000[cC]|\\u00[0-1][0-9a-fA-F]|\\u007[fF]|(?<!\\)\\[bf](?!\\)"
)
def to_dict(data) -> dict: def to_dict(data) -> dict:
if isinstance(data, BaseModel): if isinstance(data, BaseModel):
@@ -130,24 +132,67 @@ def convert_pydantic_to_json(output_data: Any) -> Any:
return output_data return output_data
def _sanitize_value(value: Any) -> Any:
    """
    Strip PostgreSQL-incompatible control characters from a nested value.

    Strings have every match of ``POSTGRES_CONTROL_CHARS`` removed. Dicts,
    lists and tuples are rebuilt with each member cleaned recursively (for
    dicts this covers keys as well as values). Any other object is handed
    back unchanged.

    Args:
        value: The value to sanitize

    Returns:
        The cleaned string, a new dict/list/tuple holding cleaned members,
        or the original object for every other type
    """
    # Leaf case: the only place characters are actually removed.
    if isinstance(value, str):
        return POSTGRES_CONTROL_CHARS.sub("", value)

    if isinstance(value, dict):
        cleaned = {}
        for key, item in value.items():
            # Clean the key before the value, matching comprehension order.
            clean_key = _sanitize_value(key)
            cleaned[clean_key] = _sanitize_value(item)
        return cleaned

    if isinstance(value, list):
        return list(map(_sanitize_value, value))

    if isinstance(value, tuple):
        return tuple(map(_sanitize_value, value))

    # int, float, bool, None and anything else pass through untouched.
    return value
def SafeJson(data: Any) -> Json: def SafeJson(data: Any) -> Json:
""" """
Safely serialize data and return Prisma's Json type. Safely serialize data and return Prisma's Json type.
Sanitizes null bytes to prevent PostgreSQL 22P05 errors. Sanitizes control characters to prevent PostgreSQL 22P05 errors.
This function:
1. Converts Pydantic models to dicts
2. Recursively removes PostgreSQL-incompatible control characters from strings
3. Returns a Prisma Json object safe for database storage
Args:
data: Input data to sanitize and convert to Json
Returns:
Prisma Json object with control characters removed
Examples:
>>> SafeJson({"text": "Hello\\x00World"}) # null char removed
>>> SafeJson({"path": "C:\\\\temp"}) # backslashes preserved
>>> SafeJson({"data": "Text\\\\u0000here"}) # literal backslash-u preserved
""" """
# Convert Pydantic models to dict first
if isinstance(data, BaseModel): if isinstance(data, BaseModel):
json_string = data.model_dump_json( data = data.model_dump(exclude_none=True)
warnings="error",
exclude_none=True,
fallback=lambda v: None,
)
else:
json_string = dumps(data, default=lambda v: None)
# Remove PostgreSQL-incompatible control characters in JSON string # Sanitize the data structure by removing control characters
# Single comprehensive regex handles all control character sequences sanitized_data = _sanitize_value(data)
sanitized_json = POSTGRES_JSON_ESCAPES.sub("", json_string)
# Remove any remaining raw control characters (fallback safety net) # Return as Prisma Json type
sanitized_json = POSTGRES_CONTROL_CHARS.sub("", sanitized_json) return Json(sanitized_data)
return Json(json.loads(sanitized_json))