mirror of
https://github.com/Significant-Gravitas/AutoGPT.git
synced 2026-04-08 03:00:28 -04:00
### Why / What / How **Why:** GitHub's code scanning detected a HIGH severity security vulnerability in `/autogpt_platform/backend/backend/util/json.py:172`. The error handler in `sanitize_json()` was logging sensitive data (potentially including secrets, API keys, credentials) as clear text when serialization fails. **What:** This PR removes the logging of actual data content from the error handler while preserving useful debugging metadata (error type, error message, and data type). **How:** Removed the `"Data preview: %s"` format parameter and the corresponding `truncate(str(data), 100)` argument from the logger.error() call. The error handler now logs only safe metadata that helps debugging without exposing sensitive information. ### Changes 🏗️ - **Security Fix**: Modified `sanitize_json()` function in `backend/util/json.py` - Removed logging of data content (`truncate(str(data), 100)`) from the error handler - Retained logging of error type (`type(e).__name__`) - Retained logging of truncated error message (`truncate(str(e), 200)`) - Retained logging of data type (`type(data).__name__`) - Error handler still provides useful debugging information without exposing secrets ### Checklist 📋 #### For code changes: - [x] I have clearly listed my changes in the PR description - [x] I have made a test plan - [x] I have tested my changes according to the test plan: - [x] Verified the code passes type checking (`poetry run pyright backend/util/json.py`) - [x] Verified the code passes linting (`poetry run ruff check backend/util/json.py`) - [x] Verified all pre-commit hooks pass - [x] Reviewed the diff to ensure only the sensitive data logging was removed - [x] Confirmed that useful debugging information (error type, error message, data type) is still logged #### For configuration changes: - N/A - No configuration changes required
205 lines
6.2 KiB
Python
205 lines
6.2 KiB
Python
import logging
|
|
import re
|
|
from typing import Any, Type, TypeVar, overload
|
|
|
|
import jsonschema
|
|
import orjson
|
|
from fastapi.encoders import jsonable_encoder as to_dict
|
|
from prisma import Json
|
|
|
|
from .truncate import truncate
|
|
from .type import type_match
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Precompiled regex matching PostgreSQL-incompatible control characters.
# Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F while keeping
# tab (\u0009), newline (\u000A), and carriage return (\u000D).
# PostgreSQL rejects these characters in text columns (error 22P05).
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
|
|
|
|
|
|
def dumps(
    data: Any, *args: Any, indent: int | None = None, option: int = 0, **kwargs: Any
) -> str:
    """Serialize *data* to a JSON string using orjson.

    The input is first normalized with FastAPI's ``jsonable_encoder``
    (aliased as ``to_dict``), so Pydantic models and other complex types
    are converted to plain JSON-compatible structures before encoding.

    Parameters
    ----------
    data : Any
        The value to serialize — Pydantic models, dicts, lists, etc.
    *args : Any
        Additional positional arguments (accepted for stdlib-json
        call-site compatibility; unused).
    indent : int | None
        If not None, pretty-print. orjson only supports 2-space
        indentation, so any non-None value maps to ``OPT_INDENT_2``.
    option : int
        orjson option flags (default: 0).
    **kwargs : Any
        Extra keyword arguments. Only ``default`` is forwarded to orjson;
        stdlib-only parameters such as ``ensure_ascii`` and ``separators``
        are silently ignored (orjson always emits UTF-8 and compact
        separators).

    Returns
    -------
    str
        JSON string representation of the data.

    Examples
    --------
    >>> dumps({"name": "Alice", "age": 30})
    '{"name":"Alice","age":30}'
    """
    encodable = to_dict(data)

    # A request for indentation may arrive either via the keyword-only
    # parameter or as a stdlib-style kwarg; both map to orjson's single
    # 2-space indent option.
    if indent is not None or kwargs.get("indent") is not None:
        option |= orjson.OPT_INDENT_2

    # orjson accepts only a narrow set of keyword arguments; drop
    # stdlib-json parameters it would reject.
    #   ensure_ascii: unnecessary — orjson always produces valid UTF-8
    #   separators:   orjson is compact by default
    passthrough = {key: val for key, val in kwargs.items() if key == "default"}

    encoded: bytes = orjson.dumps(encodable, option=option, **passthrough)
    return encoded.decode("utf-8")
|
|
|
|
|
|
T = TypeVar("T")


# Sentinel object distinguishing "caller passed no fallback" from the
# perfectly legal "caller passed fallback=None".
_NO_FALLBACK = object()


@overload
def loads(
    data: str | bytes, *args, target_type: Type[T], fallback: T | None = None, **kwargs
) -> T: ...


@overload
def loads(data: str | bytes, *args, fallback: Any = None, **kwargs) -> Any: ...


def loads(
    data: str | bytes,
    *args,
    target_type: Type[T] | None = None,
    fallback: Any = _NO_FALLBACK,
    **kwargs,
) -> Any:
    """Parse JSON with an optional fallback on decode errors.

    Args:
        data: JSON string or bytes to parse.
        target_type: Optional type to validate/cast the result to.
        fallback: Value returned on JSONDecodeError. If omitted, the
            decode error propagates to the caller.
        **kwargs: Additional arguments (unused, kept for compatibility).

    Returns:
        The parsed JSON data, or ``fallback`` when parsing fails and a
        fallback was supplied.

    Raises:
        orjson.JSONDecodeError: Only when no fallback was provided.

    Examples:
        >>> loads('{"valid": "json"}')
        {'valid': 'json'}
        >>> loads('invalid json', fallback=None)
        None
        >>> loads('invalid json', fallback={})
        {}
        >>> loads('invalid json')  # raises orjson.JSONDecodeError
    """
    try:
        parsed = orjson.loads(data)
    except orjson.JSONDecodeError:
        # Propagate only when the caller gave us nothing to fall back on.
        if fallback is _NO_FALLBACK:
            raise
        return fallback

    return type_match(parsed, target_type) if target_type else parsed
|
|
|
|
|
|
def validate_with_jsonschema(
    schema: dict[str, Any], data: dict[str, Any]
) -> str | None:
    """Validate *data* against a JSON schema.

    Args:
        schema: The JSON schema to validate against.
        data: The data to check.

    Returns:
        ``None`` when the data conforms to the schema, otherwise the
        validation error rendered as a string.
    """
    try:
        jsonschema.validate(data, schema)
    except jsonschema.ValidationError as err:
        return str(err)
    return None
|
|
|
|
|
|
def sanitize_string(value: str) -> str:
    """Strip PostgreSQL-rejected control characters from *value*.

    Drops \\x00-\\x08, \\x0B-\\x0C, \\x0E-\\x1F and \\x7F; tab, newline,
    and carriage return survive. Run this over free-form text before it
    lands in a PostgreSQL text/varchar column.
    """
    cleaned = POSTGRES_CONTROL_CHARS.sub("", value)
    return cleaned
|
|
|
|
|
|
def sanitize_json(data: Any) -> Any:
    """Return a JSON-serializable copy of *data* with control characters removed.

    Two-pass approach for consistent string sanitization:
      1. ``to_dict`` normalizes Pydantic models and other complex objects
         into plain JSON-compatible types.
      2. A second ``to_dict`` pass applies :func:`sanitize_string` to every
         string via a custom encoder.

    On any serialization failure, the whole value is stringified and
    sanitized instead. The error log deliberately records metadata only
    (error type, truncated message, input type) — never the payload
    itself, which may contain secrets or credentials.
    """
    try:
        normalized = to_dict(data)
        return to_dict(normalized, custom_encoder={str: sanitize_string})
    except Exception as exc:
        # NOTE(security): do not log the data content here — it may hold
        # API keys or other sensitive values.
        logger.error(
            "SafeJson fallback to string representation due to serialization error: %s (%s). "
            "Data type: %s",
            type(exc).__name__,
            truncate(str(exc), 200),
            type(data).__name__,
        )
        # Ultimate fallback: stringify the value and scrub control chars.
        return sanitize_string(str(data))
|
|
|
|
|
|
class SafeJson(Json):
    """
    Safely serialize data and return Prisma's Json type.
    Sanitizes control characters to prevent PostgreSQL 22P05 errors.

    This class:
    1. Converts Pydantic models to dicts (recursively using to_dict)
    2. Recursively removes PostgreSQL-incompatible control characters from strings
    3. Wraps the result in a Prisma Json object safe for database storage

    Uses to_dict (jsonable_encoder) with a custom encoder to handle both Pydantic
    conversion and control character sanitization in a two-pass approach.

    Args:
        data: Input data to sanitize and convert to Json

    Returns:
        Prisma Json object with control characters removed

    Examples:
        >>> SafeJson({"text": "Hello\\x00World"})  # null char removed
        >>> SafeJson({"path": "C:\\\\temp"})  # backslashes preserved
        >>> SafeJson({"data": "Text\\\\u0000here"})  # literal backslash-u preserved
    """

    def __init__(self, data: Any):
        # Sanitize first, then hand the clean structure to Prisma's Json.
        super().__init__(sanitize_json(data))