Merge branch 'dev' into swiftyos/secrt-1706-improve-store-search

2026-01-22 05:28:02 -05:00 · 2025-10-17 11:40:34 +02:00
parent e5e2d45937 dfdd632161
commit f7724ede8e
3 changed files with 311 additions and 64 deletions
--- a/autogpt_platform/backend/backend/executor/manager.py
+++ b/autogpt_platform/backend/backend/executor/manager.py
@@ -246,7 +246,7 @@ async def execute_node(
        async for output_name, output_data in node_block.execute(
            input_data, **extra_exec_kwargs
        ):
-            output_data = json.convert_pydantic_to_json(output_data)
+            output_data = json.to_dict(output_data)
            output_size += len(json.dumps(output_data))
            log_metadata.debug("Node produced output", **{output_name: output_data})
            yield output_name, output_data
--- a/autogpt_platform/backend/backend/util/json.py
+++ b/autogpt_platform/backend/backend/util/json.py
@@ -1,25 +1,22 @@
+import logging
 import re
-from typing import Any, Type, TypeGuard, TypeVar, overload
+from typing import Any, Type, TypeVar, overload

 import jsonschema
 import orjson
-from fastapi.encoders import jsonable_encoder
+from fastapi.encoders import jsonable_encoder as to_dict
 from prisma import Json
-from pydantic import BaseModel

+from .truncate import truncate
 from .type import type_match

+logger = logging.getLogger(__name__)
+
 # Precompiled regex to remove PostgreSQL-incompatible control characters
 # Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D)
 POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")


-def to_dict(data) -> dict:
-    if isinstance(data, BaseModel):
-        data = data.model_dump()
-    return jsonable_encoder(data)
-
-
 def dumps(
    data: Any, *args: Any, indent: int | None = None, option: int = 0, **kwargs: Any
 ) -> str:
@@ -108,64 +105,19 @@ def validate_with_jsonschema(
        return str(e)


-def is_list_of_basemodels(value: object) -> TypeGuard[list[BaseModel]]:
-    return isinstance(value, list) and all(
-        isinstance(item, BaseModel) for item in value
-    )
-
-
-def convert_pydantic_to_json(output_data: Any) -> Any:
-    if isinstance(output_data, BaseModel):
-        return output_data.model_dump()
-    if is_list_of_basemodels(output_data):
-        return [item.model_dump() for item in output_data]
-    return output_data
-
-
-def _sanitize_value(value: Any) -> Any:
-    """
-    Recursively sanitize values by removing PostgreSQL-incompatible control characters.
-
-    This function walks through data structures and removes control characters from strings.
-    It handles:
-    - Strings: Remove control chars directly from the string
-    - Lists: Recursively sanitize each element
-    - Dicts: Recursively sanitize keys and values
-    - Other types: Return as-is
-
-    Args:
-        value: The value to sanitize
-
-    Returns:
-        Sanitized version of the value with control characters removed
-    """
-    if isinstance(value, str):
-        # Remove control characters directly from the string
-        return POSTGRES_CONTROL_CHARS.sub("", value)
-    elif isinstance(value, dict):
-        # Recursively sanitize dictionary keys and values
-        return {_sanitize_value(k): _sanitize_value(v) for k, v in value.items()}
-    elif isinstance(value, list):
-        # Recursively sanitize list elements
-        return [_sanitize_value(item) for item in value]
-    elif isinstance(value, tuple):
-        # Recursively sanitize tuple elements
-        return tuple(_sanitize_value(item) for item in value)
-    else:
-        # For other types (int, float, bool, None, etc.), return as-is
-        return value
-
-
 def SafeJson(data: Any) -> Json:
    """
    Safely serialize data and return Prisma's Json type.
    Sanitizes control characters to prevent PostgreSQL 22P05 errors.

    This function:
-    1. Converts Pydantic models to dicts
+    1. Converts Pydantic models to dicts (recursively using to_dict)
    2. Recursively removes PostgreSQL-incompatible control characters from strings
    3. Returns a Prisma Json object safe for database storage

+    Uses to_dict (jsonable_encoder) in two passes: the first converts Pydantic models
+    to plain JSON types; the second applies a str encoder that strips control characters.
+
    Args:
        data: Input data to sanitize and convert to Json

@@ -177,9 +129,29 @@ def SafeJson(data: Any) -> Json:
        >>> SafeJson({"path": "C:\\\\temp"})  # backslashes preserved
        >>> SafeJson({"data": "Text\\\\u0000here"})  # literal backslash-u preserved
    """
-    # Convert Pydantic models to dict first
-    if isinstance(data, BaseModel):
-        data = data.model_dump(exclude_none=True)

-    # Return as Prisma Json type
-    return Json(_sanitize_value(data))
+    def _sanitize_string(value: str) -> str:
+        """Remove PostgreSQL-incompatible control characters from string."""
+        return POSTGRES_CONTROL_CHARS.sub("", value)
+
+    try:
+        # Use two-pass approach for consistent string sanitization:
+        # 1. First convert to basic JSON-serializable types (handles Pydantic models)
+        # 2. Then sanitize strings in the result
+        basic_result = to_dict(data)
+        sanitized_result = to_dict(basic_result, custom_encoder={str: _sanitize_string})
+        return Json(sanitized_result)
+    except Exception as e:
+        # Log the failure and fall back to string representation
+        logger.error(
+            "SafeJson fallback to string representation due to serialization error: %s (%s). "
+            "Data type: %s, Data preview: %s",
+            type(e).__name__,
+            truncate(str(e), 200),
+            type(data).__name__,
+            truncate(str(data), 100),
+        )
+
+        # Ultimate fallback: convert to string representation and sanitize
+        sanitized = _sanitize_string(str(data))
+        return Json(sanitized)
--- a/autogpt_platform/backend/backend/util/test_json.py
+++ b/autogpt_platform/backend/backend/util/test_json.py
@@ -479,3 +479,278 @@ class TestSafeJson:
        # And can be parsed back
        parsed_back = json.loads(json_string)
        assert isinstance(parsed_back, dict)
+
+    def test_dict_containing_pydantic_models(self):
+        """Test that dicts containing Pydantic models are properly serialized."""
+        # This reproduces the bug from PR #11187 where credential_inputs failed
+        model1 = SamplePydanticModel(name="Alice", age=30)
+        model2 = SamplePydanticModel(name="Bob", age=25)
+
+        data = {
+            "user1": model1,
+            "user2": model2,
+            "regular_data": "test",
+        }
+
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        # Verify it can be JSON serialized (this was the bug)
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "Alice" in json_string
+        assert "Bob" in json_string
+
+    def test_nested_pydantic_in_dict(self):
+        """Test deeply nested Pydantic models in dicts."""
+        inner_model = SamplePydanticModel(name="Inner", age=20)
+        middle_model = SamplePydanticModel(
+            name="Middle", age=30, metadata={"inner": inner_model}
+        )
+
+        data = {
+            "level1": {
+                "level2": {
+                    "model": middle_model,
+                    "other": "data",
+                }
+            }
+        }
+
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "Middle" in json_string
+        assert "Inner" in json_string
+
+    def test_list_containing_pydantic_models_in_dict(self):
+        """Test list of Pydantic models inside a dict."""
+        models = [SamplePydanticModel(name=f"User{i}", age=20 + i) for i in range(5)]
+
+        data = {
+            "users": models,
+            "count": len(models),
+        }
+
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "User0" in json_string
+        assert "User4" in json_string
+
+    def test_credentials_meta_input_scenario(self):
+        """Test the exact scenario from create_graph_execution that was failing."""
+
+        # Simulate CredentialsMetaInput structure
+        class MockCredentialsMetaInput(BaseModel):
+            id: str
+            title: Optional[str] = None
+            provider: str
+            type: str
+
+        cred_input = MockCredentialsMetaInput(
+            id="test-123", title="Test Credentials", provider="github", type="oauth2"
+        )
+
+        # This is how credential_inputs is structured in create_graph_execution
+        credential_inputs = {"github_creds": cred_input}
+
+        # This should work without TypeError
+        result = SafeJson(credential_inputs)
+        assert isinstance(result, Json)
+
+        # Verify it can be JSON serialized
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "test-123" in json_string
+        assert "github" in json_string
+        assert "oauth2" in json_string
+
+    def test_mixed_pydantic_and_primitives(self):
+        """Test complex mix of Pydantic models and primitive types."""
+        model = SamplePydanticModel(name="Test", age=25)
+
+        data = {
+            "models": [model, {"plain": "dict"}, "string", 123],
+            "nested": {
+                "model": model,
+                "list": [1, 2, model, 4],
+                "plain": "text",
+            },
+            "plain_list": [1, 2, 3],
+        }
+
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "Test" in json_string
+        assert "plain" in json_string
+
+    def test_pydantic_model_with_control_chars_in_dict(self):
+        """Test Pydantic model with control chars when nested in dict."""
+        model = SamplePydanticModel(
+            name="Test\x00User",  # Has null byte
+            age=30,
+            metadata={"info": "data\x08with\x0Ccontrols"},
+        )
+
+        data = {"credential": model}
+
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        # Verify control characters are removed
+        import json
+
+        json_string = json.dumps(result.data)
+        assert "\x00" not in json_string
+        assert "\x08" not in json_string
+        assert "\x0C" not in json_string
+        assert "TestUser" in json_string  # Name preserved minus null byte
+
+    def test_deeply_nested_pydantic_models_control_char_sanitization(self):
+        """Test that control characters are sanitized in deeply nested Pydantic models."""
+
+        # Create nested Pydantic models with control characters at different levels
+        class InnerModel(BaseModel):
+            deep_string: str
+            value: int = 42
+            metadata: dict = {}
+
+        class MiddleModel(BaseModel):
+            middle_string: str
+            inner: InnerModel
+            data: str
+
+        class OuterModel(BaseModel):
+            outer_string: str
+            middle: MiddleModel
+
+        # Create test data with control characters at every nesting level
+        inner = InnerModel(
+            deep_string="Deepest\x00Level\x08Control\x0CChars",  # Multiple control chars at deepest level
+            metadata={
+                "nested_key": "Nested\x1FValue\x7FDelete"
+            },  # Control chars in nested dict
+        )
+
+        middle = MiddleModel(
+            middle_string="Middle\x01StartOfHeading\x1FUnitSeparator",
+            inner=inner,
+            data="Some\x0BVerticalTab\x0EShiftOut",
+        )
+
+        outer = OuterModel(outer_string="Outer\x00Null\x07Bell", middle=middle)
+
+        # Wrap in a dict with additional control characters
+        data = {
+            "top_level": "Top\x00Level\x08Backspace",
+            "nested_model": outer,
+            "list_with_strings": [
+                "List\x00Item1",
+                "List\x0CItem2\x1F",
+                {"dict_in_list": "Dict\x08Value"},
+            ],
+        }
+
+        # Process with SafeJson
+        result = SafeJson(data)
+        assert isinstance(result, Json)
+
+        # Verify all control characters are removed at every level
+        import json
+
+        json_string = json.dumps(result.data)
+
+        # Check that NO control characters remain anywhere
+        control_chars = [
+            "\x00",
+            "\x01",
+            "\x02",
+            "\x03",
+            "\x04",
+            "\x05",
+            "\x06",
+            "\x07",
+            "\x08",
+            "\x0B",
+            "\x0C",
+            "\x0E",
+            "\x0F",
+            "\x10",
+            "\x11",
+            "\x12",
+            "\x13",
+            "\x14",
+            "\x15",
+            "\x16",
+            "\x17",
+            "\x18",
+            "\x19",
+            "\x1A",
+            "\x1B",
+            "\x1C",
+            "\x1D",
+            "\x1E",
+            "\x1F",
+            "\x7F",
+        ]
+
+        for char in control_chars:
+            assert (
+                char not in json_string
+            ), f"Control character {repr(char)} found in result"
+
+        # Verify specific sanitized content is present (control chars removed but text preserved)
+        result_data = cast(dict[str, Any], result.data)
+
+        # Top level
+        assert "TopLevelBackspace" in json_string
+
+        # Outer model level
+        assert "OuterNullBell" in json_string
+
+        # Middle model level
+        assert "MiddleStartOfHeadingUnitSeparator" in json_string
+        assert "SomeVerticalTabShiftOut" in json_string
+
+        # Inner model level (deepest nesting)
+        assert "DeepestLevelControlChars" in json_string
+
+        # Nested dict in model
+        assert "NestedValueDelete" in json_string
+
+        # List items
+        assert "ListItem1" in json_string
+        assert "ListItem2" in json_string
+        assert "DictValue" in json_string
+
+        # Verify structure is preserved (not just converted to string)
+        assert isinstance(result_data, dict)
+        assert isinstance(result_data["nested_model"], dict)
+        assert isinstance(result_data["nested_model"]["middle"], dict)
+        assert isinstance(result_data["nested_model"]["middle"]["inner"], dict)
+        assert isinstance(result_data["list_with_strings"], list)
+
+        # Verify specific deep values are accessible and sanitized
+        nested_model = cast(dict[str, Any], result_data["nested_model"])
+        middle = cast(dict[str, Any], nested_model["middle"])
+        inner = cast(dict[str, Any], middle["inner"])
+
+        deep_string = inner["deep_string"]
+        assert deep_string == "DeepestLevelControlChars"
+
+        metadata = cast(dict[str, Any], inner["metadata"])
+        nested_metadata = metadata["nested_key"]
+        assert nested_metadata == "NestedValueDelete"