Merge branch 'dev' into swiftyos/secrt-1706-improve-store-search

This commit is contained in:
Swifty
2025-10-17 11:40:34 +02:00
committed by GitHub
3 changed files with 311 additions and 64 deletions

View File

@@ -246,7 +246,7 @@ async def execute_node(
async for output_name, output_data in node_block.execute(
input_data, **extra_exec_kwargs
):
output_data = json.convert_pydantic_to_json(output_data)
output_data = json.to_dict(output_data)
output_size += len(json.dumps(output_data))
log_metadata.debug("Node produced output", **{output_name: output_data})
yield output_name, output_data

View File

@@ -1,25 +1,22 @@
import logging
import re
from typing import Any, Type, TypeGuard, TypeVar, overload
from typing import Any, Type, TypeVar, overload
import jsonschema
import orjson
from fastapi.encoders import jsonable_encoder
from fastapi.encoders import jsonable_encoder as to_dict
from prisma import Json
from pydantic import BaseModel
from .truncate import truncate
from .type import type_match
logger = logging.getLogger(__name__)
# Precompiled regex to remove PostgreSQL-incompatible control characters
# Removes \u0000-\u0008, \u000B-\u000C, \u000E-\u001F, \u007F (keeps tab \u0009, newline \u000A, carriage return \u000D)
POSTGRES_CONTROL_CHARS = re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]")
def to_dict(data) -> dict:
    """Convert ``data`` into JSON-compatible builtin types.

    A Pydantic ``BaseModel`` is dumped with ``model_dump()`` first; any other
    value is handed to ``jsonable_encoder`` unchanged.

    Args:
        data: A Pydantic model or any value ``jsonable_encoder`` accepts.

    Returns:
        Whatever ``jsonable_encoder`` produces for the (possibly dumped) input.
    """
    payload = data.model_dump() if isinstance(data, BaseModel) else data
    return jsonable_encoder(payload)
def dumps(
data: Any, *args: Any, indent: int | None = None, option: int = 0, **kwargs: Any
) -> str:
@@ -108,64 +105,19 @@ def validate_with_jsonschema(
return str(e)
def is_list_of_basemodels(value: object) -> TypeGuard[list[BaseModel]]:
    """Tell whether ``value`` is a list whose items are all Pydantic models.

    An empty list counts as a match, because there is no item that fails the check.

    Args:
        value: Any object.

    Returns:
        True when ``value`` is a ``list`` and every item is a ``BaseModel`` instance.
    """
    if not isinstance(value, list):
        return False
    for item in value:
        if not isinstance(item, BaseModel):
            return False
    return True
def convert_pydantic_to_json(output_data: Any) -> Any:
    """Dump top-level Pydantic models to plain dicts; pass everything else through.

    Only two shapes are converted: a single ``BaseModel`` and a list made up
    entirely of ``BaseModel`` items. Models nested inside other containers are
    not touched.

    Args:
        output_data: The value to convert.

    Returns:
        A dict, a list of dicts, or ``output_data`` itself when no conversion applies.
    """
    if isinstance(output_data, BaseModel):
        converted = output_data.model_dump()
    elif is_list_of_basemodels(output_data):
        converted = [model.model_dump() for model in output_data]
    else:
        converted = output_data
    return converted
def _sanitize_value(value: Any) -> Any:
    """
    Strip PostgreSQL-incompatible control characters from every string in ``value``.

    The structure is walked recursively:
    - str: characters matching POSTGRES_CONTROL_CHARS are removed
    - dict: both keys and values are sanitized
    - list / tuple: each element is sanitized, and the result is a list / tuple
    - anything else (int, float, bool, None, ...): returned untouched

    Args:
        value: The value to sanitize

    Returns:
        A sanitized copy for strings and containers, otherwise ``value`` itself
    """
    if isinstance(value, str):
        return POSTGRES_CONTROL_CHARS.sub("", value)

    if isinstance(value, dict):
        cleaned = {}
        for raw_key, raw_item in value.items():
            # Key first, then value, mirroring dict-display evaluation order.
            clean_key = _sanitize_value(raw_key)
            cleaned[clean_key] = _sanitize_value(raw_item)
        return cleaned

    if isinstance(value, list):
        return list(map(_sanitize_value, value))

    if isinstance(value, tuple):
        return tuple(map(_sanitize_value, value))

    # Non-string scalars and unknown types carry no text to clean.
    return value
def SafeJson(data: Any) -> Json:
"""
Safely serialize data and return Prisma's Json type.
Sanitizes control characters to prevent PostgreSQL 22P05 errors.
This function:
1. Converts Pydantic models to dicts
1. Converts Pydantic models to dicts (recursively using to_dict)
2. Recursively removes PostgreSQL-incompatible control characters from strings
3. Returns a Prisma Json object safe for database storage
Uses to_dict (jsonable_encoder) with a custom encoder to handle both Pydantic
conversion and control character sanitization in a two-pass approach.
Args:
data: Input data to sanitize and convert to Json
@@ -177,9 +129,29 @@ def SafeJson(data: Any) -> Json:
>>> SafeJson({"path": "C:\\\\temp"}) # backslashes preserved
>>> SafeJson({"data": "Text\\\\u0000here"}) # literal backslash-u preserved
"""
# Convert Pydantic models to dict first
if isinstance(data, BaseModel):
data = data.model_dump(exclude_none=True)
# Return as Prisma Json type
return Json(_sanitize_value(data))
def _sanitize_string(value: str) -> str:
"""Remove PostgreSQL-incompatible control characters from string."""
return POSTGRES_CONTROL_CHARS.sub("", value)
try:
# Use two-pass approach for consistent string sanitization:
# 1. First convert to basic JSON-serializable types (handles Pydantic models)
# 2. Then sanitize strings in the result
basic_result = to_dict(data)
sanitized_result = to_dict(basic_result, custom_encoder={str: _sanitize_string})
return Json(sanitized_result)
except Exception as e:
# Log the failure and fall back to string representation
logger.error(
"SafeJson fallback to string representation due to serialization error: %s (%s). "
"Data type: %s, Data preview: %s",
type(e).__name__,
truncate(str(e), 200),
type(data).__name__,
truncate(str(data), 100),
)
# Ultimate fallback: convert to string representation and sanitize
sanitized = _sanitize_string(str(data))
return Json(sanitized)

View File

@@ -479,3 +479,278 @@ class TestSafeJson:
# And can be parsed back
parsed_back = json.loads(json_string)
assert isinstance(parsed_back, dict)
def test_dict_containing_pydantic_models(self):
    """Check that SafeJson serializes a dict whose values are Pydantic models."""
    import json

    # Regression case for the bug from PR #11187 where credential_inputs failed.
    alice = SamplePydanticModel(name="Alice", age=30)
    bob = SamplePydanticModel(name="Bob", age=25)
    payload = {"user1": alice, "user2": bob, "regular_data": "test"}

    wrapped = SafeJson(payload)
    assert isinstance(wrapped, Json)

    # The wrapped data has to be serializable by the stdlib encoder (this was the bug).
    serialized = json.dumps(wrapped.data)
    for expected_name in ("Alice", "Bob"):
        assert expected_name in serialized
def test_nested_pydantic_in_dict(self):
    """Check SafeJson on a model buried two dict levels down that itself holds a model."""
    import json

    innermost = SamplePydanticModel(name="Inner", age=20)
    holder = SamplePydanticModel(
        name="Middle", age=30, metadata={"inner": innermost}
    )
    second_level = {"model": holder, "other": "data"}
    payload = {"level1": {"level2": second_level}}

    wrapped = SafeJson(payload)
    assert isinstance(wrapped, Json)

    # Both the outer and the inner model must show up in the serialized text.
    serialized = json.dumps(wrapped.data)
    for expected_name in ("Middle", "Inner"):
        assert expected_name in serialized
def test_list_containing_pydantic_models_in_dict(self):
    """Check SafeJson on a dict holding a list of Pydantic models."""
    import json

    users = []
    for i in range(5):
        users.append(SamplePydanticModel(name=f"User{i}", age=20 + i))
    payload = {"users": users, "count": len(users)}

    wrapped = SafeJson(payload)
    assert isinstance(wrapped, Json)

    # Spot-check the first and last generated users.
    serialized = json.dumps(wrapped.data)
    for expected_name in ("User0", "User4"):
        assert expected_name in serialized
def test_credentials_meta_input_scenario(self):
    """Reproduce the create_graph_execution case that was failing."""
    import json

    # Stand-in with the same shape as CredentialsMetaInput.
    class MockCredentialsMetaInput(BaseModel):
        id: str
        title: Optional[str] = None
        provider: str
        type: str

    github_credentials = MockCredentialsMetaInput(
        id="test-123", title="Test Credentials", provider="github", type="oauth2"
    )

    # credential_inputs maps a name to a model, as in create_graph_execution;
    # wrapping it must not raise TypeError.
    wrapped = SafeJson({"github_creds": github_credentials})
    assert isinstance(wrapped, Json)

    # The result has to be serializable and keep the field values.
    serialized = json.dumps(wrapped.data)
    for expected_fragment in ("test-123", "github", "oauth2"):
        assert expected_fragment in serialized
def test_mixed_pydantic_and_primitives(self):
    """Check SafeJson on a structure mixing Pydantic models with plain values."""
    import json

    shared_model = SamplePydanticModel(name="Test", age=25)
    mixed_list = [shared_model, {"plain": "dict"}, "string", 123]
    nested_section = {
        "model": shared_model,
        "list": [1, 2, shared_model, 4],
        "plain": "text",
    }
    payload = {
        "models": mixed_list,
        "nested": nested_section,
        "plain_list": [1, 2, 3],
    }

    wrapped = SafeJson(payload)
    assert isinstance(wrapped, Json)

    # Model content and plain content must both survive serialization.
    serialized = json.dumps(wrapped.data)
    for expected_fragment in ("Test", "plain"):
        assert expected_fragment in serialized
def test_pydantic_model_with_control_chars_in_dict(self):
    """Test Pydantic model with control chars when nested in dict.

    json.dumps writes control characters below U+0020 as escape sequences,
    so searching the dumped text for the raw character alone can never fail.
    The absence checks therefore also look for the escaped spelling, and the
    surrounding text is asserted positively for both sanitized fields.
    """
    import json

    model = SamplePydanticModel(
        name="Test\x00User",  # Has null byte
        age=30,
        metadata={"info": "data\x08with\x0Ccontrols"},
    )
    data = {"credential": model}

    result = SafeJson(data)
    assert isinstance(result, Json)

    # Verify control characters are removed, in raw and JSON-escaped form
    json_string = json.dumps(result.data)
    for char in ("\x00", "\x08", "\x0C"):
        # json.dumps(char) yields the quoted escape; strip the quotes.
        escaped = json.dumps(char)[1:-1]
        assert char not in json_string
        assert (
            escaped not in json_string
        ), f"Control character {repr(char)} found in result"

    assert "TestUser" in json_string  # Name preserved minus null byte
    assert "datawithcontrols" in json_string  # Metadata preserved minus controls
def test_deeply_nested_pydantic_models_control_char_sanitization(self):
    """Test that control characters are sanitized in deeply nested Pydantic models.

    The absence check walks the strings actually stored in ``result.data``.
    Checking the ``json.dumps`` text instead would be vacuous for characters
    below U+0020, because the encoder writes those as escape sequences and
    the raw character never appears in its output.
    """
    import json

    # Nested Pydantic models, each level carrying its own string fields
    class InnerModel(BaseModel):
        deep_string: str
        value: int = 42
        metadata: dict = {}

    class MiddleModel(BaseModel):
        middle_string: str
        inner: InnerModel
        data: str

    class OuterModel(BaseModel):
        outer_string: str
        middle: MiddleModel

    def iter_strings(node: Any):
        """Yield every string (dict keys included) found anywhere inside node."""
        if isinstance(node, str):
            yield node
        elif isinstance(node, dict):
            for key, item in node.items():
                yield from iter_strings(key)
                yield from iter_strings(item)
        elif isinstance(node, (list, tuple)):
            for item in node:
                yield from iter_strings(item)

    # Create test data with control characters at every nesting level
    inner = InnerModel(
        deep_string="Deepest\x00Level\x08Control\x0CChars",  # Multiple control chars at deepest level
        metadata={
            "nested_key": "Nested\x1FValue\x7FDelete"
        },  # Control chars in nested dict
    )
    middle = MiddleModel(
        middle_string="Middle\x01StartOfHeading\x1FUnitSeparator",
        inner=inner,
        data="Some\x0BVerticalTab\x0EShiftOut",
    )
    outer = OuterModel(outer_string="Outer\x00Null\x07Bell", middle=middle)

    # Wrap in a dict with additional control characters
    data = {
        "top_level": "Top\x00Level\x08Backspace",
        "nested_model": outer,
        "list_with_strings": [
            "List\x00Item1",
            "List\x0CItem2\x1F",
            {"dict_in_list": "Dict\x08Value"},
        ],
    }

    # Process with SafeJson
    result = SafeJson(data)
    assert isinstance(result, Json)

    # Same 30 characters the original list spelled out:
    # U+0000-U+0008, U+000B, U+000C, U+000E-U+001F and U+007F.
    control_chars = [
        chr(code)
        for code in (*range(0x00, 0x09), 0x0B, 0x0C, *range(0x0E, 0x20), 0x7F)
    ]

    # Check that NO control characters remain in any stored string
    for text in iter_strings(result.data):
        for char in control_chars:
            assert (
                char not in text
            ), f"Control character {repr(char)} found in result"

    # Verify specific sanitized content is present (control chars removed but text preserved)
    json_string = json.dumps(result.data)
    result_data = cast(dict[str, Any], result.data)

    # Top level
    assert "TopLevelBackspace" in json_string

    # Outer model level
    assert "OuterNullBell" in json_string

    # Middle model level
    assert "MiddleStartOfHeadingUnitSeparator" in json_string
    assert "SomeVerticalTabShiftOut" in json_string

    # Inner model level (deepest nesting)
    assert "DeepestLevelControlChars" in json_string

    # Nested dict in model
    assert "NestedValueDelete" in json_string

    # List items
    assert "ListItem1" in json_string
    assert "ListItem2" in json_string
    assert "DictValue" in json_string

    # Verify structure is preserved (not just converted to string)
    assert isinstance(result_data, dict)
    assert isinstance(result_data["nested_model"], dict)
    assert isinstance(result_data["nested_model"]["middle"], dict)
    assert isinstance(result_data["nested_model"]["middle"]["inner"], dict)
    assert isinstance(result_data["list_with_strings"], list)

    # Verify specific deep values are accessible and sanitized.
    # Separate names keep the dict views distinct from the model instances above.
    nested_model = cast(dict[str, Any], result_data["nested_model"])
    middle_data = cast(dict[str, Any], nested_model["middle"])
    inner_data = cast(dict[str, Any], middle_data["inner"])

    deep_string = inner_data["deep_string"]
    assert deep_string == "DeepestLevelControlChars"

    metadata = cast(dict[str, Any], inner_data["metadata"])
    nested_metadata = metadata["nested_key"]
    assert nested_metadata == "NestedValueDelete"