Implement loading MemoryItems from file in JSONFileMemory (#4703)

Further changes:
* remove the `init` param from `get_memory()`; replace its usages with `memory.clear()`
* make token length calculation optional in `MemoryItem.dump()`
Authored by Reinier van der Leer on 2023-06-15 17:45:14 +02:00, committed by GitHub
parent 6e6e7fcc9a
commit f0a5250da5
9 changed files with 93 additions and 20 deletions
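
For orientation, a minimal sketch of the call patterns this commit produces (hypothetical snippet: `cfg` is assumed to be a loaded `Config`, `item` an existing `MemoryItem`, and the import path may differ):

```python
from autogpt.memory.vector import get_memory  # import path assumed

# Callers that used to pass init=True now clear explicitly:
memory = get_memory(cfg)  # the `init` param is gone
memory.clear()            # opt-in, only where a fresh index is wanted

# Token counting in dump() is now opt-in, since it costs a tokenizer pass:
print(item.dump())                       # cheap: chunk count only
print(item.dump(calculate_length=True))  # also reports token length
```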

View File

@@ -160,7 +160,7 @@ def ingest_file(
     # TODO: differentiate between different types of files
     file_memory = MemoryItem.from_text_file(content, filename)
-    logger.debug(f"Created memory: {file_memory.dump()}")
+    logger.debug(f"Created memory: {file_memory.dump(True)}")
     memory.add(file_memory)

     logger.info(f"Ingested {len(file_memory.e_chunks)} chunks from {filename}")

View File

@@ -175,7 +175,8 @@ def run_auto_gpt(
     # Initialize memory and make sure it is empty.
     # this is particularly important for indexing and referencing pinecone memory
-    memory = get_memory(cfg, init=True)
+    memory = get_memory(cfg)
+    memory.clear()
     logger.typewriter_log(
         "Using memory of type:", Fore.GREEN, f"{memory.__class__.__name__}"
     )

View File

@@ -39,7 +39,7 @@ supported_memory = ["json_file", "no_memory"]
 # MilvusMemory = None


-def get_memory(cfg: Config, init=False) -> VectorMemory:
+def get_memory(cfg: Config) -> VectorMemory:
     memory = None

     match cfg.memory_backend:
@@ -60,7 +60,7 @@ def get_memory(cfg: Config, init=False) -> VectorMemory:
         #     )
         # else:
         #     memory = PineconeMemory(cfg)
-        #     if init:
+        #     if clear:
         #         memory.clear()
         case "redis":
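
After this change, the factory reduces to roughly the shape below. This is a hedged sketch, not the verbatim file: only the `"json_file"` case is confirmed by the diff, and the fallback branch is an assumption.

```python
def get_memory(cfg: Config) -> VectorMemory:
    match cfg.memory_backend:
        case "json_file":
            return JSONFileMemory(cfg)
        case _:
            # "redis" and the commented-out backends are handled here;
            # assumed fallback for unsupported values (the real module
            # likely warns and picks a default)
            return JSONFileMemory(cfg)
```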

View File

@@ -109,21 +109,21 @@ class MemoryItem:
         # The result_message contains either user feedback
         # or the result of the command specified in ai_message
-        if ai_message["role"] != "assistant":
-            raise ValueError(f"Invalid role on 'ai_message': {ai_message['role']}")
+        if ai_message.role != "assistant":
+            raise ValueError(f"Invalid role on 'ai_message': {ai_message.role}")

         result = (
-            result_message["content"]
-            if result_message["content"].startswith("Command")
+            result_message.content
+            if result_message.content.startswith("Command")
             else "None"
         )
         user_input = (
-            result_message["content"]
-            if result_message["content"].startswith("Human feedback")
+            result_message.content
+            if result_message.content.startswith("Human feedback")
             else "None"
         )
         memory_content = (
-            f"Assistant Reply: {ai_message['content']}"
+            f"Assistant Reply: {ai_message.content}"
             "\n\n"
             f"Result: {result}"
             "\n\n"
@@ -145,11 +145,14 @@ class MemoryItem:
             question_for_summary=question,
         )

-    def dump(self) -> str:
-        token_length = count_string_tokens(self.raw_content, Config().embedding_model)
+    def dump(self, calculate_length=False) -> str:
+        if calculate_length:
+            token_length = count_string_tokens(
+                self.raw_content, Config().embedding_model
+            )
         return f"""
 =============== MemoryItem ===============
-Length: {token_length} tokens in {len(self.e_chunks)} chunks
+Size: {f'{token_length} tokens in ' if calculate_length else ''}{len(self.e_chunks)} chunks
 Metadata: {json.dumps(self.metadata, indent=2)}
 ---------------- SUMMARY -----------------
 {self.summary}
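
`dump()` gained the flag because token counting is the expensive part: it runs a full tokenizer pass over `raw_content`. A sketch of what `count_string_tokens` plausibly does, assuming it wraps tiktoken (the real helper lives elsewhere in the codebase; this is an assumption based on its name and the `embedding_model` argument):

```python
import tiktoken

def count_string_tokens(string: str, model_name: str) -> int:
    # Encode with the tokenizer matching the given model and count the tokens
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(string))
```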
@@ -158,6 +161,31 @@ Metadata: {json.dumps(self.metadata, indent=2)}
 ==========================================
 """

+    def __eq__(self, other: MemoryItem):
+        return (
+            self.raw_content == other.raw_content
+            and self.chunks == other.chunks
+            and self.chunk_summaries == other.chunk_summaries
+            # Embeddings can either be list[float] or np.ndarray[float32],
+            # and for comparison they must be of the same type
+            and np.array_equal(
+                self.e_summary
+                if isinstance(self.e_summary, np.ndarray)
+                else np.array(self.e_summary, dtype=np.float32),
+                other.e_summary
+                if isinstance(other.e_summary, np.ndarray)
+                else np.array(other.e_summary, dtype=np.float32),
+            )
+            and np.array_equal(
+                self.e_chunks
+                if isinstance(self.e_chunks[0], np.ndarray)
+                else [np.array(c, dtype=np.float32) for c in self.e_chunks],
+                other.e_chunks
+                if isinstance(other.e_chunks[0], np.ndarray)
+                else [np.array(c, dtype=np.float32) for c in other.e_chunks],
+            )
+        )
+

 @dataclasses.dataclass
 class MemoryItemRelevance:
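
The float32 casts in `__eq__` are load-bearing: embeddings arrive from the API as `list[float]` (float64 precision) but are stored as `np.float32` arrays, and `np.array_equal` sees those as different values. A small illustration:

```python
import numpy as np

raw = [0.1, 0.2, 0.3]                     # embedding as the API returns it
stored = np.array(raw, dtype=np.float32)  # embedding as MemoryItem stores it

np.array_equal(raw, stored)               # False: float32 rounding != float64 values
np.array_equal(np.array(raw, dtype=np.float32), stored)  # True: same dtype both sides
```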

View File

@@ -32,10 +32,17 @@ class JSONFileMemory(VectorMemoryProvider):
         workspace_path = Path(cfg.workspace_path)
         self.file_path = workspace_path / f"{cfg.memory_index}.json"
         self.file_path.touch()
-        logger.debug(f"Initialized {__name__} with index path {self.file_path}")
+        logger.debug(
+            f"Initialized {__class__.__name__} with index path {self.file_path}"
+        )

         self.memories = []
-        self.save_index()
+        try:
+            self.load_index()
+            logger.debug(f"Loaded {len(self.memories)} MemoryItems from file")
+        except Exception as e:
+            logger.warn(f"Could not load MemoryItems from file: {e}")
+            self.save_index()

     def __iter__(self) -> Iterator[MemoryItem]:
         return iter(self.memories)
@@ -48,6 +55,7 @@ class JSONFileMemory(VectorMemoryProvider):
     def add(self, item: MemoryItem):
         self.memories.append(item)
         logger.debug(f"Adding item to memory: {item.dump()}")
+        self.save_index()
         return len(self.memories)
@@ -62,6 +70,17 @@ class JSONFileMemory(VectorMemoryProvider):
         self.memories.clear()
         self.save_index()

+    def load_index(self):
+        """Loads all memories from the index file"""
+        if not self.file_path.is_file():
+            logger.debug(f"Index file '{self.file_path}' does not exist")
+            return
+        with self.file_path.open("r") as f:
+            logger.debug(f"Loading memories from index file '{self.file_path}'")
+            json_index = orjson.loads(f.read())
+            for memory_item_dict in json_index:
+                self.memories.append(MemoryItem(**memory_item_dict))
+
     def save_index(self):
         logger.debug(f"Saving memory index to file {self.file_path}")
         with self.file_path.open("wb") as f:
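
Taken together, the provider now persists on every `add()` and can rebuild its state from disk. A round-trip sketch (assuming a `Config` with `workspace_path` and `memory_index` set, and some `MemoryItem` instance `item`):

```python
memory = JSONFileMemory(cfg)  # loads an existing index, or resets the file on failure
memory.add(item)              # appends in memory and persists via save_index()

memory.memories = []          # drop the in-memory state...
memory.load_index()           # ...and rebuild it from {memory_index}.json

assert memory.memories[0] == item  # __eq__ normalizes list vs ndarray embeddings
```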

View File

@@ -70,7 +70,9 @@ def main() -> None:
     args = parser.parse_args()

     # Initialize memory
-    memory = get_memory(cfg, init=args.init)
+    memory = get_memory(cfg)
+    if args.init:
+        memory.clear()
     logger.debug("Using memory of type: " + memory.__class__.__name__)

     if args.file:

View File

@@ -94,7 +94,8 @@ def agent(config: Config, workspace: Workspace) -> Agent:
     ai_config.command_registry = command_registry

     config.set_memory_backend("json_file")
-    memory_json_file = get_memory(config, init=True)
+    memory_json_file = get_memory(config)
+    memory_json_file.clear()

     system_prompt = ai_config.construct_full_prompt()

View File

@@ -28,7 +28,9 @@ def memory_json_file(agent_test_config: Config):
     was_memory_backend = agent_test_config.memory_backend

     agent_test_config.set_memory_backend("json_file")
-    yield get_memory(agent_test_config, init=True)
+    memory = get_memory(agent_test_config)
+    memory.clear()
+    yield memory

     agent_test_config.set_memory_backend(was_memory_backend)

View File

@@ -34,7 +34,9 @@ def test_json_memory_init_with_backing_empty_file(config: Config, workspace: Wor
     assert index_file.read_text() == "[]"


-def test_json_memory_init_with_backing_file(config: Config, workspace: Workspace):
+def test_json_memory_init_with_backing_invalid_file(
+    config: Config, workspace: Workspace
+):
     index_file = workspace.root / f"{config.memory_index}.json"
     index_file.touch()
@@ -78,6 +80,24 @@ def test_json_memory_get(config: Config, memory_item: MemoryItem, mock_get_embed
     assert retrieved.memory_item == memory_item


+def test_json_memory_load_index(config: Config, memory_item: MemoryItem):
+    index = JSONFileMemory(config)
+    index.add(memory_item)
+
+    try:
+        assert index.file_path.exists(), "index was not saved to file"
+        assert len(index) == 1, f"index contains {len(index)} items instead of 1"
+        assert index.memories[0] == memory_item, "item in index != added mock item"
+    except AssertionError as e:
+        raise ValueError(f"Setting up for load_index test failed: {e}")
+
+    index.memories = []
+    index.load_index()
+
+    assert len(index) == 1
+    assert index.memories[0] == memory_item
+
+
 @pytest.mark.vcr
 @requires_api_key("OPENAI_API_KEY")
 def test_json_memory_get_relevant(config: Config, patched_api_requestor: None) -> None: