memory-neo4j: improve tag coverage with stronger extraction + retroactive tagging

- Strengthen extraction prompt to always generate 2-4 tags per memory
- Add Phase 2b: Retroactive Tagging to sleep cycle for untagged memories
- Include 'skipped' memories in extraction pipeline (imported memories)
- Add listUntaggedMemories() helper to neo4j-client
- Add extractTagsOnly() lightweight prompt for tag-only extraction
- Add CLI display for Phase 2b stats

Fixes: 79% of memories had zero tags due to weak prompt guidance and imported memories never going through extraction.
2026-04-03 03:03:24 -04:00 · 2026-02-16 15:49:19 +08:00
parent f093be7b3a
commit 18b8007d23
4 changed files with 232 additions and 5 deletions
--- a/extensions/memory-neo4j/cli.ts
+++ b/extensions/memory-neo4j/cli.ts
@@ -315,6 +315,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
            console.log("  Phase 1c:  Conflict Detection  — Resolve contradictory memories");
            console.log("  Phase 1d:  Entity Dedup        — Merge duplicate entity nodes");
            console.log("  Phase 2:   Extraction          — Extract entities and categorize");
+            console.log("  Phase 2b:  Retroactive Tagging — Tag memories missing topic tags");
            console.log("  Phase 3:   Decay & Pruning     — Remove stale low-importance memories");
            console.log("  Phase 4:   Orphan Cleanup      — Remove disconnected nodes");
            console.log("  Phase 5:   Noise Cleanup       — Remove dangerous pattern memories");
@@ -399,6 +400,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
                    conflict: "Phase 1c: Conflict Detection",
                    entityDedup: "Phase 1d: Entity Deduplication",
                    extraction: "Phase 2: Extraction",
+                    retroactiveTagging: "Phase 2b: Retroactive Tagging",
                    decay: "Phase 3: Decay & Pruning",
                    cleanup: "Phase 4: Orphan Cleanup",
                    noiseCleanup: "Phase 5: Noise Cleanup",
@@ -430,6 +432,12 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
                `   Extraction:     ${result.extraction.succeeded}/${result.extraction.total} extracted` +
                  (result.extraction.failed > 0 ? ` (${result.extraction.failed} failed)` : ""),
              );
+              console.log(
+                `   Retro-Tagging:  ${result.retroactiveTagging.tagged}/${result.retroactiveTagging.total} tagged` +
+                  (result.retroactiveTagging.failed > 0
+                    ? ` (${result.retroactiveTagging.failed} failed)`
+                    : ""),
+              );
              console.log(
                `   Cleanup:        ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} tags removed`,
              );
--- a/extensions/memory-neo4j/extractor.ts
+++ b/extensions/memory-neo4j/extractor.ts
@@ -53,9 +53,40 @@ Rules:
 - Good entities: "Tarun", "Abundent Academy", "Tioman Island", "LiveKit", "Neo4j", "Fish Speech S1 Mini"
 - Bad entities: "python", "ai", "automation", "email", "docker", "machine learning", "api"
 - When in doubt, do NOT extract — fewer high-quality entities beat many generic ones
- Return empty arrays if nothing specific to extract
 - Keep entity descriptions brief (1 sentence max)
- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous`;
+- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous
+- ALWAYS generate at least 2 tags. Every memory has a topic — there are no exceptions.
+- Tags describe the TOPIC or DOMAIN of the memory, not the entities themselves.
+- Do NOT use entity names as tags (e.g., don't tag "tarun" if Tarun is already an entity).
+- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration"
+- Tag categories: "topic", "domain", "workflow", "technology", "personal", "business"
+- Return empty entity/relationship arrays if nothing specific to extract, but NEVER return empty tags.`;
+
+// ============================================================================
+// Retroactive Tagging Prompt
+// ============================================================================
+
+/**
+ * Lightweight prompt for retroactive tagging of memories that were extracted
+ * without tags. Only asks for tags — no entities or relationships.
+ *
+ * Sent as the "system" message by extractTagsOnly(), with the memory text as
+ * the "user" message. The reply is expected to be a JSON object holding a
+ * "tags" array of { name, category } items.
+ *
+ * The prompt asks the model to normalize tag names, but extractTagsOnly()
+ * still runs every returned name through normalizeTagName(), so the
+ * normalization rule below is guidance for the model rather than a guarantee.
+ */
+const RETROACTIVE_TAGGING_SYSTEM = `You are a topic tagging system for a personal memory store.
+Generate 2-4 topic tags that describe what this memory is about.
+
+Return JSON:
+{
+  "tags": [
+    {"name": "tag name", "category": "topic|domain|workflow|technology|personal|business"}
+  ]
+}
+
+Rules:
+- Tags describe the TOPIC or DOMAIN of the memory, not specific people or tools mentioned.
+- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration", "system configuration", "memory management"
+- Bad tags: names of people, companies, or specific tools (those are entities, not topics)
+- Tag categories: "topic" (general subject), "domain" (field/area), "workflow" (process/procedure), "technology" (tech area), "personal" (personal life), "business" (work/business)
+- ALWAYS return at least 2 tags. Every memory has a topic.
+- Normalize tag names to lowercase with spaces (no hyphens or underscores).`;

 // ============================================================================
 // Entity Extraction
@@ -118,6 +149,57 @@ export async function extractEntities(
  }
 }

+/**
+ * Extract only tags from a memory text using a lightweight LLM prompt.
+ * Used for retroactive tagging of memories that were extracted without tags.
+ *
+ * The model reply is treated as untrusted input:
+ * - items without a string `name` are dropped;
+ * - names are passed through normalizeTagName() and empty results are dropped;
+ * - names that collapse to the same normalized form are returned only once
+ *   (first occurrence wins);
+ * - `category` is lowercased/trimmed and replaced by "topic" unless it is one
+ *   of the six categories the tagging prompt defines.
+ *
+ * @param text - Memory text, sent as the user message.
+ * @param config - Extraction config; no request is made when `config.enabled` is false.
+ * @param abortSignal - Optional signal forwarded to the LLM call. If it is
+ *   already aborted, no request is issued.
+ * @returns The cleaned tag list (empty for blank text or a reply without a
+ *   usable `tags` array), or null when extraction is disabled, the call was
+ *   aborted or failed, or the reply is not a JSON object.
+ */
+export async function extractTagsOnly(
+  text: string,
+  config: ExtractionConfig,
+  abortSignal?: AbortSignal,
+): Promise<Array<{ name: string; category: string }> | null> {
+  if (!config.enabled || abortSignal?.aborted) {
+    return null;
+  }
+
+  // Blank text has no topic. Skip the LLM call: the prompt instructs the model
+  // to ALWAYS return tags, so it would have to invent them.
+  if (!text || text.trim().length === 0) {
+    return [];
+  }
+
+  // Closed set of categories listed in the tagging prompt.
+  const allowedCategories = new Set([
+    "topic",
+    "domain",
+    "workflow",
+    "technology",
+    "personal",
+    "business",
+  ]);
+
+  const messages = [
+    { role: "system", content: RETROACTIVE_TAGGING_SYSTEM },
+    { role: "user", content: text },
+  ];
+
+  let content: string | null;
+  try {
+    content = await callOpenRouterStream(config, messages, abortSignal);
+  } catch {
+    // Best-effort by contract: request failures are reported as null.
+    return null;
+  }
+
+  if (!content) {
+    return null;
+  }
+
+  try {
+    const parsed: unknown = JSON.parse(content);
+    if (parsed === null || typeof parsed !== "object") {
+      return null;
+    }
+
+    const rawTags = (parsed as { tags?: unknown }).tags;
+    if (!Array.isArray(rawTags)) {
+      return [];
+    }
+
+    const seenNames = new Set<string>();
+    const tags: Array<{ name: string; category: string }> = [];
+
+    for (const raw of rawTags as unknown[]) {
+      if (raw === null || typeof raw !== "object") {
+        continue;
+      }
+      const { name, category } = raw as Record<string, unknown>;
+      if (typeof name !== "string") {
+        continue;
+      }
+
+      const normalizedName = normalizeTagName(name);
+      // "machine-learning" and "machine learning" normalize to the same tag;
+      // keep only the first so callers do not write or count it twice.
+      if (normalizedName.length === 0 || seenNames.has(normalizedName)) {
+        continue;
+      }
+      seenNames.add(normalizedName);
+
+      // Models sometimes echo the schema placeholder or change the casing;
+      // anything outside the known set falls back to the default category.
+      const normalizedCategory = typeof category === "string" ? category.trim().toLowerCase() : "";
+      tags.push({
+        name: normalizedName,
+        category: allowedCategories.has(normalizedCategory) ? normalizedCategory : "topic",
+      });
+    }
+
+    return tags;
+  } catch {
+    // Reply was not valid JSON.
+    return null;
+  }
+}
+
 /**
 * Normalize a tag name: lowercase, collapse hyphens/underscores to spaces,
 * collapse multiple spaces, trim. Ensures "machine-learning", "machine_learning",
--- a/extensions/memory-neo4j/neo4j-client.ts
+++ b/extensions/memory-neo4j/neo4j-client.ts
@@ -915,7 +915,7 @@ export class Neo4jMemoryClient {
      const agentFilter = agentId ? "AND m.agentId = $agentId" : "";
      const result = await session.run(
        `MATCH (m:Memory)
-         WHERE m.extractionStatus = 'pending' ${agentFilter}
+         WHERE m.extractionStatus IN ['pending', 'skipped'] ${agentFilter}
         RETURN m.id AS id, m.text AS text, m.agentId AS agentId,
                coalesce(m.extractionRetries, 0) AS extractionRetries
         ORDER BY m.createdAt ASC
@@ -967,6 +967,36 @@ export class Neo4jMemoryClient {
    }
  }

+  /**
+   * Fetch memories whose extraction status is 'complete' but which have no
+   * outgoing TAGGED relationship to a Tag node, oldest first.
+   * Called by the retroactive tagging phase to find memories that need tags.
+   *
+   * @param limit - Maximum number of rows to return (default 50).
+   * @param agentId - When provided, only that agent's memories are matched.
+   * @returns `{ id, text }` pairs ordered by `createdAt` ascending.
+   */
+  async listUntaggedMemories(
+    limit: number = 50,
+    agentId?: string,
+  ): Promise<Array<{ id: string; text: string }>> {
+    await this.ensureInitialized();
+    const session = this.driver!.session();
+    try {
+      // Build the optional agent clause and its parameter together.
+      const queryParams: Record<string, unknown> = { limit: neo4j.int(limit) };
+      let agentFilter = "";
+      if (agentId) {
+        agentFilter = "AND m.agentId = $agentId";
+        queryParams.agentId = agentId;
+      }
+
+      const { records } = await session.run(
+        `MATCH (m:Memory)
+         WHERE m.extractionStatus = 'complete' ${agentFilter}
+           AND NOT EXISTS { MATCH (m)-[:TAGGED]->(:Tag) }
+         RETURN m.id AS id, m.text AS text
+         ORDER BY m.createdAt ASC
+         LIMIT $limit`,
+        queryParams,
+      );
+
+      const untagged: Array<{ id: string; text: string }> = [];
+      for (const record of records) {
+        untagged.push({
+          id: record.get("id") as string,
+          text: record.get("text") as string,
+        });
+      }
+      return untagged;
+    } finally {
+      await session.close();
+    }
+  }
+
  // --------------------------------------------------------------------------
  // Sleep Cycle: Deduplication
  // --------------------------------------------------------------------------
--- a/extensions/memory-neo4j/sleep-cycle.ts
+++ b/extensions/memory-neo4j/sleep-cycle.ts
@@ -23,7 +23,12 @@ import type { ExtractionConfig } from "./config.js";
 import type { Embeddings } from "./embeddings.js";
 import type { Neo4jMemoryClient } from "./neo4j-client.js";
 import type { Logger } from "./schema.js";
-import { isSemanticDuplicate, resolveConflict, runBackgroundExtraction } from "./extractor.js";
+import {
+  extractTagsOnly,
+  isSemanticDuplicate,
+  resolveConflict,
+  runBackgroundExtraction,
+} from "./extractor.js";
 import { makePairKey } from "./schema.js";
 import { reviewAndArchiveStaleTasks, type StaleTaskResult } from "./task-ledger.js";

@@ -59,6 +64,12 @@ export type SleepCycleResult = {
    succeeded: number;
    failed: number;
  };
+  // Phase 2b: Retroactive Tagging
+  retroactiveTagging: {
+    total: number;
+    tagged: number;
+    failed: number;
+  };
  // Phase 3: Decay & Pruning
  decay: {
    memoriesPruned: number;
@@ -105,6 +116,10 @@ export type SleepCycleOptions = {
  extractionBatchSize?: number; // Memories per batch (default: 50)
  extractionDelayMs?: number; // Delay between batches (default: 1000)

+  // Phase 2b: Retroactive Tagging
+  skipRetroactiveTagging?: boolean; // Skip retroactive tagging (default: false)
+  retroactiveTagBatchSize?: number; // Memories per batch (default: 50)
+
  // Phase 4: Cleanup
  singleUseTagMinAgeDays?: number; // Min age before single-use tag pruning (default: 14)

@@ -127,6 +142,7 @@ export type SleepCycleOptions = {
      | "entityDedup"
      | "decay"
      | "extraction"
+      | "retroactiveTagging"
      | "cleanup"
      | "noiseCleanup"
      | "credentialScan"
@@ -225,6 +241,8 @@ export async function runSleepCycle(
    decayCurves,
    extractionBatchSize = 50,
    extractionDelayMs = 1000,
+    skipRetroactiveTagging = false,
+    retroactiveTagBatchSize = 50,
    singleUseTagMinAgeDays = 14,
    workspaceDir,
    staleTaskMaxAgeMs,
@@ -239,6 +257,7 @@ export async function runSleepCycle(
    entityDedup: { pairsFound: 0, merged: 0 },
    decay: { memoriesPruned: 0 },
    extraction: { total: 0, processed: 0, succeeded: 0, failed: 0 },
+    retroactiveTagging: { total: 0, tagged: 0, failed: 0 },
    cleanup: { entitiesRemoved: 0, tagsRemoved: 0, singleUseTagsRemoved: 0 },
    credentialScan: { memoriesScanned: 0, credentialsFound: 0, memoriesRemoved: 0 },
    taskLedger: { staleCount: 0, archivedCount: 0, archivedIds: [] },
@@ -541,7 +560,7 @@ export async function runSleepCycle(
    try {
      // Get initial count
      const counts = await db.countByExtractionStatus(agentId);
-      result.extraction.total = counts.pending;
+      result.extraction.total = counts.pending + counts.skipped;

      if (result.extraction.total > 0) {
        let hasMore = true;
@@ -616,6 +635,94 @@ export async function runSleepCycle(
    logger.info("memory-neo4j: [sleep] Phase 2 skipped — extraction not enabled");
  }

+  // --------------------------------------------------------------------------
+  // Phase 2b: Retroactive Tagging
+  // Find memories with completed extraction but no tags, and generate tags
+  // using a lightweight LLM prompt. This fixes the historical gap where
+  // the extraction prompt treated tags as optional.
+  //
+  // A memory whose tagging fails stays untagged, so listUntaggedMemories()
+  // keeps returning it. To guarantee termination, every memory is attempted
+  // at most once per cycle; failures are retried on the next sleep cycle.
+  // --------------------------------------------------------------------------
+  if (!abortSignal?.aborted && config.enabled && !skipRetroactiveTagging) {
+    onPhaseStart?.("retroactiveTagging");
+    logger.info("memory-neo4j: [sleep] Phase 2b: Retroactive Tagging");
+
+    try {
+      // Ids already attempted in this cycle (tagged or failed).
+      const attemptedIds = new Set<string>();
+      // A zero/negative/NaN concurrency would stall the chunk loop below.
+      const chunkSize = Math.max(1, Math.floor(llmConcurrency) || 1);
+      let hasMore = true;
+
+      while (hasMore && !abortSignal?.aborted) {
+        // Failed memories are still untagged and sort first (oldest first), so
+        // widen the fetch by the failure count; otherwise they would crowd
+        // not-yet-attempted memories out of the batch.
+        const fetchLimit = retroactiveTagBatchSize + result.retroactiveTagging.failed;
+        const untagged = await db.listUntaggedMemories(fetchLimit, agentId);
+        const fresh = untagged.filter((memory) => !attemptedIds.has(memory.id));
+
+        if (fresh.length === 0) {
+          // Nothing left except memories that already failed this cycle.
+          break;
+        }
+
+        // Process in parallel chunks of llmConcurrency
+        for (let i = 0; i < fresh.length && !abortSignal?.aborted; i += chunkSize) {
+          const chunk = fresh.slice(i, i + chunkSize);
+          for (const memory of chunk) {
+            attemptedIds.add(memory.id);
+          }
+          // total counts memories actually attempted, so tagged + failed === total.
+          result.retroactiveTagging.total += chunk.length;
+
+          const outcomes = await Promise.allSettled(
+            chunk.map((memory) => extractTagsOnly(memory.text, config, abortSignal)),
+          );
+
+          for (let k = 0; k < outcomes.length; k++) {
+            const outcome = outcomes[k];
+            const memory = chunk[k];
+
+            if (outcome.status === "fulfilled" && outcome.value && outcome.value.length > 0) {
+              try {
+                await db.batchEntityOperations(memory.id, [], [], outcome.value);
+                result.retroactiveTagging.tagged++;
+                onProgress?.(
+                  "retroactiveTagging",
+                  `Tagged "${memory.text.slice(0, 50)}..." with ${outcome.value.length} tags`,
+                );
+              } catch (err) {
+                result.retroactiveTagging.failed++;
+                logger.warn(
+                  `memory-neo4j: [sleep] retroactive tagging write failed for ${memory.id.slice(0, 8)}: ${String(err)}`,
+                );
+              }
+            } else {
+              result.retroactiveTagging.failed++;
+            }
+          }
+        }
+
+        // A short page means the query returned every untagged memory there
+        // is, so no further fetch (and no extra probe query) is needed.
+        hasMore = untagged.length >= fetchLimit;
+
+        // Delay between batches (abort-aware)
+        if (hasMore && !abortSignal?.aborted) {
+          await new Promise<void>((resolve) => {
+            const onAbort = () => {
+              clearTimeout(timer);
+              resolve();
+            };
+            const timer = setTimeout(() => {
+              // Detach the listener so one does not pile up per batch.
+              abortSignal?.removeEventListener("abort", onAbort);
+              resolve();
+            }, extractionDelayMs);
+            abortSignal?.addEventListener("abort", onAbort, { once: true });
+          });
+        }
+      }
+
+      logger.info(
+        `memory-neo4j: [sleep] Phase 2b complete — ${result.retroactiveTagging.tagged} tagged, ${result.retroactiveTagging.failed} failed`,
+      );
+    } catch (err) {
+      logger.warn(`memory-neo4j: [sleep] Phase 2b error: ${String(err)}`);
+    }
+  } else if (!config.enabled) {
+    logger.info("memory-neo4j: [sleep] Phase 2b skipped — extraction not enabled");
+  } else if (skipRetroactiveTagging) {
+    logger.info("memory-neo4j: [sleep] Phase 2b skipped — retroactive tagging disabled");
+  }
+
  // --------------------------------------------------------------------------
  // Phase 3: Decay & Pruning (after extraction so freshly extracted memories
  // aren't pruned before they build entity connections)