memory-neo4j: single-use tag pruning, alias-based entity dedup, tag normalization

- Add findSingleUseTags() to prune tags with only 1 reference after 14 days
- Enhance findDuplicateEntityPairs() to match on entity aliases
- Add normalizeTagName() to collapse hyphens/underscores to spaces
- Monitor 'other' category accumulation in sleep cycle Phase 2
- Tighten extraction prompt with explicit entity blocklist (80 terms)
- Raise auto-capture threshold from 0.5 to 0.65
- Fix tests for entity dedup phase and skipPromotion default
This commit is contained in:
Tarun Sukhani
2026-02-14 08:42:51 +08:00
parent 08b08c66f1
commit 4d54736b98
5 changed files with 426 additions and 12 deletions

View File

@@ -1770,6 +1770,8 @@ describe("runSleepCycle", () => {
findOrphanTags: vi.fn().mockResolvedValue([]),
deleteOrphanTags: vi.fn().mockResolvedValue(0),
updateExtractionStatus: vi.fn().mockResolvedValue(undefined),
findDuplicateEntityPairs: vi.fn().mockResolvedValue([]),
mergeEntityPair: vi.fn().mockResolvedValue(true),
};
});
@@ -2193,6 +2195,7 @@ describe("runSleepCycle", () => {
mockDb.promoteToCore.mockResolvedValue(1);
const result = await runSleepCycle(mockDb, mockEmbeddings, mockConfig, mockLogger, {
skipPromotion: false,
paretoPercentile: 0.2,
promotionMinAgeDays: 7,
});
@@ -2219,6 +2222,7 @@ describe("runSleepCycle", () => {
mockDb.calculateParetoThreshold.mockReturnValue(0.5);
const result = await runSleepCycle(mockDb, mockEmbeddings, mockConfig, mockLogger, {
skipPromotion: false,
promotionMinAgeDays: 7,
});
@@ -2544,8 +2548,8 @@ describe("runSleepCycle", () => {
expect(onPhaseStart).toHaveBeenCalledWith("dedup");
expect(onPhaseStart).toHaveBeenCalledWith("conflict");
expect(onPhaseStart).toHaveBeenCalledWith("semanticDedup");
expect(onPhaseStart).toHaveBeenCalledWith("entityDedup");
expect(onPhaseStart).toHaveBeenCalledWith("pareto");
expect(onPhaseStart).toHaveBeenCalledWith("promotion");
expect(onPhaseStart).toHaveBeenCalledWith("extraction");
expect(onPhaseStart).toHaveBeenCalledWith("decay");
expect(onPhaseStart).toHaveBeenCalledWith("cleanup");
@@ -2579,6 +2583,7 @@ describe("runSleepCycle", () => {
expect(result).toHaveProperty("dedup");
expect(result).toHaveProperty("conflict");
expect(result).toHaveProperty("semanticDedup");
expect(result).toHaveProperty("entityDedup");
expect(result).toHaveProperty("pareto");
expect(result).toHaveProperty("promotion");
expect(result).toHaveProperty("decay");

View File

@@ -46,8 +46,14 @@ Rules:
- Entity types: person, organization, location, event, concept
- Relationship types: WORKS_AT, LIVES_AT, KNOWS, MARRIED_TO, PREFERS, DECIDED, RELATED_TO
- Confidence: 0.0-1.0
- Only extract what's explicitly stated or strongly implied
- Return empty arrays if nothing to extract
- Only extract SPECIFIC named entities: real people, companies, products, tools, places, events
- Do NOT extract generic technology terms (python, javascript, docker, linux, api, sql, html, css, json, etc.)
- Do NOT extract generic concepts (meeting, project, training, email, code, data, server, file, script, etc.)
- Do NOT extract programming abstractions (function, class, module, async, sync, process, etc.)
- Good entities: "Tarun", "Abundent Academy", "Tioman Island", "LiveKit", "Neo4j", "Fish Speech S1 Mini"
- Bad entities: "python", "ai", "automation", "email", "docker", "machine learning", "api"
- When in doubt, do NOT extract — fewer high-quality entities beat many generic ones
- Return empty arrays if nothing specific to extract
- Keep entity descriptions brief (1 sentence max)
- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous`;
@@ -112,6 +118,165 @@ export async function extractEntities(
}
}
/**
 * Normalize a tag name: lowercase, collapse hyphens/underscores to spaces,
 * collapse multiple spaces, trim. Ensures "machine-learning", "machine_learning",
 * and "machine learning" all resolve to the same tag node.
 *
 * @param name - raw tag name as produced by the LLM extraction
 * @returns canonical lowercase, space-separated tag name (may be empty)
 */
function normalizeTagName(name: string): string {
  const lowered = name.trim().toLowerCase();
  // Hyphen/underscore runs become a single space, then any remaining
  // whitespace runs are collapsed and edges trimmed.
  const dashless = lowered.replace(/[-_]+/g, " ");
  return dashless.replace(/\s+/g, " ").trim();
}
/**
 * Generic terms that should never be extracted as entities.
 * These are common technology/concept words that the LLM tends to
 * extract despite prompt instructions. Post-filter is more reliable
 * than prompt engineering alone.
 *
 * NOTE(review): every entry here is lowercase and `Set.has()` is
 * case-sensitive, so the filter only works if entity names are lowercased
 * (or lowercase-compared) before the lookup — confirm at the call site in
 * validateExtractionResult; otherwise "Python" would slip past "python".
 * Matching is also exact, not substring: "python 3" is NOT blocked.
 */
const GENERIC_ENTITY_BLOCKLIST = new Set([
  // Programming languages & frameworks
  "python",
  "javascript",
  "typescript",
  "java",
  "go",
  "rust",
  "ruby",
  "php",
  "c",
  "c++",
  "c#",
  "swift",
  "kotlin",
  "bash",
  "shell",
  "html",
  "css",
  "sql",
  "nosql",
  "json",
  "xml",
  "yaml",
  "react",
  "vue",
  "angular",
  "svelte",
  "next.js",
  "express",
  "fastapi",
  "django",
  "flask",
  // Generic tech concepts
  "ai",
  "artificial intelligence",
  "machine learning",
  "deep learning",
  "neural network",
  "automation",
  "api",
  "rest api",
  "graphql",
  "webhook",
  "websocket",
  "database",
  "server",
  "client",
  "cloud",
  "microservice",
  "monolith",
  "frontend",
  "backend",
  "fullstack",
  "devops",
  "ci/cd",
  "deployment",
  // Generic tools/infra
  "docker",
  "kubernetes",
  "linux",
  "windows",
  "macos",
  "nginx",
  "apache",
  "git",
  "npm",
  "pnpm",
  "yarn",
  "pip",
  "node",
  "nodejs",
  "node.js",
  // Generic work concepts
  "meeting",
  "project",
  "training",
  "email",
  "calendar",
  "task",
  "ticket",
  "code",
  "data",
  "file",
  "folder",
  "directory",
  "script",
  "module",
  "debug",
  "deploy",
  "build",
  "release",
  "update",
  "upgrade",
  "user",
  "admin",
  "system",
  "service",
  "process",
  "job",
  "worker",
  // Programming abstractions
  "function",
  "class",
  "method",
  "variable",
  "object",
  "array",
  "string",
  "async",
  "sync",
  "promise",
  "callback",
  "event",
  "hook",
  "middleware",
  "component",
  "plugin",
  "extension",
  "library",
  "package",
  "dependency",
  // Generic descriptors
  "app",
  "application",
  "web",
  "mobile",
  "desktop",
  "browser",
  "config",
  "configuration",
  "settings",
  "environment",
  "production",
  "staging",
  "error",
  "bug",
  "issue",
  "fix",
  "patch",
  "feature",
  "improvement",
]);
/**
* Validate and sanitize LLM extraction output.
*/
@@ -146,7 +311,7 @@ function validateExtractionResult(raw: Record<string, unknown>): ExtractionResul
: undefined,
description: typeof e.description === "string" ? e.description : undefined,
}))
.filter((e) => e.name.length > 0),
.filter((e) => e.name.length > 0 && !GENERIC_ENTITY_BLOCKLIST.has(e.name)),
relationships: relationships
.filter(
@@ -173,7 +338,7 @@ function validateExtractionResult(raw: Record<string, unknown>): ExtractionResul
typeof (t as Record<string, unknown>).name === "string",
)
.map((t) => ({
name: String(t.name).trim().toLowerCase(),
name: normalizeTagName(String(t.name)),
category: typeof t.category === "string" ? t.category : "topic",
}))
.filter((t) => t.name.length > 0),

View File

@@ -875,7 +875,7 @@ async function runAutoCapture(
for (const text of retained) {
allTexts.push(text);
allMeta.push({ text, source: "auto-capture", threshold: 0.5, discount: 1.0 });
allMeta.push({ text, source: "auto-capture", threshold: 0.65, discount: 1.0 });
}
for (const text of retainedAssistant) {
allTexts.push(text);

View File

@@ -1526,6 +1526,39 @@ export class Neo4jMemoryClient {
}
}
/**
 * Find tags with exactly 1 TAGGED relationship, older than minAgeDays.
 * Single-use tags add noise without providing useful cross-memory connections.
 * Only prunes tags that have had enough time to accrue additional references.
 *
 * @param minAgeDays - minimum tag age (by createdAt) before it is eligible
 * @param limit - maximum number of tags returned in one pass
 * @returns id/name pairs for tags referenced by exactly one memory
 */
async findSingleUseTags(
  minAgeDays: number = 14,
  limit: number = 500,
): Promise<Array<{ id: string; name: string }>> {
  await this.ensureInitialized();
  const session = this.driver!.session();
  try {
    // Tags newer than the cutoff are left alone — they may still accrue uses.
    const cutoffIso = new Date(Date.now() - minAgeDays * 24 * 60 * 60 * 1000).toISOString();
    const { records } = await session.run(
      `MATCH (t:Tag)
       WHERE t.createdAt < $cutoffDate
       WITH t
       MATCH (t)<-[:TAGGED]-(m:Memory)
       WITH t, count(m) AS usageCount
       WHERE usageCount = 1
       RETURN t.id AS id, t.name AS name
       LIMIT $limit`,
      { cutoffDate: cutoffIso, limit: neo4j.int(limit) },
    );
    return records.map((record) => {
      const id = record.get("id") as string;
      const name = record.get("name") as string;
      return { id, name };
    });
  } finally {
    await session.close();
  }
}
// --------------------------------------------------------------------------
// Sleep Cycle: Conflict Detection
// --------------------------------------------------------------------------
@@ -1864,4 +1897,129 @@ export class Neo4jMemoryClient {
}
throw lastError;
}
// --------------------------------------------------------------------------
// Sleep Cycle: Entity Deduplication
// --------------------------------------------------------------------------
/**
 * Find entity pairs that are likely duplicates based on name containment.
 * Returns pairs where one entity name is a substring of another (same type),
 * which catches the most common dedup patterns:
 * - "fish speech" → "fish speech s1 mini"
 * - "aaditya" → "aaditya sukhani"
 * - "abundent" → "abundent academy"
 *
 * NOTE(review): the `agentId` parameter is accepted (and the sleep cycle
 * passes it) but is never referenced in the Cypher below, so candidate pairs
 * are matched across ALL agents. Confirm whether Entity nodes carry an agent
 * property and either wire it into the WHERE clause or drop the parameter.
 *
 * @param agentId - currently unused; see NOTE(review) above
 * @param limit - maximum number of candidate pairs returned per pass
 * @returns pairs where `keepId` is the canonical entity (more mentions, or
 *   shorter name on a tie) and `removeId` should be merged into it
 */
async findDuplicateEntityPairs(
  agentId?: string,
  limit: number = 200,
): Promise<
  Array<{
    keepId: string;
    keepName: string;
    removeId: string;
    removeName: string;
    keepMentions: number;
    removeMentions: number;
  }>
> {
  await this.ensureInitialized();
  const session = this.driver!.session();
  try {
    // Find pairs where one name contains the other (same type),
    // OR one entity's alias matches the other's name.
    // Keep the entity with more mentions, or the shorter/more canonical name
    // if mention counts are equal.
    //
    // NOTE(review): MATCH (e1:Entity), (e2:Entity) is a cartesian product,
    // halved by `e1.name < e2.name` — still O(n^2) over entities. Fine at
    // small scale; revisit if the entity count grows large.
    // The alias check lowercases the alias but compares against the raw
    // node name — this assumes entity names are stored lowercase; confirm.
    const result = await session.run(
      `MATCH (e1:Entity), (e2:Entity)
       WHERE e1.name < e2.name
       AND e1.type = e2.type
       AND size(e1.name) > 2
       AND size(e2.name) > 2
       AND (
         e1.name CONTAINS e2.name
         OR e2.name CONTAINS e1.name
         OR ANY(alias IN coalesce(e1.aliases, []) WHERE toLower(alias) = e2.name)
         OR ANY(alias IN coalesce(e2.aliases, []) WHERE toLower(alias) = e1.name)
       )
       WITH e1, e2,
            coalesce(e1.mentionCount, 0) AS mc1,
            coalesce(e2.mentionCount, 0) AS mc2
       RETURN e1.id AS id1, e1.name AS name1, mc1,
              e2.id AS id2, e2.name AS name2, mc2
       LIMIT $limit`,
      { limit: neo4j.int(limit) },
    );
    return result.records.map((r) => {
      const name1 = r.get("name1") as string;
      const name2 = r.get("name2") as string;
      // NOTE(review): `as number` assumes the driver returns plain JS
      // numbers; with default config Neo4j integers come back as Integer
      // objects — confirm disableLosslessIntegers (or similar) is set.
      const mc1 = (r.get("mc1") as number) ?? 0;
      const mc2 = (r.get("mc2") as number) ?? 0;
      const id1 = r.get("id1") as string;
      const id2 = r.get("id2") as string;
      // Keep the entity with more mentions; if tied, keep the shorter (more canonical) name
      const keepFirst = mc1 > mc2 || (mc1 === mc2 && name1.length <= name2.length);
      return {
        keepId: keepFirst ? id1 : id2,
        keepName: keepFirst ? name1 : name2,
        removeId: keepFirst ? id2 : id1,
        removeName: keepFirst ? name2 : name1,
        keepMentions: keepFirst ? mc1 : mc2,
        removeMentions: keepFirst ? mc2 : mc1,
      };
    });
  } finally {
    await session.close();
  }
}
/**
 * Merge two entities: transfer MENTIONS relationships from source to target,
 * update mention count, then delete the source entity.
 * Inter-entity relationships on the source are dropped (they'll be
 * re-created by future extractions against the canonical entity).
 *
 * @param keepId - id of the canonical entity that survives the merge
 * @param removeId - id of the duplicate entity folded into keepId
 * @returns true when the merge transaction committed, false on failure
 *   (after transient retries are exhausted); never throws
 */
async mergeEntityPair(keepId: string, removeId: string): Promise<boolean> {
  await this.ensureInitialized();
  try {
    // The try/catch must sit OUTSIDE retryOnTransient: the previous version
    // caught inside the retried closure, so every error was swallowed and
    // converted to `false` before retryOnTransient could observe it —
    // transient Neo4j failures were never actually retried.
    await this.retryOnTransient(async () => {
      const session = this.driver!.session();
      try {
        await session.executeWrite(async (tx) => {
          // Transfer MENTIONS relationships from removed entity to kept
          // entity. MERGE keeps the transfer idempotent when a memory
          // already mentions the kept entity (DELETE just drops the dup).
          const transferred = await tx.run(
            `MATCH (remove:Entity {id: $removeId})<-[r:MENTIONS]-(m:Memory)
             MATCH (keep:Entity {id: $keepId})
             MERGE (m)-[:MENTIONS]->(keep)
             DELETE r
             RETURN count(*) AS transferred`,
            { removeId, keepId },
          );
          // count(*) may come back as a neo4j Integer object depending on
          // driver config; Number() converts both plain numbers and
          // Integers (via toString) instead of lying with an `as number`.
          const transferCount = Number(transferred.records[0]?.get("transferred") ?? 0);
          // Update kept entity's mention count
          if (transferCount > 0) {
            await tx.run(
              `MATCH (e:Entity {id: $keepId})
               SET e.mentionCount = coalesce(e.mentionCount, 0) + $count,
                   e.lastSeen = $now`,
              { keepId, count: neo4j.int(transferCount), now: new Date().toISOString() },
            );
          }
          // Delete the removed entity (DETACH removes all remaining relationships)
          await tx.run(`MATCH (e:Entity {id: $removeId}) DETACH DELETE e`, { removeId });
        });
      } finally {
        await session.close();
      }
    });
    return true;
  } catch {
    // Non-transient failure (or retries exhausted): preserve the original
    // boolean contract — report failure rather than throwing, so the sleep
    // cycle can continue with the remaining pairs.
    return false;
  }
}
}

View File

@@ -1,17 +1,18 @@
/**
* Seven-phase sleep cycle for memory consolidation.
* Eight-phase sleep cycle for memory consolidation.
*
* Implements a Pareto-based memory ecosystem where core memory
* is bounded to the top 20% of memories by effective score.
*
* Phases:
* 1. DEDUPLICATION - Merge near-duplicate memories (reduce redundancy)
* 1d. ENTITY DEDUP - Merge near-duplicate entities (reduce entity bloat)
* 2. PARETO SCORING - Calculate effective scores for all memories
* 3. CORE PROMOTION - Regular memories above threshold -> core
* 4. CORE DEMOTION - Core memories below threshold -> regular
* 4. EXTRACTION - Form entity relationships (strengthen connections)
* 5. DECAY/PRUNING - Remove old, low-importance memories (forgetting curve)
* 6. EXTRACTION - Form entity relationships (strengthen connections)
* 7. CLEANUP - Remove orphaned entities/tags (garbage collection)
* 6. CLEANUP - Remove orphaned entities/tags (garbage collection)
* 7. NOISE CLEANUP - Remove dangerous pattern memories
*
* Research basis:
* - Pareto principle (20/80 rule) for memory tiering
@@ -47,6 +48,11 @@ export type SleepCycleResult = {
pairsChecked: number;
duplicatesMerged: number;
};
// Phase 1d: Entity Deduplication
entityDedup: {
pairsFound: number;
merged: number;
};
// Phase 2: Pareto Scoring & Threshold
pareto: {
totalMemories: number;
@@ -74,6 +80,7 @@ export type SleepCycleResult = {
cleanup: {
entitiesRemoved: number;
tagsRemoved: number;
singleUseTagsRemoved: number;
};
// Overall
durationMs: number;
@@ -104,6 +111,9 @@ export type SleepCycleOptions = {
extractionBatchSize?: number; // Memories per batch (default: 50)
extractionDelayMs?: number; // Delay between batches (default: 1000)
// Phase 5: Cleanup
singleUseTagMinAgeDays?: number; // Min age before single-use tag pruning (default: 14)
// Phase 4: Decay
decayRetentionThreshold?: number; // Below this, memory is pruned (default: 0.1)
decayBaseHalfLifeDays?: number; // Base half-life in days (default: 30)
@@ -116,6 +126,7 @@ export type SleepCycleOptions = {
| "dedup"
| "conflict"
| "semanticDedup"
| "entityDedup"
| "pareto"
| "promotion"
| "decay"
@@ -168,6 +179,7 @@ export async function runSleepCycle(
decayCurves,
extractionBatchSize = 50,
extractionDelayMs = 1000,
singleUseTagMinAgeDays = 14,
onPhaseStart,
onProgress,
} = options;
@@ -176,6 +188,7 @@ export async function runSleepCycle(
dedup: { clustersFound: 0, memoriesMerged: 0 },
conflict: { pairsFound: 0, resolved: 0, invalidated: 0 },
semanticDedup: { pairsChecked: 0, duplicatesMerged: 0 },
entityDedup: { pairsFound: 0, merged: 0 },
pareto: {
totalMemories: 0,
coreMemories: 0,
@@ -185,7 +198,7 @@ export async function runSleepCycle(
promotion: { candidatesFound: 0, promoted: 0 },
decay: { memoriesPruned: 0 },
extraction: { total: 0, processed: 0, succeeded: 0, failed: 0 },
cleanup: { entitiesRemoved: 0, tagsRemoved: 0 },
cleanup: { entitiesRemoved: 0, tagsRemoved: 0, singleUseTagsRemoved: 0 },
durationMs: 0,
aborted: false,
};
@@ -419,6 +432,51 @@ export async function runSleepCycle(
}
}
// --------------------------------------------------------------------------
// Phase 1d: Entity Deduplication
// Merge entities where one name is a substring of another (same type).
// Catches: "fish speech" → "fish speech s1 mini", "aaditya" → "aaditya sukhani"
// Transfers MENTIONS relationships to the canonical entity, then deletes the duplicate.
// --------------------------------------------------------------------------
if (!abortSignal?.aborted) {
onPhaseStart?.("entityDedup");
logger.info("memory-neo4j: [sleep] Phase 1d: Entity Deduplication");
try {
const pairs = await db.findDuplicateEntityPairs(agentId);
result.entityDedup.pairsFound = pairs.length;
// Track removed entity IDs to skip cascading merges on already-deleted entities
const removedIds = new Set<string>();
for (const pair of pairs) {
if (abortSignal?.aborted) {
break;
}
// Skip if either entity was already removed in a previous merge
if (removedIds.has(pair.keepId) || removedIds.has(pair.removeId)) {
continue;
}
const merged = await db.mergeEntityPair(pair.keepId, pair.removeId);
if (merged) {
removedIds.add(pair.removeId);
result.entityDedup.merged++;
onProgress?.(
"entityDedup",
`Merged "${pair.removeName}" → "${pair.keepName}" (${pair.removeMentions} mentions transferred)`,
);
}
}
logger.info(
`memory-neo4j: [sleep] Phase 1d complete — ${result.entityDedup.pairsFound} pairs found, ${result.entityDedup.merged} merged`,
);
} catch (err) {
logger.warn(`memory-neo4j: [sleep] Phase 1d error: ${String(err)}`);
}
}
// --------------------------------------------------------------------------
// Phase 2: Pareto Scoring & Threshold Calculation
// --------------------------------------------------------------------------
@@ -438,6 +496,8 @@ export async function runSleepCycle(
paretoThreshold = db.calculateParetoThreshold(allScores, 1 - paretoPercentile);
result.pareto.threshold = paretoThreshold;
const otherCount = allScores.filter((s) => s.category === "other").length;
onProgress?.(
"pareto",
`Scored ${allScores.length} memories (${result.pareto.coreMemories} core, ${result.pareto.regularMemories} regular)`,
@@ -447,6 +507,17 @@ export async function runSleepCycle(
`Pareto threshold (top ${paretoPercentile * 100}%): ${paretoThreshold.toFixed(4)}`,
);
if (otherCount > 0) {
const otherPct = ((otherCount / allScores.length) * 100).toFixed(1);
onProgress?.(
"pareto",
`⚠️ "other" category: ${otherCount} memories (${otherPct}%) — monitor for conversational noise`,
);
logger.info(
`memory-neo4j: [sleep] "other" category monitor: ${otherCount}/${allScores.length} (${otherPct}%)`,
);
}
logger.info(
`memory-neo4j: [sleep] Phase 2 complete — threshold=${paretoThreshold.toFixed(4)} for top ${paretoPercentile * 100}%`,
);
@@ -649,8 +720,23 @@ export async function runSleepCycle(
}
}
// Prune single-use tags (only 1 memory reference, older than threshold)
// These add noise without providing useful cross-memory connections.
if (!abortSignal?.aborted) {
const singleUseTags = await db.findSingleUseTags(singleUseTagMinAgeDays);
if (singleUseTags.length > 0) {
result.cleanup.singleUseTagsRemoved = await db.deleteOrphanTags(
singleUseTags.map((t) => t.id),
);
onProgress?.(
"cleanup",
`Removed ${result.cleanup.singleUseTagsRemoved} single-use tags (>${singleUseTagMinAgeDays}d old)`,
);
}
}
logger.info(
`memory-neo4j: [sleep] Phase 6 complete — ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} tags removed`,
`memory-neo4j: [sleep] Phase 6 complete — ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} orphan tags, ${result.cleanup.singleUseTagsRemoved} single-use tags removed`,
);
} catch (err) {
logger.warn(`memory-neo4j: [sleep] Phase 6 error: ${String(err)}`);