memory-neo4j: single-use tag pruning, alias-based entity dedup, tag normalization

- Add findSingleUseTags() to prune tags with only 1 reference after 14 days
- Enhance findDuplicateEntityPairs() to match on entity aliases
- Add normalizeTagName() to collapse hyphens/underscores to spaces
- Monitor 'other' category accumulation in sleep cycle Phase 2
- Tighten extraction prompt with explicit entity blocklist (80 terms)
- Raise auto-capture threshold from 0.5 to 0.65
- Fix tests for entity dedup phase and skipPromotion default
This commit is contained in:
Tarun Sukhani
2026-02-14 08:42:51 +08:00
parent 08b08c66f1
commit 4d54736b98
5 changed files with 426 additions and 12 deletions

View File

@@ -1770,6 +1770,8 @@ describe("runSleepCycle", () => {
findOrphanTags: vi.fn().mockResolvedValue([]),
deleteOrphanTags: vi.fn().mockResolvedValue(0),
updateExtractionStatus: vi.fn().mockResolvedValue(undefined),
findDuplicateEntityPairs: vi.fn().mockResolvedValue([]),
mergeEntityPair: vi.fn().mockResolvedValue(true),
};
});
@@ -2193,6 +2195,7 @@ describe("runSleepCycle", () => {
mockDb.promoteToCore.mockResolvedValue(1);
const result = await runSleepCycle(mockDb, mockEmbeddings, mockConfig, mockLogger, {
skipPromotion: false,
paretoPercentile: 0.2,
promotionMinAgeDays: 7,
});
@@ -2219,6 +2222,7 @@ describe("runSleepCycle", () => {
mockDb.calculateParetoThreshold.mockReturnValue(0.5);
const result = await runSleepCycle(mockDb, mockEmbeddings, mockConfig, mockLogger, {
skipPromotion: false,
promotionMinAgeDays: 7,
});
@@ -2544,8 +2548,8 @@ describe("runSleepCycle", () => {
expect(onPhaseStart).toHaveBeenCalledWith("dedup");
expect(onPhaseStart).toHaveBeenCalledWith("conflict");
expect(onPhaseStart).toHaveBeenCalledWith("semanticDedup");
expect(onPhaseStart).toHaveBeenCalledWith("entityDedup");
expect(onPhaseStart).toHaveBeenCalledWith("pareto");
expect(onPhaseStart).toHaveBeenCalledWith("promotion");
expect(onPhaseStart).toHaveBeenCalledWith("extraction");
expect(onPhaseStart).toHaveBeenCalledWith("decay");
expect(onPhaseStart).toHaveBeenCalledWith("cleanup");
@@ -2579,6 +2583,7 @@ describe("runSleepCycle", () => {
expect(result).toHaveProperty("dedup");
expect(result).toHaveProperty("conflict");
expect(result).toHaveProperty("semanticDedup");
expect(result).toHaveProperty("entityDedup");
expect(result).toHaveProperty("pareto");
expect(result).toHaveProperty("promotion");
expect(result).toHaveProperty("decay");

View File

@@ -46,8 +46,14 @@ Rules:
- Entity types: person, organization, location, event, concept
- Relationship types: WORKS_AT, LIVES_AT, KNOWS, MARRIED_TO, PREFERS, DECIDED, RELATED_TO
- Confidence: 0.0-1.0
- Only extract what's explicitly stated or strongly implied
- Return empty arrays if nothing to extract
- Only extract SPECIFIC named entities: real people, companies, products, tools, places, events
- Do NOT extract generic technology terms (python, javascript, docker, linux, api, sql, html, css, json, etc.)
- Do NOT extract generic concepts (meeting, project, training, email, code, data, server, file, script, etc.)
- Do NOT extract programming abstractions (function, class, module, async, sync, process, etc.)
- Good entities: "Tarun", "Abundent Academy", "Tioman Island", "LiveKit", "Neo4j", "Fish Speech S1 Mini"
- Bad entities: "python", "ai", "automation", "email", "docker", "machine learning", "api"
- When in doubt, do NOT extract — fewer high-quality entities beat many generic ones
- Return empty arrays if nothing specific to extract
- Keep entity descriptions brief (1 sentence max)
- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous`;
@@ -112,6 +118,165 @@ export async function extractEntities(
}
}
/**
 * Normalize a tag name: lowercase, collapse hyphens/underscores to spaces,
 * collapse multiple spaces, trim. Ensures "machine-learning", "machine_learning",
 * and "machine learning" all resolve to the same tag node.
 *
 * @param name - raw tag name as produced by the LLM extraction
 * @returns canonical lowercase, space-separated tag name (may be empty)
 */
function normalizeTagName(name: string): string {
  const lowered = name.trim().toLowerCase();
  // Hyphen/underscore runs become a single space, then any remaining
  // whitespace runs are collapsed and edges trimmed.
  const dashless = lowered.replace(/[-_]+/g, " ");
  return dashless.replace(/\s+/g, " ").trim();
}
/**
 * Generic terms that should never be extracted as entities.
 * These are common technology/concept words that the LLM tends to
 * extract despite prompt instructions. Post-filter is more reliable
 * than prompt engineering alone.
 *
 * NOTE(review): every entry here is lowercase and `Set.has()` is
 * case-sensitive, so the filter only works if entity names are lowercased
 * (or lowercase-compared) before the lookup — confirm at the call site in
 * validateExtractionResult; otherwise "Python" would slip past "python".
 * Matching is also exact, not substring: "python 3" is NOT blocked.
 */
const GENERIC_ENTITY_BLOCKLIST = new Set([
  // Programming languages & frameworks
  "python",
  "javascript",
  "typescript",
  "java",
  "go",
  "rust",
  "ruby",
  "php",
  "c",
  "c++",
  "c#",
  "swift",
  "kotlin",
  "bash",
  "shell",
  "html",
  "css",
  "sql",
  "nosql",
  "json",
  "xml",
  "yaml",
  "react",
  "vue",
  "angular",
  "svelte",
  "next.js",
  "express",
  "fastapi",
  "django",
  "flask",
  // Generic tech concepts
  "ai",
  "artificial intelligence",
  "machine learning",
  "deep learning",
  "neural network",
  "automation",
  "api",
  "rest api",
  "graphql",
  "webhook",
  "websocket",
  "database",
  "server",
  "client",
  "cloud",
  "microservice",
  "monolith",
  "frontend",
  "backend",
  "fullstack",
  "devops",
  "ci/cd",
  "deployment",
  // Generic tools/infra
  "docker",
  "kubernetes",
  "linux",
  "windows",
  "macos",
  "nginx",
  "apache",
  "git",
  "npm",
  "pnpm",
  "yarn",
  "pip",
  "node",
  "nodejs",
  "node.js",
  // Generic work concepts
  "meeting",
  "project",
  "training",
  "email",
  "calendar",
  "task",
  "ticket",
  "code",
  "data",
  "file",
  "folder",
  "directory",
  "script",
  "module",
  "debug",
  "deploy",
  "build",
  "release",
  "update",
  "upgrade",
  "user",
  "admin",
  "system",
  "service",
  "process",
  "job",
  "worker",
  // Programming abstractions
  "function",
  "class",
  "method",
  "variable",
  "object",
  "array",
  "string",
  "async",
  "sync",
  "promise",
  "callback",
  "event",
  "hook",
  "middleware",
  "component",
  "plugin",
  "extension",
  "library",
  "package",
  "dependency",
  // Generic descriptors
  "app",
  "application",
  "web",
  "mobile",
  "desktop",
  "browser",
  "config",
  "configuration",
  "settings",
  "environment",
  "production",
  "staging",
  "error",
  "bug",
  "issue",
  "fix",
  "patch",
  "feature",
  "improvement",
]);
/**
* Validate and sanitize LLM extraction output.
*/
@@ -146,7 +311,7 @@ function validateExtractionResult(raw: Record<string, unknown>): ExtractionResul
: undefined,
description: typeof e.description === "string" ? e.description : undefined,
}))
.filter((e) => e.name.length > 0),
.filter((e) => e.name.length > 0 && !GENERIC_ENTITY_BLOCKLIST.has(e.name)),
relationships: relationships
.filter(
@@ -173,7 +338,7 @@ function validateExtractionResult(raw: Record<string, unknown>): ExtractionResul
typeof (t as Record<string, unknown>).name === "string",
)
.map((t) => ({
name: String(t.name).trim().toLowerCase(),
name: normalizeTagName(String(t.name)),
category: typeof t.category === "string" ? t.category : "topic",
}))
.filter((t) => t.name.length > 0),

View File

@@ -875,7 +875,7 @@ async function runAutoCapture(
for (const text of retained) {
allTexts.push(text);
allMeta.push({ text, source: "auto-capture", threshold: 0.5, discount: 1.0 });
allMeta.push({ text, source: "auto-capture", threshold: 0.65, discount: 1.0 });
}
for (const text of retainedAssistant) {
allTexts.push(text);

View File

@@ -1526,6 +1526,39 @@ export class Neo4jMemoryClient {
}
}
/**
 * Find tags with exactly 1 TAGGED relationship, older than minAgeDays.
 * Single-use tags add noise without providing useful cross-memory connections.
 * Only prunes tags that have had enough time to accrue additional references.
 *
 * @param minAgeDays - minimum tag age (by createdAt) before it is eligible
 * @param limit - maximum number of tags returned in one pass
 * @returns id/name pairs for tags referenced by exactly one memory
 */
async findSingleUseTags(
  minAgeDays: number = 14,
  limit: number = 500,
): Promise<Array<{ id: string; name: string }>> {
  await this.ensureInitialized();
  const session = this.driver!.session();
  try {
    // Tags newer than the cutoff are left alone — they may still accrue uses.
    const cutoffIso = new Date(Date.now() - minAgeDays * 24 * 60 * 60 * 1000).toISOString();
    const { records } = await session.run(
      `MATCH (t:Tag)
       WHERE t.createdAt < $cutoffDate
       WITH t
       MATCH (t)<-[:TAGGED]-(m:Memory)
       WITH t, count(m) AS usageCount
       WHERE usageCount = 1
       RETURN t.id AS id, t.name AS name
       LIMIT $limit`,
      { cutoffDate: cutoffIso, limit: neo4j.int(limit) },
    );
    return records.map((record) => {
      const id = record.get("id") as string;
      const name = record.get("name") as string;
      return { id, name };
    });
  } finally {
    await session.close();
  }
}
// --------------------------------------------------------------------------
// Sleep Cycle: Conflict Detection
// --------------------------------------------------------------------------
@@ -1864,4 +1897,129 @@ export class Neo4jMemoryClient {
}
throw lastError;
}
// --------------------------------------------------------------------------
// Sleep Cycle: Entity Deduplication
// --------------------------------------------------------------------------
/**
 * Find entity pairs that are likely duplicates based on name containment.
 * Returns pairs where one entity name is a substring of another (same type),
 * which catches the most common dedup patterns:
 * - "fish speech" → "fish speech s1 mini"
 * - "aaditya" → "aaditya sukhani"
 * - "abundent" → "abundent academy"
 *
 * NOTE(review): the `agentId` parameter is accepted (and the sleep cycle
 * passes it) but is never referenced in the Cypher below, so candidate pairs
 * are matched across ALL agents. Confirm whether Entity nodes carry an agent
 * property and either wire it into the WHERE clause or drop the parameter.
 *
 * @param agentId - currently unused; see NOTE(review) above
 * @param limit - maximum number of candidate pairs returned per pass
 * @returns pairs where `keepId` is the canonical entity (more mentions, or
 *   shorter name on a tie) and `removeId` should be merged into it
 */
async findDuplicateEntityPairs(
  agentId?: string,
  limit: number = 200,
): Promise<
  Array<{
    keepId: string;
    keepName: string;
    removeId: string;
    removeName: string;
    keepMentions: number;
    removeMentions: number;
  }>
> {
  await this.ensureInitialized();
  const session = this.driver!.session();
  try {
    // Find pairs where one name contains the other (same type),
    // OR one entity's alias matches the other's name.
    // Keep the entity with more mentions, or the shorter/more canonical name
    // if mention counts are equal.
    //
    // NOTE(review): MATCH (e1:Entity), (e2:Entity) is a cartesian product,
    // halved by `e1.name < e2.name` — still O(n^2) over entities. Fine at
    // small scale; revisit if the entity count grows large.
    // The alias check lowercases the alias but compares against the raw
    // node name — this assumes entity names are stored lowercase; confirm.
    const result = await session.run(
      `MATCH (e1:Entity), (e2:Entity)
       WHERE e1.name < e2.name
       AND e1.type = e2.type
       AND size(e1.name) > 2
       AND size(e2.name) > 2
       AND (
         e1.name CONTAINS e2.name
         OR e2.name CONTAINS e1.name
         OR ANY(alias IN coalesce(e1.aliases, []) WHERE toLower(alias) = e2.name)
         OR ANY(alias IN coalesce(e2.aliases, []) WHERE toLower(alias) = e1.name)
       )
       WITH e1, e2,
            coalesce(e1.mentionCount, 0) AS mc1,
            coalesce(e2.mentionCount, 0) AS mc2
       RETURN e1.id AS id1, e1.name AS name1, mc1,
              e2.id AS id2, e2.name AS name2, mc2
       LIMIT $limit`,
      { limit: neo4j.int(limit) },
    );
    return result.records.map((r) => {
      const name1 = r.get("name1") as string;
      const name2 = r.get("name2") as string;
      // NOTE(review): `as number` assumes the driver returns plain JS
      // numbers; with default config Neo4j integers come back as Integer
      // objects — confirm disableLosslessIntegers (or similar) is set.
      const mc1 = (r.get("mc1") as number) ?? 0;
      const mc2 = (r.get("mc2") as number) ?? 0;
      const id1 = r.get("id1") as string;
      const id2 = r.get("id2") as string;
      // Keep the entity with more mentions; if tied, keep the shorter (more canonical) name
      const keepFirst = mc1 > mc2 || (mc1 === mc2 && name1.length <= name2.length);
      return {
        keepId: keepFirst ? id1 : id2,
        keepName: keepFirst ? name1 : name2,
        removeId: keepFirst ? id2 : id1,
        removeName: keepFirst ? name2 : name1,
        keepMentions: keepFirst ? mc1 : mc2,
        removeMentions: keepFirst ? mc2 : mc1,
      };
    });
  } finally {
    await session.close();
  }
}
/**
 * Merge two entities: transfer MENTIONS relationships from source to target,
 * update mention count, then delete the source entity.
 * Inter-entity relationships on the source are dropped (they'll be
 * re-created by future extractions against the canonical entity).
 *
 * @param keepId - id of the canonical entity that survives the merge
 * @param removeId - id of the duplicate entity folded into keepId
 * @returns true when the merge transaction committed, false on failure
 *   (after transient retries are exhausted); never throws
 */
async mergeEntityPair(keepId: string, removeId: string): Promise<boolean> {
  await this.ensureInitialized();
  try {
    // The try/catch must sit OUTSIDE retryOnTransient: the previous version
    // caught inside the retried closure, so every error was swallowed and
    // converted to `false` before retryOnTransient could observe it —
    // transient Neo4j failures were never actually retried.
    await this.retryOnTransient(async () => {
      const session = this.driver!.session();
      try {
        await session.executeWrite(async (tx) => {
          // Transfer MENTIONS relationships from removed entity to kept
          // entity. MERGE keeps the transfer idempotent when a memory
          // already mentions the kept entity (DELETE just drops the dup).
          const transferred = await tx.run(
            `MATCH (remove:Entity {id: $removeId})<-[r:MENTIONS]-(m:Memory)
             MATCH (keep:Entity {id: $keepId})
             MERGE (m)-[:MENTIONS]->(keep)
             DELETE r
             RETURN count(*) AS transferred`,
            { removeId, keepId },
          );
          // count(*) may come back as a neo4j Integer object depending on
          // driver config; Number() converts both plain numbers and
          // Integers (via toString) instead of lying with an `as number`.
          const transferCount = Number(transferred.records[0]?.get("transferred") ?? 0);
          // Update kept entity's mention count
          if (transferCount > 0) {
            await tx.run(
              `MATCH (e:Entity {id: $keepId})
               SET e.mentionCount = coalesce(e.mentionCount, 0) + $count,
                   e.lastSeen = $now`,
              { keepId, count: neo4j.int(transferCount), now: new Date().toISOString() },
            );
          }
          // Delete the removed entity (DETACH removes all remaining relationships)
          await tx.run(`MATCH (e:Entity {id: $removeId}) DETACH DELETE e`, { removeId });
        });
      } finally {
        await session.close();
      }
    });
    return true;
  } catch {
    // Non-transient failure (or retries exhausted): preserve the original
    // boolean contract — report failure rather than throwing, so the sleep
    // cycle can continue with the remaining pairs.
    return false;
  }
}
}

View File

@@ -1,17 +1,18 @@
/**
* Seven-phase sleep cycle for memory consolidation.
* Eight-phase sleep cycle for memory consolidation.
*
* Implements a Pareto-based memory ecosystem where core memory
* is bounded to the top 20% of memories by effective score.
*
* Phases:
* 1. DEDUPLICATION - Merge near-duplicate memories (reduce redundancy)
* 1d. ENTITY DEDUP - Merge near-duplicate entities (reduce entity bloat)
* 2. PARETO SCORING - Calculate effective scores for all memories
* 3. CORE PROMOTION - Regular memories above threshold -> core
* 4. CORE DEMOTION - Core memories below threshold -> regular
* 4. EXTRACTION - Form entity relationships (strengthen connections)
* 5. DECAY/PRUNING - Remove old, low-importance memories (forgetting curve)
* 6. EXTRACTION - Form entity relationships (strengthen connections)
* 7. CLEANUP - Remove orphaned entities/tags (garbage collection)
* 6. CLEANUP - Remove orphaned entities/tags (garbage collection)
* 7. NOISE CLEANUP - Remove dangerous pattern memories
*
* Research basis:
* - Pareto principle (20/80 rule) for memory tiering
@@ -47,6 +48,11 @@ export type SleepCycleResult = {
pairsChecked: number;
duplicatesMerged: number;
};
// Phase 1d: Entity Deduplication
entityDedup: {
pairsFound: number;
merged: number;
};
// Phase 2: Pareto Scoring & Threshold
pareto: {
totalMemories: number;
@@ -74,6 +80,7 @@ export type SleepCycleResult = {
cleanup: {
entitiesRemoved: number;
tagsRemoved: number;
singleUseTagsRemoved: number;
};
// Overall
durationMs: number;
@@ -104,6 +111,9 @@ export type SleepCycleOptions = {
extractionBatchSize?: number; // Memories per batch (default: 50)
extractionDelayMs?: number; // Delay between batches (default: 1000)
// Phase 5: Cleanup
singleUseTagMinAgeDays?: number; // Min age before single-use tag pruning (default: 14)
// Phase 4: Decay
decayRetentionThreshold?: number; // Below this, memory is pruned (default: 0.1)
decayBaseHalfLifeDays?: number; // Base half-life in days (default: 30)
@@ -116,6 +126,7 @@ export type SleepCycleOptions = {
| "dedup"
| "conflict"
| "semanticDedup"
| "entityDedup"
| "pareto"
| "promotion"
| "decay"
@@ -168,6 +179,7 @@ export async function runSleepCycle(
decayCurves,
extractionBatchSize = 50,
extractionDelayMs = 1000,
singleUseTagMinAgeDays = 14,
onPhaseStart,
onProgress,
} = options;
@@ -176,6 +188,7 @@ export async function runSleepCycle(
dedup: { clustersFound: 0, memoriesMerged: 0 },
conflict: { pairsFound: 0, resolved: 0, invalidated: 0 },
semanticDedup: { pairsChecked: 0, duplicatesMerged: 0 },
entityDedup: { pairsFound: 0, merged: 0 },
pareto: {
totalMemories: 0,
coreMemories: 0,
@@ -185,7 +198,7 @@ export async function runSleepCycle(
promotion: { candidatesFound: 0, promoted: 0 },
decay: { memoriesPruned: 0 },
extraction: { total: 0, processed: 0, succeeded: 0, failed: 0 },
cleanup: { entitiesRemoved: 0, tagsRemoved: 0 },
cleanup: { entitiesRemoved: 0, tagsRemoved: 0, singleUseTagsRemoved: 0 },
durationMs: 0,
aborted: false,
};
@@ -419,6 +432,51 @@ export async function runSleepCycle(
}
}
// --------------------------------------------------------------------------
// Phase 1d: Entity Deduplication
// Merge entities where one name is a substring of another (same type).
// Catches: "fish speech" → "fish speech s1 mini", "aaditya" → "aaditya sukhani"
// Transfers MENTIONS relationships to the canonical entity, then deletes the duplicate.
// --------------------------------------------------------------------------
if (!abortSignal?.aborted) {
onPhaseStart?.("entityDedup");
logger.info("memory-neo4j: [sleep] Phase 1d: Entity Deduplication");
try {
const pairs = await db.findDuplicateEntityPairs(agentId);
result.entityDedup.pairsFound = pairs.length;
// Track removed entity IDs to skip cascading merges on already-deleted entities
const removedIds = new Set<string>();
for (const pair of pairs) {
if (abortSignal?.aborted) {
break;
}
// Skip if either entity was already removed in a previous merge
if (removedIds.has(pair.keepId) || removedIds.has(pair.removeId)) {
continue;
}
const merged = await db.mergeEntityPair(pair.keepId, pair.removeId);
if (merged) {
removedIds.add(pair.removeId);
result.entityDedup.merged++;
onProgress?.(
"entityDedup",
`Merged "${pair.removeName}" → "${pair.keepName}" (${pair.removeMentions} mentions transferred)`,
);
}
}
logger.info(
`memory-neo4j: [sleep] Phase 1d complete — ${result.entityDedup.pairsFound} pairs found, ${result.entityDedup.merged} merged`,
);
} catch (err) {
logger.warn(`memory-neo4j: [sleep] Phase 1d error: ${String(err)}`);
}
}
// --------------------------------------------------------------------------
// Phase 2: Pareto Scoring & Threshold Calculation
// --------------------------------------------------------------------------
@@ -438,6 +496,8 @@ export async function runSleepCycle(
paretoThreshold = db.calculateParetoThreshold(allScores, 1 - paretoPercentile);
result.pareto.threshold = paretoThreshold;
const otherCount = allScores.filter((s) => s.category === "other").length;
onProgress?.(
"pareto",
`Scored ${allScores.length} memories (${result.pareto.coreMemories} core, ${result.pareto.regularMemories} regular)`,
@@ -447,6 +507,17 @@ export async function runSleepCycle(
`Pareto threshold (top ${paretoPercentile * 100}%): ${paretoThreshold.toFixed(4)}`,
);
if (otherCount > 0) {
const otherPct = ((otherCount / allScores.length) * 100).toFixed(1);
onProgress?.(
"pareto",
`⚠️ "other" category: ${otherCount} memories (${otherPct}%) — monitor for conversational noise`,
);
logger.info(
`memory-neo4j: [sleep] "other" category monitor: ${otherCount}/${allScores.length} (${otherPct}%)`,
);
}
logger.info(
`memory-neo4j: [sleep] Phase 2 complete — threshold=${paretoThreshold.toFixed(4)} for top ${paretoPercentile * 100}%`,
);
@@ -649,8 +720,23 @@ export async function runSleepCycle(
}
}
// Prune single-use tags (only 1 memory reference, older than threshold)
// These add noise without providing useful cross-memory connections.
if (!abortSignal?.aborted) {
const singleUseTags = await db.findSingleUseTags(singleUseTagMinAgeDays);
if (singleUseTags.length > 0) {
result.cleanup.singleUseTagsRemoved = await db.deleteOrphanTags(
singleUseTags.map((t) => t.id),
);
onProgress?.(
"cleanup",
`Removed ${result.cleanup.singleUseTagsRemoved} single-use tags (>${singleUseTagMinAgeDays}d old)`,
);
}
}
logger.info(
`memory-neo4j: [sleep] Phase 6 complete — ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} tags removed`,
`memory-neo4j: [sleep] Phase 6 complete — ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} orphan tags, ${result.cleanup.singleUseTagsRemoved} single-use tags removed`,
);
} catch (err) {
logger.warn(`memory-neo4j: [sleep] Phase 6 error: ${String(err)}`);