mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-03 03:03:24 -04:00
memory-neo4j: improve tag coverage with stronger extraction + retroactive tagging
- Strengthen extraction prompt to always generate 2-4 tags per memory
- Add Phase 2b: Retroactive Tagging to sleep cycle for untagged memories
- Include 'skipped' memories in extraction pipeline (imported memories)
- Add listUntaggedMemories() helper to neo4j-client
- Add extractTagsOnly() lightweight prompt for tag-only extraction
- Add CLI display for Phase 2b stats

Fixes: 79% of memories had zero tags due to weak prompt guidance and imported memories never going through extraction.
This commit is contained in:
@@ -315,6 +315,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
|
||||
console.log(" Phase 1c: Conflict Detection — Resolve contradictory memories");
|
||||
console.log(" Phase 1d: Entity Dedup — Merge duplicate entity nodes");
|
||||
console.log(" Phase 2: Extraction — Extract entities and categorize");
|
||||
console.log(" Phase 2b: Retroactive Tagging — Tag memories missing topic tags");
|
||||
console.log(" Phase 3: Decay & Pruning — Remove stale low-importance memories");
|
||||
console.log(" Phase 4: Orphan Cleanup — Remove disconnected nodes");
|
||||
console.log(" Phase 5: Noise Cleanup — Remove dangerous pattern memories");
|
||||
@@ -399,6 +400,7 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
|
||||
conflict: "Phase 1c: Conflict Detection",
|
||||
entityDedup: "Phase 1d: Entity Deduplication",
|
||||
extraction: "Phase 2: Extraction",
|
||||
retroactiveTagging: "Phase 2b: Retroactive Tagging",
|
||||
decay: "Phase 3: Decay & Pruning",
|
||||
cleanup: "Phase 4: Orphan Cleanup",
|
||||
noiseCleanup: "Phase 5: Noise Cleanup",
|
||||
@@ -430,6 +432,12 @@ export function registerCli(api: OpenClawPluginApi, deps: CliDeps): void {
|
||||
` Extraction: ${result.extraction.succeeded}/${result.extraction.total} extracted` +
|
||||
(result.extraction.failed > 0 ? ` (${result.extraction.failed} failed)` : ""),
|
||||
);
|
||||
console.log(
|
||||
` Retro-Tagging: ${result.retroactiveTagging.tagged}/${result.retroactiveTagging.total} tagged` +
|
||||
(result.retroactiveTagging.failed > 0
|
||||
? ` (${result.retroactiveTagging.failed} failed)`
|
||||
: ""),
|
||||
);
|
||||
console.log(
|
||||
` Cleanup: ${result.cleanup.entitiesRemoved} entities, ${result.cleanup.tagsRemoved} tags removed`,
|
||||
);
|
||||
|
||||
@@ -53,9 +53,40 @@ Rules:
|
||||
- Good entities: "Tarun", "Abundent Academy", "Tioman Island", "LiveKit", "Neo4j", "Fish Speech S1 Mini"
|
||||
- Bad entities: "python", "ai", "automation", "email", "docker", "machine learning", "api"
|
||||
- When in doubt, do NOT extract — fewer high-quality entities beat many generic ones
|
||||
- Return empty arrays if nothing specific to extract
|
||||
- Keep entity descriptions brief (1 sentence max)
|
||||
- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous`;
|
||||
- Category: "preference" for opinions/preferences, "fact" for factual info, "decision" for choices made, "entity" for entity-focused, "other" for miscellaneous
|
||||
- ALWAYS generate at least 2 tags. Every memory has a topic — there are no exceptions.
|
||||
- Tags describe the TOPIC or DOMAIN of the memory, not the entities themselves.
|
||||
- Do NOT use entity names as tags (e.g., don't tag "tarun" if Tarun is already an entity).
|
||||
- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration"
|
||||
- Tag categories: "topic", "domain", "workflow", "technology", "personal", "business"
|
||||
- Return empty entity/relationship arrays if nothing specific to extract, but NEVER return empty tags.`;
|
||||
|
||||
// ============================================================================
|
||||
// Retroactive Tagging Prompt
|
||||
// ============================================================================
|
||||
|
||||
/**
 * System prompt for the retroactive tagging pass.
 *
 * extractTagsOnly sends this as the "system" message and the memory text as
 * the "user" message. The prompt asks for tags only (no entities or
 * relationships): 2-4 topic tags returned as a JSON object with a `tags`
 * array of `{ name, category }` items, where `category` is one of
 * topic, domain, workflow, technology, personal or business.
 */
|
||||
const RETROACTIVE_TAGGING_SYSTEM = `You are a topic tagging system for a personal memory store.
|
||||
Generate 2-4 topic tags that describe what this memory is about.
|
||||
|
||||
Return JSON:
|
||||
{
|
||||
"tags": [
|
||||
{"name": "tag name", "category": "topic|domain|workflow|technology|personal|business"}
|
||||
]
|
||||
}
|
||||
|
||||
Rules:
|
||||
- Tags describe the TOPIC or DOMAIN of the memory, not specific people or tools mentioned.
|
||||
- Good tags: "travel planning", "family", "voice synthesis", "linkedin automation", "expense tracking", "cron scheduling", "api integration", "system configuration", "memory management"
|
||||
- Bad tags: names of people, companies, or specific tools (those are entities, not topics)
|
||||
- Tag categories: "topic" (general subject), "domain" (field/area), "workflow" (process/procedure), "technology" (tech area), "personal" (personal life), "business" (work/business)
|
||||
- ALWAYS return at least 2 tags. Every memory has a topic.
|
||||
- Normalize tag names to lowercase with spaces (no hyphens or underscores).`;
|
||||
|
||||
// ============================================================================
|
||||
// Entity Extraction
|
||||
@@ -118,6 +149,57 @@ export async function extractEntities(
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Extract only topic tags from a memory text using the lightweight
 * retroactive-tagging prompt (no entities or relationships are requested).
 * Used for retroactive tagging of memories that were extracted without tags.
 *
 * The model reply is expected to be a JSON object with a `tags` array of
 * `{ name, category }` items. Post-processing applied to that array:
 * - items that are not objects or have no string `name` are dropped;
 * - names are passed through normalizeTagName, and names that normalize to
 *   an empty string are dropped;
 * - tags that normalize to the same name are collapsed to the first one, so
 *   variants such as "api-integration" / "api integration" yield one tag;
 * - `category` is trimmed and lowercased and must be one of the six
 *   categories the prompt defines; anything else (missing, non-string, or an
 *   echoed "topic|domain|..." placeholder) falls back to "topic".
 *
 * @param text - Memory text, sent verbatim as the user message.
 * @param config - Extraction configuration; when `config.enabled` is false
 *   no request is made.
 * @param abortSignal - Optional signal forwarded to the LLM call.
 * @returns The cleaned tag list (possibly empty), or null when extraction is
 *   disabled, the LLM call throws or yields no content, or the reply cannot
 *   be parsed.
 */
export async function extractTagsOnly(
  text: string,
  config: ExtractionConfig,
  abortSignal?: AbortSignal,
): Promise<Array<{ name: string; category: string }> | null> {
  if (!config.enabled) {
    return null;
  }

  const messages = [
    { role: "system", content: RETROACTIVE_TAGGING_SYSTEM },
    { role: "user", content: text },
  ];

  let content: string | null;
  try {
    content = await callOpenRouterStream(config, messages, abortSignal);
  } catch {
    // Best-effort: a failed or aborted request is reported as "no result".
    return null;
  }

  if (!content) {
    return null;
  }

  // Categories offered by the tagging prompt; anything else is coerced to "topic".
  const allowedCategories = new Set([
    "topic",
    "domain",
    "workflow",
    "technology",
    "personal",
    "business",
  ]);

  try {
    // A literal `null` reply makes the property access below throw, which is
    // reported as a parse failure by the catch.
    const parsed = JSON.parse(content) as { tags?: unknown };
    const rawTags: unknown[] = Array.isArray(parsed.tags) ? parsed.tags : [];

    const seenNames = new Set<string>();
    const tags: Array<{ name: string; category: string }> = [];

    for (const raw of rawTags) {
      if (raw === null || typeof raw !== "object") {
        continue;
      }
      const candidate = raw as Record<string, unknown>;
      if (typeof candidate.name !== "string") {
        continue;
      }

      const name = normalizeTagName(candidate.name);
      if (name.length === 0 || seenNames.has(name)) {
        continue;
      }
      seenNames.add(name);

      const category =
        typeof candidate.category === "string" ? candidate.category.trim().toLowerCase() : "";
      tags.push({ name, category: allowedCategories.has(category) ? category : "topic" });
    }

    return tags;
  } catch {
    return null;
  }
}
|
||||
|
||||
/**
|
||||
* Normalize a tag name: lowercase, collapse hyphens/underscores to spaces,
|
||||
* collapse multiple spaces, trim. Ensures "machine-learning", "machine_learning",
|
||||
|
||||
@@ -915,7 +915,7 @@ export class Neo4jMemoryClient {
|
||||
const agentFilter = agentId ? "AND m.agentId = $agentId" : "";
|
||||
const result = await session.run(
|
||||
`MATCH (m:Memory)
|
||||
WHERE m.extractionStatus = 'pending' ${agentFilter}
|
||||
WHERE m.extractionStatus IN ['pending', 'skipped'] ${agentFilter}
|
||||
RETURN m.id AS id, m.text AS text, m.agentId AS agentId,
|
||||
coalesce(m.extractionRetries, 0) AS extractionRetries
|
||||
ORDER BY m.createdAt ASC
|
||||
@@ -967,6 +967,36 @@ export class Neo4jMemoryClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetch memories whose extraction status is 'complete' but which have no
 * TAGGED relationship to a Tag node, ordered by creation time (oldest first).
 * The retroactive tagging phase uses this to find memories that need tags.
 *
 * @param limit - Maximum number of memories to return (default 50).
 * @param agentId - When provided, only memories of this agent are returned.
 * @returns The id and text of each matching memory.
 */
async listUntaggedMemories(
  limit: number = 50,
  agentId?: string,
): Promise<Array<{ id: string; text: string }>> {
  await this.ensureInitialized();

  // Build the optional agent restriction and its parameter together.
  const queryParams: Record<string, unknown> = { limit: neo4j.int(limit) };
  let agentFilter = "";
  if (agentId) {
    agentFilter = "AND m.agentId = $agentId";
    queryParams.agentId = agentId;
  }

  const session = this.driver!.session();
  try {
    const queryResult = await session.run(
      `MATCH (m:Memory)
       WHERE m.extractionStatus = 'complete' ${agentFilter}
       AND NOT EXISTS { MATCH (m)-[:TAGGED]->(:Tag) }
       RETURN m.id AS id, m.text AS text
       ORDER BY m.createdAt ASC
       LIMIT $limit`,
      queryParams,
    );

    const memories: Array<{ id: string; text: string }> = [];
    for (const record of queryResult.records) {
      memories.push({
        id: record.get("id") as string,
        text: record.get("text") as string,
      });
    }
    return memories;
  } finally {
    // Always release the session, including when the query throws.
    await session.close();
  }
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Sleep Cycle: Deduplication
|
||||
// --------------------------------------------------------------------------
|
||||
|
||||
@@ -23,7 +23,12 @@ import type { ExtractionConfig } from "./config.js";
|
||||
import type { Embeddings } from "./embeddings.js";
|
||||
import type { Neo4jMemoryClient } from "./neo4j-client.js";
|
||||
import type { Logger } from "./schema.js";
|
||||
import { isSemanticDuplicate, resolveConflict, runBackgroundExtraction } from "./extractor.js";
|
||||
import {
|
||||
extractTagsOnly,
|
||||
isSemanticDuplicate,
|
||||
resolveConflict,
|
||||
runBackgroundExtraction,
|
||||
} from "./extractor.js";
|
||||
import { makePairKey } from "./schema.js";
|
||||
import { reviewAndArchiveStaleTasks, type StaleTaskResult } from "./task-ledger.js";
|
||||
|
||||
@@ -59,6 +64,12 @@ export type SleepCycleResult = {
|
||||
succeeded: number;
|
||||
failed: number;
|
||||
};
|
||||
// Phase 2b: Retroactive Tagging
|
||||
retroactiveTagging: {
|
||||
total: number;
|
||||
tagged: number;
|
||||
failed: number;
|
||||
};
|
||||
// Phase 3: Decay & Pruning
|
||||
decay: {
|
||||
memoriesPruned: number;
|
||||
@@ -105,6 +116,10 @@ export type SleepCycleOptions = {
|
||||
extractionBatchSize?: number; // Memories per batch (default: 50)
|
||||
extractionDelayMs?: number; // Delay between batches (default: 1000)
|
||||
|
||||
// Phase 2b: Retroactive Tagging
|
||||
skipRetroactiveTagging?: boolean; // Skip retroactive tagging (default: false)
|
||||
retroactiveTagBatchSize?: number; // Memories per batch (default: 50)
|
||||
|
||||
// Phase 4: Cleanup
|
||||
singleUseTagMinAgeDays?: number; // Min age before single-use tag pruning (default: 14)
|
||||
|
||||
@@ -127,6 +142,7 @@ export type SleepCycleOptions = {
|
||||
| "entityDedup"
|
||||
| "decay"
|
||||
| "extraction"
|
||||
| "retroactiveTagging"
|
||||
| "cleanup"
|
||||
| "noiseCleanup"
|
||||
| "credentialScan"
|
||||
@@ -225,6 +241,8 @@ export async function runSleepCycle(
|
||||
decayCurves,
|
||||
extractionBatchSize = 50,
|
||||
extractionDelayMs = 1000,
|
||||
skipRetroactiveTagging = false,
|
||||
retroactiveTagBatchSize = 50,
|
||||
singleUseTagMinAgeDays = 14,
|
||||
workspaceDir,
|
||||
staleTaskMaxAgeMs,
|
||||
@@ -239,6 +257,7 @@ export async function runSleepCycle(
|
||||
entityDedup: { pairsFound: 0, merged: 0 },
|
||||
decay: { memoriesPruned: 0 },
|
||||
extraction: { total: 0, processed: 0, succeeded: 0, failed: 0 },
|
||||
retroactiveTagging: { total: 0, tagged: 0, failed: 0 },
|
||||
cleanup: { entitiesRemoved: 0, tagsRemoved: 0, singleUseTagsRemoved: 0 },
|
||||
credentialScan: { memoriesScanned: 0, credentialsFound: 0, memoriesRemoved: 0 },
|
||||
taskLedger: { staleCount: 0, archivedCount: 0, archivedIds: [] },
|
||||
@@ -541,7 +560,7 @@ export async function runSleepCycle(
|
||||
try {
|
||||
// Get initial count
|
||||
const counts = await db.countByExtractionStatus(agentId);
|
||||
result.extraction.total = counts.pending;
|
||||
result.extraction.total = counts.pending + counts.skipped;
|
||||
|
||||
if (result.extraction.total > 0) {
|
||||
let hasMore = true;
|
||||
@@ -616,6 +635,94 @@ export async function runSleepCycle(
|
||||
logger.info("memory-neo4j: [sleep] Phase 2 skipped — extraction not enabled");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------
// Phase 2b: Retroactive Tagging
// Find memories with completed extraction but no tags, and generate tags
// using a lightweight LLM prompt. This fixes the historical gap where
// the extraction prompt treated tags as optional.
//
// A memory whose tagging attempt fails stays untagged, so the untagged query
// (oldest first, LIMIT n) keeps returning it. Each memory is therefore
// attempted at most once per cycle, and the fetch window is widened by the
// number of unresolved attempts so the loop still reaches fresh memories and
// terminates instead of re-requesting the same failing batch until aborted.
// --------------------------------------------------------------------------
if (!abortSignal?.aborted && config.enabled && !skipRetroactiveTagging) {
  onPhaseStart?.("retroactiveTagging");
  logger.info("memory-neo4j: [sleep] Phase 2b: Retroactive Tagging");

  try {
    // Ids already attempted in this cycle (successfully or not).
    const attemptedIds = new Set<string>();
    // Attempts that did not end in a successful tag write; those memories are
    // still returned by listUntaggedMemories and have to be skipped over.
    let unresolvedCount = 0;
    // A non-positive concurrency would stall the chunk loop below.
    const chunkSize = Math.max(1, llmConcurrency);

    let hasMore = true;
    while (hasMore && !abortSignal?.aborted) {
      const fetchLimit = retroactiveTagBatchSize + unresolvedCount;
      const fetched = await db.listUntaggedMemories(fetchLimit, agentId);
      const fresh = fetched.filter((memory) => !attemptedIds.has(memory.id));
      const untagged = fresh.slice(0, retroactiveTagBatchSize);

      if (untagged.length === 0) {
        break;
      }

      // Process in parallel chunks of llmConcurrency
      for (let i = 0; i < untagged.length && !abortSignal?.aborted; i += chunkSize) {
        const chunk = untagged.slice(i, i + chunkSize);

        // Count memories as they are attempted so that total always equals
        // tagged + failed, across batches and when the cycle is aborted.
        result.retroactiveTagging.total += chunk.length;
        for (const memory of chunk) {
          attemptedIds.add(memory.id);
        }

        const outcomes = await Promise.allSettled(
          chunk.map((memory) => extractTagsOnly(memory.text, config, abortSignal)),
        );

        for (let k = 0; k < outcomes.length; k++) {
          const outcome = outcomes[k];
          const memory = chunk[k];

          if (outcome.status === "fulfilled" && outcome.value && outcome.value.length > 0) {
            try {
              await db.batchEntityOperations(memory.id, [], [], outcome.value);
              result.retroactiveTagging.tagged++;
              onProgress?.(
                "retroactiveTagging",
                `Tagged "${memory.text.slice(0, 50)}..." with ${outcome.value.length} tags`,
              );
            } catch (err) {
              result.retroactiveTagging.failed++;
              unresolvedCount++;
              logger.warn(
                `memory-neo4j: [sleep] retroactive tagging write failed for ${memory.id.slice(0, 8)}: ${String(err)}`,
              );
            }
          } else {
            // Rejected, null, or empty tag list: nothing was written.
            result.retroactiveTagging.failed++;
            unresolvedCount++;
          }
        }
      }

      // A page shorter than the limit means the query returned every untagged
      // memory, so nothing is left beyond what was just processed. This
      // replaces a separate "is there one more?" probe query.
      hasMore = fetched.length >= fetchLimit || fresh.length > untagged.length;

      // Delay between batches (abort-aware)
      if (hasMore && !abortSignal?.aborted) {
        await new Promise<void>((resolve) => {
          const finish = (): void => {
            clearTimeout(timer);
            // Detach the listener so one does not accumulate per batch.
            abortSignal?.removeEventListener("abort", finish);
            resolve();
          };
          const timer = setTimeout(finish, extractionDelayMs);
          abortSignal?.addEventListener("abort", finish, { once: true });
        });
      }
    }

    logger.info(
      `memory-neo4j: [sleep] Phase 2b complete — ${result.retroactiveTagging.tagged} tagged, ${result.retroactiveTagging.failed} failed`,
    );
  } catch (err) {
    logger.warn(`memory-neo4j: [sleep] Phase 2b error: ${String(err)}`);
  }
} else if (!config.enabled) {
  logger.info("memory-neo4j: [sleep] Phase 2b skipped — extraction not enabled");
} else if (skipRetroactiveTagging) {
  logger.info("memory-neo4j: [sleep] Phase 2b skipped — retroactive tagging disabled");
}
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Phase 3: Decay & Pruning (after extraction so freshly extracted memories
|
||||
// aren't pruned before they build entity connections)
|
||||
|
||||
Reference in New Issue
Block a user