Files
openclaw/extensions/memory-core/src/concept-vocabulary.ts
Vincent Koc 0609bf8581 feat(memory): harden dreaming and multilingual memory promotion (#60697)
* feat(memory): add recall audit and doctor repair flow

* refactor(memory): rename symbolic scoring and harden dreaming

* feat(memory): add multilingual concept vocabulary

* docs(changelog): note dreaming memory follow-up

* docs(changelog): shorten dreaming follow-up entry

* fix(memory): address review follow-ups

* chore(skills): tighten security triage trust model

* Update CHANGELOG.md
2026-04-04 15:48:13 +09:00

472 lines
8.5 KiB
TypeScript

import path from "node:path";
export const MAX_CONCEPT_TAGS = 8;
export type ConceptTagScriptFamily = "latin" | "cjk" | "mixed" | "other";
export type ConceptTagScriptCoverage = {
latinEntryCount: number;
cjkEntryCount: number;
mixedEntryCount: number;
otherEntryCount: number;
};
const LANGUAGE_STOP_WORDS = {
shared: [
"about",
"after",
"agent",
"again",
"also",
"because",
"before",
"being",
"between",
"build",
"called",
"could",
"daily",
"default",
"deploy",
"during",
"every",
"file",
"files",
"from",
"have",
"into",
"just",
"line",
"lines",
"long",
"main",
"make",
"memory",
"month",
"more",
"most",
"move",
"much",
"next",
"note",
"notes",
"over",
"part",
"past",
"port",
"same",
"score",
"search",
"session",
"sessions",
"short",
"should",
"since",
"some",
"than",
"that",
"their",
"there",
"these",
"they",
"this",
"through",
"today",
"using",
"with",
"work",
"workspace",
"year",
],
english: ["and", "are", "for", "into", "its", "our", "then", "were"],
spanish: [
"al",
"con",
"como",
"de",
"del",
"el",
"en",
"es",
"la",
"las",
"los",
"para",
"por",
"que",
"se",
"sin",
"su",
"sus",
"una",
"uno",
"unos",
"unas",
"y",
],
french: [
"au",
"aux",
"avec",
"dans",
"de",
"des",
"du",
"en",
"est",
"et",
"la",
"le",
"les",
"ou",
"pour",
"que",
"qui",
"sans",
"ses",
"son",
"sur",
"une",
"un",
],
german: [
"auf",
"aus",
"bei",
"das",
"dem",
"den",
"der",
"des",
"die",
"ein",
"eine",
"einem",
"einen",
"einer",
"für",
"im",
"in",
"mit",
"nach",
"oder",
"ohne",
"über",
"und",
"von",
"zu",
"zum",
"zur",
],
cjk: [
"が",
"から",
"する",
"して",
"した",
"で",
"と",
"に",
"の",
"は",
"へ",
"まで",
"も",
"や",
"を",
"与",
"为",
"了",
"及",
"和",
"在",
"将",
"或",
"把",
"是",
"用",
"的",
"과",
"는",
"도",
"로",
"를",
"에",
"에서",
"와",
"은",
"으로",
"을",
"이",
"하다",
"한",
"할",
"해",
"했다",
"했다",
],
pathNoise: [
"cjs",
"cpp",
"cts",
"jsx",
"json",
"md",
"mjs",
"mts",
"text",
"toml",
"ts",
"tsx",
"txt",
"yaml",
"yml",
],
} as const;
const CONCEPT_STOP_WORDS = new Set(
Object.values(LANGUAGE_STOP_WORDS)
.flatMap((words) => words)
.map((word) => word.toLowerCase()),
);
const PROTECTED_GLOSSARY = [
"backup",
"backups",
"embedding",
"embeddings",
"failover",
"gateway",
"glacier",
"gpt",
"kv",
"network",
"openai",
"qmd",
"router",
"s3",
"vlan",
"sauvegarde",
"routeur",
"passerelle",
"konfiguration",
"sicherung",
"überwachung",
"configuración",
"respaldo",
"enrutador",
"puerta-de-enlace",
"バックアップ",
"フェイルオーバー",
"ルーター",
"ネットワーク",
"ゲートウェイ",
"障害対応",
"路由器",
"备份",
"故障转移",
"网络",
"网关",
"라우터",
"백업",
"페일오버",
"네트워크",
"게이트웨이",
"장애대응",
].map((word) => word.normalize("NFKC").toLowerCase());
const COMPOUND_TOKEN_RE = /[\p{L}\p{N}]+(?:[._/-][\p{L}\p{N}]+)+/gu;
const LETTER_OR_NUMBER_RE = /[\p{L}\p{N}]/u;
const LATIN_RE = /\p{Script=Latin}/u;
const HAN_RE = /\p{Script=Han}/u;
const HIRAGANA_RE = /\p{Script=Hiragana}/u;
const KATAKANA_RE = /\p{Script=Katakana}/u;
const HANGUL_RE = /\p{Script=Hangul}/u;
const DEFAULT_WORD_SEGMENTER =
typeof Intl.Segmenter === "function" ? new Intl.Segmenter("und", { granularity: "word" }) : null;
function containsLetterOrNumber(value: string): boolean {
return LETTER_OR_NUMBER_RE.test(value);
}
export function classifyConceptTagScript(tag: string): ConceptTagScriptFamily {
const normalized = tag.normalize("NFKC");
const hasLatin = LATIN_RE.test(normalized);
const hasCjk =
HAN_RE.test(normalized) ||
HIRAGANA_RE.test(normalized) ||
KATAKANA_RE.test(normalized) ||
HANGUL_RE.test(normalized);
if (hasLatin && hasCjk) {
return "mixed";
}
if (hasCjk) {
return "cjk";
}
if (hasLatin) {
return "latin";
}
return "other";
}
function minimumTokenLengthForScript(script: ConceptTagScriptFamily): number {
if (script === "cjk") {
return 2;
}
return 3;
}
function isKanaOnlyToken(value: string): boolean {
return (
!HAN_RE.test(value) &&
!HANGUL_RE.test(value) &&
(HIRAGANA_RE.test(value) || KATAKANA_RE.test(value))
);
}
function normalizeConceptToken(rawToken: string): string | null {
const normalized = rawToken
.normalize("NFKC")
.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, "")
.replaceAll("_", "-")
.toLowerCase();
if (!normalized || !containsLetterOrNumber(normalized) || normalized.length > 32) {
return null;
}
if (
/^\d+$/.test(normalized) ||
/^\d{4}-\d{2}-\d{2}$/u.test(normalized) ||
/^\d{4}-\d{2}-\d{2}\.[\p{L}\p{N}]+$/u.test(normalized)
) {
return null;
}
const script = classifyConceptTagScript(normalized);
if (normalized.length < minimumTokenLengthForScript(script)) {
return null;
}
if (isKanaOnlyToken(normalized) && normalized.length < 3) {
return null;
}
if (CONCEPT_STOP_WORDS.has(normalized)) {
return null;
}
return normalized;
}
function collectGlossaryMatches(source: string): string[] {
const normalizedSource = source.normalize("NFKC").toLowerCase();
const matches: string[] = [];
for (const entry of PROTECTED_GLOSSARY) {
if (!normalizedSource.includes(entry)) {
continue;
}
matches.push(entry);
}
return matches;
}
function collectCompoundTokens(source: string): string[] {
return source.match(COMPOUND_TOKEN_RE) ?? [];
}
function collectSegmentTokens(source: string): string[] {
if (DEFAULT_WORD_SEGMENTER) {
return Array.from(DEFAULT_WORD_SEGMENTER.segment(source), (part) =>
part.isWordLike ? part.segment : "",
).filter(Boolean);
}
return source.split(/[^\p{L}\p{N}]+/u).filter(Boolean);
}
function pushNormalizedTag(tags: string[], rawToken: string, limit: number): void {
const normalized = normalizeConceptToken(rawToken);
if (!normalized || tags.includes(normalized)) {
return;
}
tags.push(normalized);
if (tags.length > limit) {
tags.splice(limit);
}
}
export function deriveConceptTags(params: {
path: string;
snippet: string;
limit?: number;
}): string[] {
const source = `${path.basename(params.path)} ${params.snippet}`;
const limit = Number.isFinite(params.limit)
? Math.max(0, Math.floor(params.limit as number))
: MAX_CONCEPT_TAGS;
if (limit === 0) {
return [];
}
const tags: string[] = [];
for (const rawToken of [
...collectGlossaryMatches(source),
...collectCompoundTokens(source),
...collectSegmentTokens(source),
]) {
pushNormalizedTag(tags, rawToken, limit);
if (tags.length >= limit) {
break;
}
}
return tags;
}
export function summarizeConceptTagScriptCoverage(
conceptTagsByEntry: string[][],
): ConceptTagScriptCoverage {
const coverage: ConceptTagScriptCoverage = {
latinEntryCount: 0,
cjkEntryCount: 0,
mixedEntryCount: 0,
otherEntryCount: 0,
};
for (const conceptTags of conceptTagsByEntry) {
let hasLatin = false;
let hasCjk = false;
let hasOther = false;
for (const tag of conceptTags) {
const family = classifyConceptTagScript(tag);
if (family === "mixed") {
hasLatin = true;
hasCjk = true;
continue;
}
if (family === "latin") {
hasLatin = true;
continue;
}
if (family === "cjk") {
hasCjk = true;
continue;
}
hasOther = true;
}
if (hasLatin && hasCjk) {
coverage.mixedEntryCount += 1;
} else if (hasCjk) {
coverage.cjkEntryCount += 1;
} else if (hasLatin) {
coverage.latinEntryCount += 1;
} else if (hasOther) {
coverage.otherEntryCount += 1;
}
}
return coverage;
}
export const __testing = {
normalizeConceptToken,
collectGlossaryMatches,
collectCompoundTokens,
collectSegmentTokens,
};