fix(memory-wiki): support Unicode characters in slugifyWikiSegment (#64742)

* fix(memory-wiki): support Unicode characters in slugifyWikiSegment

Replace ASCII-only regex with Unicode-aware regex to preserve CJK,
Cyrillic, Arabic, and other non-ASCII characters in wiki slugs.

Fixes #64620

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* test(memory-wiki): cover Unicode slug regressions

* fix(memory-wiki): preserve combining marks in slugs

* fix(memory-wiki): cap composed source filenames

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
zhouhe-xydt
2026-04-13 00:54:41 +08:00
committed by GitHub
parent 68a64a14d9
commit 879bb5dd91
9 changed files with 234 additions and 9 deletions

View File

@@ -36,6 +36,7 @@ Docs: https://docs.openclaw.ai
- Dreaming/cron: wake managed dreaming jobs immediately instead of waiting for the next heartbeat, so scheduled dreaming runs start when the cron fires. (#65053) Thanks @l0cka and @vincentkoc.
- QA/packaging: stop packaged QA helpers from crashing when optional scenario execution config is unavailable, so npm distributions can skip the repo-only scenario pack without breaking completion-cache and startup paths. (#65118) Thanks @EdderTalmor and @vincentkoc.
- Media/audio transcription: surface the real provider failure when every audio transcription attempt fails, so status output and the CLI stop collapsing those errors into generic skips. (#65096) Thanks @l0cka and @vincentkoc.
- Memory/wiki: preserve Unicode letters, digits, and combining marks in wiki slugs and contradiction clustering, and cap Unicode filename segments to safe byte lengths so non-ASCII titles stop collapsing or overflowing path limits. (#64742) Thanks @zhouhe-xydt and @vincentkoc.
## 2026.4.11

View File

@@ -299,4 +299,47 @@ describe("syncMemoryWikiBridgeSources", () => {
code: "ENOENT",
});
});
// Regression test: a bridge import whose workspace name and artifact path are
// long runs of CJK characters must still yield a page filename that fits in a
// single filesystem path component (asserted below as <= 255 UTF-8 bytes).
it("caps composed bridge source filenames to the filesystem component limit", async () => {
// Workspace name: 50 CJK characters followed by "-workspace".
// NOTE(review): createBridgeWorkspace is defined outside this view; presumably
// it creates a workspace directory with the given name and returns its path.
const workspaceDir = await createBridgeWorkspace(`${"漢".repeat(50)}-workspace`);
const { rootDir: vaultDir, config } = await createVault({
rootDir: nextCaseRoot("long-bridge-vault"),
config: {
vaultMode: "bridge",
bridge: {
enabled: true,
readMemoryArtifacts: true,
indexDailyNotes: true,
},
},
});
// Artifact path: a 50-character CJK directory containing a 50-character CJK
// markdown file, so both path segments are long on their own.
const relativePath = `${"語".repeat(50)}/${"録".repeat(50)}.md`;
const absolutePath = path.join(workspaceDir, relativePath);
// The artifact must exist on disk before it is registered and synced.
await fs.mkdir(path.dirname(absolutePath), { recursive: true });
await fs.writeFile(absolutePath, "# Deep Unicode Note\n", "utf8");
registerBridgeArtifacts([
{
kind: "daily-note",
workspaceDir,
relativePath,
absolutePath,
agentIds: ["main"],
contentType: "markdown",
},
]);
const appConfig: OpenClawConfig = {
agents: {
list: [{ id: "main", default: true, workspace: workspaceDir }],
},
};
const result = await syncMemoryWikiBridgeSources({ config, appConfig });
// Falls back to "" when no page path was reported; the importedCount
// assertion below is what guards against an empty result.
const pagePath = result.pagePaths[0] ?? "";
expect(result.importedCount).toBe(1);
// Only the final path component (the filename) is measured, in UTF-8 bytes.
expect(Buffer.byteLength(path.basename(pagePath))).toBeLessThanOrEqual(255);
// The page must actually have been written inside the vault.
await expect(fs.stat(path.join(vaultDir, pagePath))).resolves.toBeTruthy();
});
});

View File

@@ -8,7 +8,12 @@ import {
import type { OpenClawConfig } from "../api.js";
import type { ResolvedMemoryWikiConfig } from "./config.js";
import { appendMemoryWikiLog } from "./log.js";
import { renderMarkdownFence, renderWikiMarkdown, slugifyWikiSegment } from "./markdown.js";
import {
createWikiPageFilename,
renderMarkdownFence,
renderWikiMarkdown,
slugifyWikiSegment,
} from "./markdown.js";
import { writeImportedSourcePage } from "./source-page-shared.js";
import { resolveArtifactKey } from "./source-path-shared.js";
import {
@@ -110,11 +115,10 @@ function resolveBridgePagePath(params: { workspaceDir: string; relativePath: str
const artifactHash = createHash("sha1").update(params.relativePath).digest("hex");
const workspaceSlug = `${workspaceBaseSlug}-${workspaceHash.slice(0, 8)}`;
const artifactSlug = `${artifactBaseSlug}-${artifactHash.slice(0, 8)}`;
const fileName = createWikiPageFilename(`bridge-${workspaceSlug}-${artifactSlug}`);
return {
pageId: `source.bridge.${workspaceSlug}.${artifactSlug}`,
pagePath: path
.join("sources", `bridge-${workspaceSlug}-${artifactSlug}.md`)
.replace(/\\/g, "/"),
pagePath: path.join("sources", fileName).replace(/\\/g, "/"),
workspaceSlug,
artifactSlug,
};

View File

@@ -0,0 +1,60 @@
import { describe, expect, it } from "vitest";
import { buildPageContradictionClusters } from "./claim-health.js";
import type { WikiPageSummary } from "./markdown.js";
/**
 * Builds a minimal "entity" page summary for contradiction-clustering tests.
 *
 * Only the path, title, and contradiction notes vary between fixtures; every
 * other list field is left empty and the absolute path is rooted at `/tmp`.
 */
function createPage(params: {
  relativePath: string;
  title: string;
  contradictions: string[];
}): WikiPageSummary {
  const { relativePath, title, contradictions } = params;
  const page: WikiPageSummary = {
    absolutePath: `/tmp/${relativePath}`,
    relativePath,
    kind: "entity",
    title,
    sourceIds: [],
    linkTargets: [],
    claims: [],
    contradictions,
    questions: [],
  };
  return page;
}
// Covers Unicode handling in contradiction clustering: punctuation-only
// differences must merge notes, while differing combining marks must not.
describe("buildPageContradictionClusters", () => {
  it("clusters Unicode contradiction notes that differ only by punctuation", () => {
    // The two notes differ only by a full-width colon versus a space.
    const clusters = buildPageContradictionClusters([
      createPage({
        relativePath: "entities/alpha.md",
        title: "Alpha",
        contradictions: ["模型冲突:版本 A"],
      }),
      createPage({
        relativePath: "entities/beta.md",
        title: "Beta",
        contradictions: ["模型冲突 版本 A"],
      }),
    ]);
    expect(clusters).toHaveLength(1);
    expect(clusters[0]?.entries).toHaveLength(2);
  });

  it("keeps combining-mark contradiction notes in separate clusters", () => {
    // The two notes differ only in a combining vowel sign.
    const clusters = buildPageContradictionClusters([
      createPage({
        relativePath: "entities/alpha.md",
        title: "Alpha",
        contradictions: ["किताब"],
      }),
      createPage({
        relativePath: "entities/beta.md",
        title: "Beta",
        contradictions: ["कीताब"],
      }),
    ]);
    expect(clusters).toHaveLength(2);
    expect(clusters.map((cluster) => cluster.key).toSorted()).toEqual(["किताब", "कीताब"]);
    // Each note must land alone in its own cluster. Checking the entry count
    // explicitly avoids a vacuous assertion: an `entries` array is truthy even
    // when it is empty or holds both notes.
    for (const cluster of clusters) {
      expect(cluster.entries).toHaveLength(1);
    }
  });
});

View File

@@ -66,7 +66,7 @@ function normalizeClaimTextKey(text: string): string {
/**
 * Builds a comparison key for free-form text so notes that differ only by
 * punctuation or spacing compare equal.
 *
 * Every run of characters that is not a Unicode letter (`\p{L}`), number
 * (`\p{N}`), or combining mark (`\p{M}`) becomes a single space. Combining
 * marks are kept so scripts that encode vowels as marks (e.g. Devanagari) do
 * not collapse distinct words into the same key.
 *
 * No ASCII-only filter may run before the Unicode-aware one: it would replace
 * all non-ASCII text with spaces and make unrelated non-Latin notes identical.
 *
 * NOTE(review): punctuation at the very start or end leaves an edge space in
 * the key; confirm whether callers trim before comparing.
 */
function normalizeTextKey(text: string): string {
  return normalizeLowercaseStringOrEmpty(text)
    .replace(/[^\p{L}\p{N}\p{M}]+/gu, " ")
    .replace(/\s+/g, " ");
}

View File

@@ -0,0 +1,42 @@
import { createHash } from "node:crypto";
import { describe, expect, it } from "vitest";
import { createWikiPageFilename, slugifyWikiSegment } from "./markdown.js";
// Slug and filename regressions: Unicode preservation, unchanged ASCII
// behavior, combining-mark retention, and UTF-8 byte caps.
describe("slugifyWikiSegment", () => {
  /** Asserts that every `[input, expected]` pair slugifies as stated, in order. */
  const expectSlugs = (pairs: ReadonlyArray<readonly [string, string]>): void => {
    for (const [input, expected] of pairs) {
      expect(slugifyWikiSegment(input)).toBe(expected);
    }
  };

  it("preserves Unicode letters and numbers in wiki slugs", () => {
    expectSlugs([
      ["大语言模型概述", "大语言模型概述"],
      ["LLM 架构分析", "llm-架构分析"],
      ["Circuit Breaker 自動恢復", "circuit-breaker-自動恢復"],
    ]);
  });

  it("keeps ASCII behavior unchanged", () => {
    expectSlugs([
      ["hello world", "hello-world"],
      ["", "page"],
    ]);
  });

  it("retains combining marks so distinct titles do not collapse", () => {
    // Each title must round-trip unchanged; they differ only in combining marks.
    expectSlugs([
      ["किताब", "किताब"],
      ["कुतुब", "कुतुब"],
      ["कीताब", "कीताब"],
    ]);
  });

  it("caps long Unicode slugs to a safe filename byte length", () => {
    const longTitle = "漢".repeat(90);
    const capped = slugifyWikiSegment(longTitle);
    // The title is already in slug form, so hashing it matches the hash of the slug.
    const expectedSuffix = `-${createHash("sha1").update(longTitle).digest("hex").slice(0, 12)}`;
    expect(capped.endsWith(expectedSuffix)).toBe(true);
    expect(Buffer.byteLength(capped)).toBeLessThanOrEqual(240);
    // A second call must produce the same capped slug.
    expect(slugifyWikiSegment(longTitle)).toBe(capped);
  });

  it("caps composed wiki page filenames to a safe path-component length", () => {
    const longStem = ["bridge", "漢".repeat(45), "語".repeat(45)].join("-");
    const cappedName = createWikiPageFilename(longStem);
    expect(cappedName.endsWith(".md")).toBe(true);
    expect(Buffer.byteLength(cappedName)).toBeLessThanOrEqual(255);
    // A second call must produce the same filename.
    expect(createWikiPageFilename(longStem)).toBe(cappedName);
  });
});

View File

@@ -1,3 +1,4 @@
import { createHash } from "node:crypto";
import path from "node:path";
import {
normalizeLowercaseStringOrEmpty,
@@ -65,13 +66,54 @@ const RELATED_BLOCK_PATTERN = new RegExp(
`${WIKI_RELATED_START_MARKER}[\\s\\S]*?${WIKI_RELATED_END_MARKER}`,
"g",
);
// Upper bound, in UTF-8 bytes (not characters), for a slug returned by
// slugifyWikiSegment.
const MAX_WIKI_SEGMENT_BYTES = 240;
// Upper bound, in UTF-8 bytes, for a whole page filename (stem plus extension)
// built by createWikiPageFilename.
const MAX_WIKI_FILENAME_COMPONENT_BYTES = 255;
// Number of leading SHA-1 hex characters used as the suffix of a capped value.
// Hex digits are ASCII, so this is also the suffix's byte length in the output.
const WIKI_SEGMENT_HASH_BYTES = 12;
/**
 * Returns the longest prefix of `value` whose UTF-8 encoding fits in
 * `maxBytes`, cutting only between code points so a surrogate pair is never
 * split.
 *
 * @param value - Text to shorten.
 * @param maxBytes - Maximum UTF-8 byte length of the result; a zero or
 *   negative budget yields an empty string.
 * @returns `value` itself when it already fits, otherwise the fitting prefix.
 */
function truncateUtf8CodePointSafe(value: string, maxBytes: number): string {
  let usedBytes = 0;
  let endIndex = 0;
  // String iteration yields whole code points, so astral characters arrive as
  // one two-unit string instead of two separate halves.
  for (const codePoint of value) {
    usedBytes += Buffer.byteLength(codePoint);
    if (usedBytes > maxBytes) {
      return value.slice(0, endIndex);
    }
    endIndex += codePoint.length;
  }
  return value;
}
/**
 * Caps `raw` to `maxBytes` UTF-8 bytes, keeping capped values distinct by
 * appending a short SHA-1 hex suffix derived from the full, uncapped value.
 *
 * - A value that already fits is returned unchanged.
 * - Otherwise the value is truncated on a code-point boundary, trailing
 *   hyphens are stripped (so the joiner never doubles up), and `-<hash>` is
 *   appended.
 * - If nothing of `raw` survives truncation, `fallback` takes its place.
 * - If the budget is too small even for `fallback-<hash>`, the fallback is
 *   shortened, and as a last resort only a prefix of the hash is returned, so
 *   the result still respects `maxBytes`. At least one hash character is
 *   always returned so the result is never empty.
 *
 * @param raw - Value to cap; also the input to the hash.
 * @param maxBytes - Maximum UTF-8 byte length of the result.
 * @param fallback - Prefix used when truncation leaves nothing of `raw`.
 * @returns A deterministic value for the same inputs.
 */
function capWikiValueWithHash(raw: string, maxBytes: number, fallback: string): string {
  if (Buffer.byteLength(raw) <= maxBytes) {
    return raw;
  }
  const suffix = createHash("sha1").update(raw).digest("hex").slice(0, WIKI_SEGMENT_HASH_BYTES);
  // Bytes left for the readable prefix once "-<hash>" is reserved.
  const prefixBudget = maxBytes - Buffer.byteLength(`-${suffix}`);
  const truncated = truncateUtf8CodePointSafe(raw, prefixBudget).replace(/-+$/g, "");
  const capped = `${truncated || fallback}-${suffix}`;
  // A non-empty truncated prefix fits by construction; only the fallback path
  // can overshoot, and only when the budget is very small.
  if (truncated || Buffer.byteLength(capped) <= maxBytes) {
    return capped;
  }
  const shortenedFallback = truncateUtf8CodePointSafe(fallback, prefixBudget);
  if (shortenedFallback) {
    return `${shortenedFallback}-${suffix}`;
  }
  // No room for any readable prefix: keep as much of the hash as fits.
  return truncateUtf8CodePointSafe(suffix, Math.max(1, maxBytes));
}
/**
 * Converts arbitrary text into a wiki slug.
 *
 * The input is lowercased via `normalizeLowercaseStringOrEmpty`. Every run of
 * characters that is not a Unicode letter (`\p{L}`), number (`\p{N}`), or
 * combining mark (`\p{M}`) becomes a single hyphen, and leading or trailing
 * hyphens are removed. Non-ASCII scripts (CJK, Cyrillic, Arabic, Devanagari,
 * ...) are therefore preserved rather than stripped.
 *
 * No ASCII-only filter may run before the Unicode-aware one, and there must be
 * no early return ahead of the byte cap; either would defeat the Unicode
 * support or the length limit.
 *
 * @param raw - Title or path segment to slugify.
 * @returns `"page"` when nothing usable remains; otherwise the slug, capped to
 *   `MAX_WIKI_SEGMENT_BYTES` UTF-8 bytes with a hash suffix when it is too long.
 */
export function slugifyWikiSegment(raw: string): string {
  const slug = normalizeLowercaseStringOrEmpty(raw)
    // Combining marks are kept so titles that differ only by a vowel sign stay distinct.
    .replace(/[^\p{L}\p{N}\p{M}]+/gu, "-")
    .replace(/-+/g, "-")
    .replace(/^-+|-+$/g, "");
  if (!slug) {
    return "page";
  }
  return capWikiValueWithHash(slug, MAX_WIKI_SEGMENT_BYTES, "page");
}
/**
 * Builds a page filename from `stem` and `extension` that fits within
 * `MAX_WIKI_FILENAME_COMPONENT_BYTES` UTF-8 bytes for ordinary extensions.
 *
 * @param stem - Filename without extension; capped with a hash suffix when it
 *   would push the filename over the limit.
 * @param extension - File extension, with or without the leading dot
 *   (defaults to `".md"`).
 * @returns The capped stem followed by the dot-prefixed extension.
 */
export function createWikiPageFilename(stem: string, extension = ".md"): string {
  let suffix = extension;
  if (!suffix.startsWith(".")) {
    suffix = `.${suffix}`;
  }
  // Reserve the extension's bytes, but always leave the stem at least one byte.
  const stemBudget = MAX_WIKI_FILENAME_COMPONENT_BYTES - Buffer.byteLength(suffix);
  const cappedStem = capWikiValueWithHash(stem, Math.max(1, stemBudget), "page");
  return cappedStem + suffix;
}
export function parseWikiMarkdown(content: string): ParsedWikiMarkdown {

View File

@@ -103,4 +103,30 @@ describe("syncMemoryWikiUnsafeLocalSources", () => {
code: "ENOENT",
});
});
// Regression test: an unsafe-local import whose configured directory, nested
// directory, and file name are all long runs of CJK characters must still
// yield a page filename that fits in a single filesystem path component
// (asserted below as <= 255 UTF-8 bytes).
it("caps composed unsafe-local filenames to the filesystem component limit", async () => {
// NOTE(review): createPrivateDir is defined outside this view; presumably it
// creates a directory with the given name and returns its path.
const privateDir = await createPrivateDir(`${"漢".repeat(50)}-private`);
const nestedDir = path.join(privateDir, `${"語".repeat(50)}-nested`);
const secretPath = path.join(nestedDir, `${"録".repeat(50)}.md`);
// The source file must exist on disk before the sync runs.
await fs.mkdir(nestedDir, { recursive: true });
await fs.writeFile(secretPath, "# very private\n", "utf8");
const { rootDir: vaultDir, config } = await createVault({
rootDir: nextCaseRoot("long-unsafe-vault"),
config: {
vaultMode: "unsafe-local",
unsafeLocal: {
allowPrivateMemoryCoreAccess: true,
paths: [privateDir],
},
},
});
const result = await syncMemoryWikiUnsafeLocalSources(config);
// Falls back to "" when no page path was reported; the importedCount
// assertion below is what guards against an empty result.
const pagePath = result.pagePaths[0] ?? "";
expect(result.importedCount).toBe(1);
// Only the final path component (the filename) is measured, in UTF-8 bytes.
expect(Buffer.byteLength(path.basename(pagePath))).toBeLessThanOrEqual(255);
// The page must actually have been written inside the vault.
await expect(fs.stat(path.join(vaultDir, pagePath))).resolves.toBeTruthy();
});
});

View File

@@ -5,7 +5,12 @@ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtim
import type { BridgeMemoryWikiResult } from "./bridge.js";
import type { ResolvedMemoryWikiConfig } from "./config.js";
import { appendMemoryWikiLog } from "./log.js";
import { renderMarkdownFence, renderWikiMarkdown, slugifyWikiSegment } from "./markdown.js";
import {
createWikiPageFilename,
renderMarkdownFence,
renderWikiMarkdown,
slugifyWikiSegment,
} from "./markdown.js";
import { writeImportedSourcePage } from "./source-page-shared.js";
import { resolveArtifactKey } from "./source-path-shared.js";
import {
@@ -113,7 +118,9 @@ function resolveUnsafeLocalPagePath(params: { configuredPath: string; absolutePa
const pageSlug = `${configuredBaseSlug}-${configuredHash}-${artifactBaseSlug}-${artifactHash}`;
return {
pageId: `source.unsafe-local.${pageSlug}`,
pagePath: path.join("sources", `unsafe-local-${pageSlug}.md`).replace(/\\/g, "/"),
pagePath: path
.join("sources", createWikiPageFilename(`unsafe-local-${pageSlug}`))
.replace(/\\/g, "/"),
};
}