Memory: add Arabic query expansion stop words (#23717)

This commit is contained in:
Vincent Koc
2026-02-22 12:17:47 -05:00
committed by GitHub
parent 8c71bbe1e1
commit 9ae08ce205
3 changed files with 80 additions and 0 deletions

View File

@@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc.
- Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc.
- Memory/FTS: add Arabic stop-word filtering for query expansion in FTS-only search mode to reduce conversational filler in Arabic memory searches. (#23717) Thanks @vincentkoc.
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
### Breaking

View File

@@ -143,6 +143,22 @@ describe("extractKeywords", () => {
expect(keywords).not.toContain("onde");
});
it("extracts keywords from Arabic conversational query", () => {
  // Roughly: "Yesterday we discussed the deployment strategy".
  const extracted = extractKeywords("بالأمس ناقشنا استراتيجية النشر");

  // The content-bearing words must survive filtering.
  for (const contentWord of ["ناقشنا", "استراتيجية", "النشر"]) {
    expect(extracted).toContain(contentWord);
  }

  // The vague time reference ("yesterday") is a stop word and must be dropped.
  expect(extracted).not.toContain("بالأمس");
});
it("filters Arabic question stop words", () => {
  // Query made up solely of interrogatives: how / when / where / what.
  const extracted = extractKeywords("كيف متى أين ماذا");

  // None of the question words may be reported as a keyword.
  for (const questionWord of ["كيف", "متى", "أين", "ماذا"]) {
    expect(extracted).not.toContain(questionWord);
  }
});
it("handles empty query", () => {
expect(extractKeywords("")).toEqual([]);
expect(extractKeywords(" ")).toEqual([]);

View File

@@ -262,6 +262,68 @@ const STOP_WORDS_PT = new Set([
"ajuda",
]);
/**
 * Arabic stop words dropped during keyword extraction for query expansion.
 *
 * The set is probed one token at a time (`STOP_WORDS_AR.has(token)`), so only
 * single-word entries can ever match. Multi-word phrases must also be listed
 * word by word.
 *
 * Spelling variants are listed explicitly (e.g. "إلى" / "الى") because informal
 * Arabic text frequently omits the hamza.
 * NOTE(review): assumes the tokenizer does not normalize hamza forms itself —
 * confirm; if it does, the hamza-less duplicates are redundant but harmless.
 */
const STOP_WORDS_AR = new Set([
  // Articles and connectors
  "ال",
  "و",
  "أو",
  "او", // hamza-less spelling of "أو"
  "لكن",
  "ثم",
  "بل",
  // Pronouns / references
  "أنا",
  "انا", // hamza-less spelling of "أنا"
  "نحن",
  "هو",
  "هي",
  "هم",
  "هذا",
  "هذه",
  "ذلك",
  "تلك",
  "هنا",
  "هناك",
  // Common prepositions
  "من",
  "إلى",
  "الى",
  "في",
  "على",
  "عن",
  "مع",
  "بين",
  "ل",
  "ب",
  "ك",
  // Common auxiliaries / vague verbs
  "كان",
  "كانت",
  "يكون",
  "تكون",
  "صار",
  "أصبح",
  "اصبح", // hamza-less spelling of "أصبح"
  "يمكن",
  "ممكن",
  // Time references (vague)
  "بالأمس",
  "أمس", // standard spelling; only the hamza-less form was listed before
  "امس",
  "اليوم",
  "غدا",
  "الآن",
  "الان", // madda-less spelling of "الآن"
  "قبل",
  "بعد",
  "مؤخرا",
  // Question/request words
  "لماذا",
  "كيف",
  "ماذا",
  "متى",
  "أين",
  "اين", // hamza-less spelling of "أين"
  "هل",
  // Kept for backward compatibility; as a two-word phrase it cannot match a
  // single token, so its second word is listed separately right below
  // ("من" is already covered under prepositions).
  "من فضلك",
  "فضلك",
  "فضلا",
  "ساعد",
]);
const STOP_WORDS_KO = new Set([
// Particles (조사)
"은",
@@ -669,6 +731,7 @@ export function extractKeywords(query: string): string[] {
STOP_WORDS_EN.has(token) ||
STOP_WORDS_ES.has(token) ||
STOP_WORDS_PT.has(token) ||
STOP_WORDS_AR.has(token) ||
STOP_WORDS_ZH.has(token) ||
STOP_WORDS_KO.has(token) ||
STOP_WORDS_JA.has(token)