diff --git a/CHANGELOG.md b/CHANGELOG.md index d4a19735f66..0b222ef837c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang. - Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc. - Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc. +- Memory/FTS: add Arabic stop-word filtering for query expansion in FTS-only search mode to reduce conversational filler in Arabic memory searches. Thanks @vincentkoc. - iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman. 
### Breaking diff --git a/src/memory/query-expansion.test.ts b/src/memory/query-expansion.test.ts index cb818364009..ac535b438e8 100644 --- a/src/memory/query-expansion.test.ts +++ b/src/memory/query-expansion.test.ts @@ -143,6 +143,22 @@ describe("extractKeywords", () => { expect(keywords).not.toContain("onde"); }); + it("extracts keywords from Arabic conversational query", () => { + const keywords = extractKeywords("بالأمس ناقشنا استراتيجية النشر"); + expect(keywords).toContain("ناقشنا"); + expect(keywords).toContain("استراتيجية"); + expect(keywords).toContain("النشر"); + expect(keywords).not.toContain("بالأمس"); + }); + + it("filters Arabic question stop words", () => { + const keywords = extractKeywords("كيف متى أين ماذا"); + expect(keywords).not.toContain("كيف"); + expect(keywords).not.toContain("متى"); + expect(keywords).not.toContain("أين"); + expect(keywords).not.toContain("ماذا"); + }); + it("handles empty query", () => { expect(extractKeywords("")).toEqual([]); expect(extractKeywords(" ")).toEqual([]); diff --git a/src/memory/query-expansion.ts b/src/memory/query-expansion.ts index 9e18816c99c..d8c12e3a128 100644 --- a/src/memory/query-expansion.ts +++ b/src/memory/query-expansion.ts @@ -262,6 +262,68 @@ const STOP_WORDS_PT = new Set([ "ajuda", ]); +const STOP_WORDS_AR = new Set([ + // Articles and connectors + "ال", + "و", + "أو", + "لكن", + "ثم", + "بل", + // Pronouns / references + "أنا", + "نحن", + "هو", + "هي", + "هم", + "هذا", + "هذه", + "ذلك", + "تلك", + "هنا", + "هناك", + // Common prepositions + "من", + "إلى", + "الى", + "في", + "على", + "عن", + "مع", + "بين", + "ل", + "ب", + "ك", + // Common auxiliaries / vague verbs + "كان", + "كانت", + "يكون", + "تكون", + "صار", + "أصبح", + "يمكن", + "ممكن", + // Time references (vague) + "بالأمس", + "امس", + "اليوم", + "غدا", + "الآن", + "قبل", + "بعد", + "مؤخرا", + // Question/request words (single-token entries only: each entry is compared against one token) + "لماذا", + "كيف", + "ماذا", + "متى", + "أين", + "هل", + "فضلك", + "فضلا", + "ساعد", +]); + const STOP_WORDS_KO = 
new Set([ // Particles (조사) "은", @@ -669,6 +731,7 @@ export function extractKeywords(query: string): string[] { STOP_WORDS_EN.has(token) || STOP_WORDS_ES.has(token) || STOP_WORDS_PT.has(token) || + STOP_WORDS_AR.has(token) || STOP_WORDS_ZH.has(token) || STOP_WORDS_KO.has(token) || STOP_WORDS_JA.has(token)