diff --git a/CHANGELOG.md b/CHANGELOG.md
index f44d8a26643..7472561f20e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
 - Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
 - Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
 - Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc.
+- Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc.
 - iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
 
 ### Breaking
diff --git a/src/memory/query-expansion.test.ts b/src/memory/query-expansion.test.ts
index 708d24695f9..cb818364009 100644
--- a/src/memory/query-expansion.test.ts
+++ b/src/memory/query-expansion.test.ts
@@ -117,6 +117,32 @@ describe("extractKeywords", () => {
     expect(keywords).not.toContain("どう");
   });
 
+  it("extracts keywords from Spanish conversational query", () => {
+    const keywords = extractKeywords("ayer hablamos sobre la estrategia de despliegue");
+    expect(keywords).toContain("estrategia");
+    expect(keywords).toContain("despliegue");
+    expect(keywords).not.toContain("ayer");
+    expect(keywords).not.toContain("sobre");
+  });
+
+  it("extracts keywords from Portuguese conversational query", () => {
+    const keywords = extractKeywords("ontem falamos sobre a estratégia de implantação");
+    expect(keywords).toContain("estratégia");
+    expect(keywords).toContain("implantação");
+    expect(keywords).not.toContain("ontem");
+    expect(keywords).not.toContain("sobre");
+  });
+
+  it("filters Spanish and Portuguese question stop words", () => {
+    const keywords = extractKeywords("cómo cuando donde porquê quando onde");
+    expect(keywords).not.toContain("cómo");
+    expect(keywords).not.toContain("cuando");
+    expect(keywords).not.toContain("donde");
+    expect(keywords).not.toContain("porquê");
+    expect(keywords).not.toContain("quando");
+    expect(keywords).not.toContain("onde");
+  });
+
   it("handles empty query", () => {
     expect(extractKeywords("")).toEqual([]);
     expect(extractKeywords(" ")).toEqual([]);
diff --git a/src/memory/query-expansion.ts b/src/memory/query-expansion.ts
index 7fea63b5788..9e18816c99c 100644
--- a/src/memory/query-expansion.ts
+++ b/src/memory/query-expansion.ts
@@ -118,6 +118,150 @@ const STOP_WORDS_EN = new Set([
   "give",
 ]);
 
+const STOP_WORDS_ES = new Set([
+  // Articles and determiners
+  "el",
+  "la",
+  "los",
+  "las",
+  "un",
+  "una",
+  "unos",
+  "unas",
+  "este",
+  "esta",
+  "ese",
+  "esa",
+  // Pronouns
+  "yo",
+  "me",
+  "mi",
+  "nosotros",
+  "nosotras",
+  "tu",
+  "tus",
+  "usted",
+  "ustedes",
+  "ellos",
+  "ellas",
+  // Prepositions and conjunctions
+  "de",
+  "del",
+  "a",
+  "en",
+  "con",
+  "por",
+  "para",
+  "sobre",
+  "entre",
+  "y",
+  "o",
+  "pero",
+  "si",
+  "porque",
+  "como",
+  // Common verbs / auxiliaries
+  "es",
+  "son",
+  "fue",
+  "fueron",
+  "ser",
+  "estar",
+  "haber",
+  "tener",
+  "hacer",
+  // Time references (vague)
+  "ayer",
+  "hoy",
+  "mañana",
+  "antes",
+  "despues",
+  "después",
+  "ahora",
+  "recientemente",
+  // Question/request words
+  "que",
+  "qué",
+  "cómo",
+  "cuando",
+  "cuándo",
+  "donde",
+  "dónde",
+  "porqué",
+  "favor",
+  "ayuda",
+]);
+
+const STOP_WORDS_PT = new Set([
+  // Articles and determiners
+  "o",
+  "a",
+  "os",
+  "as",
+  "um",
+  "uma",
+  "uns",
+  "umas",
+  "este",
+  "esta",
+  "esse",
+  "essa",
+  // Pronouns
+  "eu",
+  "me",
+  "meu",
+  "minha",
+  "nos",
+  "nós",
+  "você",
+  "vocês",
+  "ele",
+  "ela",
+  "eles",
+  "elas",
+  // Prepositions and conjunctions
+  "de",
+  "do",
+  "da",
+  "em",
+  "com",
+  "por",
+  "para",
+  "sobre",
+  "entre",
+  "e",
+  "ou",
+  "mas",
+  "se",
+  "porque",
+  "como",
+  // Common verbs / auxiliaries
+  "é",
+  "são",
+  "foi",
+  "foram",
+  "ser",
+  "estar",
+  "ter",
+  "fazer",
+  // Time references (vague)
+  "ontem",
+  "hoje",
+  "amanhã",
+  "antes",
+  "depois",
+  "agora",
+  "recentemente",
+  // Question/request words
+  "que",
+  "quê",
+  "quando",
+  "onde",
+  "porquê",
+  "favor",
+  "ajuda",
+]);
+
 const STOP_WORDS_KO = new Set([
   // Particles (조사)
   "은",
@@ -523,6 +667,8 @@ export function extractKeywords(query: string): string[] {
     // Skip stop words
     if (
       STOP_WORDS_EN.has(token) ||
+      STOP_WORDS_ES.has(token) ||
+      STOP_WORDS_PT.has(token) ||
       STOP_WORDS_ZH.has(token) ||
       STOP_WORDS_KO.has(token) ||
       STOP_WORDS_JA.has(token)