Memory: add Spanish and Portuguese query expansion stop words (#23710)

This commit is contained in:
Vincent Koc
2026-02-22 11:26:12 -05:00
committed by GitHub
parent f14ebd743c
commit 35b162af76
3 changed files with 173 additions and 0 deletions

View File

@@ -14,6 +14,7 @@ Docs: https://docs.openclaw.ai
- Discord/Allowlist: canonicalize resolved Discord allowlist names to IDs and split resolution flow for clearer fail-closed behavior.
- Memory/FTS: add Korean stop-word filtering and particle-aware keyword extraction (including mixed Korean/English stems) for query expansion in FTS-only search mode. (#18899) Thanks @ruypang.
- Memory/FTS: add Japanese-aware query expansion tokenization and stop-word filtering (including mixed-script terms like ASCII + katakana) for FTS-only search mode. Thanks @vincentkoc.
- Memory/FTS: add Spanish and Portuguese stop-word filtering for query expansion in FTS-only search mode, improving conversational recall for both languages. Thanks @vincentkoc.
- iOS/Talk: prefetch TTS segments and suppress expected speech-cancellation errors for smoother talk playback. (#22833) Thanks @ngutman.
### Breaking

View File

@@ -117,6 +117,32 @@ describe("extractKeywords", () => {
expect(keywords).not.toContain("どう");
});
it("extracts keywords from Spanish conversational query", () => {
  // Content-bearing nouns survive; conversational filler is dropped.
  const extracted = extractKeywords("ayer hablamos sobre la estrategia de despliegue");
  for (const contentWord of ["estrategia", "despliegue"]) {
    expect(extracted).toContain(contentWord);
  }
  for (const stopWord of ["ayer", "sobre"]) {
    expect(extracted).not.toContain(stopWord);
  }
});
it("extracts keywords from Portuguese conversational query", () => {
  // Content-bearing nouns survive; conversational filler is dropped.
  const extracted = extractKeywords("ontem falamos sobre a estratégia de implantação");
  for (const contentWord of ["estratégia", "implantação"]) {
    expect(extracted).toContain(contentWord);
  }
  for (const stopWord of ["ontem", "sobre"]) {
    expect(extracted).not.toContain(stopWord);
  }
});
it("filters Spanish and Portuguese question stop words", () => {
  const extracted = extractKeywords("cómo cuando donde porquê quando onde");
  // Every token in the query is a question word in ES or PT; none may leak through.
  const questionWords = ["cómo", "cuando", "donde", "porquê", "quando", "onde"];
  for (const word of questionWords) {
    expect(extracted).not.toContain(word);
  }
});
it("handles empty query", () => {
expect(extractKeywords("")).toEqual([]);
expect(extractKeywords(" ")).toEqual([]);

View File

@@ -118,6 +118,150 @@ const STOP_WORDS_EN = new Set([
"give",
]);
// Spanish stop words removed from queries before FTS keyword expansion,
// grouped by grammatical role. Matched against already-lowercased tokens
// (both accented and unaccented spellings are listed where users commonly
// type either).
const STOP_WORDS_ES = new Set([
  // Articles and determiners
  "el",
  "la",
  "los",
  "las",
  "un",
  "una",
  "unos",
  "unas",
  "este",
  "esta",
  "ese",
  "esa",
  // Pronouns
  "yo",
  "me",
  "mi",
  "se", // reflexive/impersonal clitic ("se habló de…")
  "nosotros",
  "nosotras",
  "tu",
  "tus",
  "usted",
  "ustedes",
  "ellos",
  "ellas",
  // Prepositions, conjunctions, and contractions
  "de",
  "del", // de + el
  "a",
  "al", // a + el; parallels "del"
  "en",
  "con",
  "por",
  "para",
  "sobre",
  "entre",
  "y",
  "o",
  "pero",
  "si",
  "porque",
  "como",
  // Common verbs / auxiliaries
  "es",
  "son",
  "fue",
  "fueron",
  "ser",
  "estar",
  "haber",
  "tener",
  "hacer",
  // Time references (vague)
  "ayer",
  "hoy",
  "mañana",
  "antes",
  "despues", // unaccented variant of "después"
  "después",
  "ahora",
  "recientemente",
  // Question/request words
  "que",
  "qué",
  "cómo",
  "cuando",
  "cuándo",
  "donde",
  "dónde",
  "porqué",
  "favor",
  "ayuda",
]);
// Portuguese stop words removed from queries before FTS keyword expansion,
// grouped by grammatical role. Matched against already-lowercased tokens.
// Includes preposition + article contractions, which are among the most
// frequent Portuguese tokens.
const STOP_WORDS_PT = new Set([
  // Articles and determiners
  "o",
  "a",
  "os",
  "as",
  "um",
  "uma",
  "uns",
  "umas",
  "este",
  "esta",
  "esse",
  "essa",
  // Pronouns
  "eu",
  "me",
  "meu",
  "minha",
  "nos", // also covers the em + os contraction
  "nós",
  "você",
  "vocês",
  "ele",
  "ela",
  "eles",
  "elas",
  // Prepositions, conjunctions, and contractions
  "de",
  "do", // de + o
  "da", // de + a
  "dos", // de + os; parallels "do"/"da"
  "das", // de + as
  "em",
  "na", // em + a ("no" deliberately omitted: collides with the English negation)
  "nas", // em + as
  "ao", // a + o
  "aos", // a + os
  "à", // a + a
  "às", // a + as
  "pelo", // por + o
  "pela", // por + a
  "num", // em + um
  "numa", // em + uma
  "com",
  "por",
  "para",
  "sobre",
  "entre",
  "e",
  "ou",
  "mas",
  "se",
  "porque",
  "como",
  // Common verbs / auxiliaries
  "é",
  "são",
  "foi",
  "foram",
  "ser",
  "estar",
  "ter",
  "fazer",
  // Time references (vague)
  "ontem",
  "hoje",
  "amanhã",
  "antes",
  "depois",
  "agora",
  "recentemente",
  // Question/request words
  "que",
  "quê",
  "quando",
  "onde",
  "porquê",
  "favor",
  "ajuda",
]);
const STOP_WORDS_KO = new Set([
// Particles (조사)
"은",
@@ -523,6 +667,8 @@ export function extractKeywords(query: string): string[] {
// Skip stop words
if (
STOP_WORDS_EN.has(token) ||
STOP_WORDS_ES.has(token) ||
STOP_WORDS_PT.has(token) ||
STOP_WORDS_ZH.has(token) ||
STOP_WORDS_KO.has(token) ||
STOP_WORDS_JA.has(token)