/**
 * Query expansion for FTS-only search mode.
 *
 * When no embedding provider is available, we fall back to FTS (full-text search).
 * FTS works best with specific keywords, but users often ask conversational queries
 * like "that thing we discussed yesterday" or "之前讨论的那个方案".
 *
 * This module extracts meaningful keywords from such queries to improve FTS results.
 */

// Common stop words that don't add search value.
// All entries are lower-case because tokens are lower-cased before lookup.
const STOP_WORDS_EN = new Set([
  // Articles and determiners
  "a", "an", "the", "this", "that", "these", "those",
  // Pronouns
  "i", "me", "my", "we", "our", "you", "your", "he", "she", "it", "they", "them",
  // Common verbs
  "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
  "do", "does", "did", "will", "would", "could", "should", "can", "may", "might",
  // Prepositions
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "about", "into",
  "through", "during", "before", "after", "above", "below", "between", "under", "over",
  // Conjunctions
  "and", "or", "but", "if", "then", "because", "as", "while",
  "when", "where", "what", "which", "who", "how", "why",
  // Time references (vague, not useful for FTS); "before" is already listed above
  "yesterday", "today", "tomorrow", "earlier", "later", "recently", "ago", "just", "now",
  // Vague references
  "thing", "things", "stuff", "something", "anything", "everything", "nothing",
  // Request words
  "please", "help", "find", "show", "get", "tell", "give",
]);

const STOP_WORDS_ES = new Set([
  // Articles and determiners
  "el", "la", "los", "las", "un", "una", "unos", "unas", "este", "esta", "ese", "esa",
  // Pronouns
  "yo", "me", "mi", "nosotros", "nosotras", "tu", "tus", "usted", "ustedes", "ellos", "ellas",
  // Prepositions and conjunctions
  "de", "del", "a", "en", "con", "por", "para", "sobre", "entre",
  "y", "o", "pero", "si", "porque", "como",
  // Common verbs / auxiliaries
  "es", "son", "fue", "fueron", "ser", "estar", "haber", "tener", "hacer",
  // Time references (vague)
  "ayer", "hoy", "mañana", "antes", "despues", "después", "ahora", "recientemente",
  // Question/request words
  "que", "qué", "cómo", "cuando", "cuándo", "donde", "dónde", "porqué", "favor", "ayuda",
]);

const STOP_WORDS_PT = new Set([
  // Articles and determiners
  "o", "a", "os", "as", "um", "uma", "uns", "umas", "este", "esta", "esse", "essa",
  // Pronouns
  "eu", "me", "meu", "minha", "nos", "nós", "você", "vocês", "ele", "ela", "eles", "elas",
  // Prepositions and conjunctions
  "de", "do", "da", "em", "com", "por", "para", "sobre", "entre",
  "e", "ou", "mas", "se", "porque", "como",
  // Common verbs / auxiliaries
  "é", "são", "foi", "foram", "ser", "estar", "ter", "fazer",
  // Time references (vague)
  "ontem", "hoje", "amanhã", "antes", "depois", "agora", "recentemente",
  // Question/request words
  "que", "quê", "quando", "onde", "porquê", "favor", "ajuda",
]);

const STOP_WORDS_AR = new Set([
  // Articles and connectors
  "ال", "و", "أو", "لكن", "ثم", "بل",
  // Pronouns / references
  "أنا", "نحن", "هو", "هي", "هم", "هذا", "هذه", "ذلك", "تلك", "هنا", "هناك",
  // Common prepositions
  "من", "إلى", "الى", "في", "على", "عن", "مع", "بين", "ل", "ب", "ك",
  // Common auxiliaries / vague verbs
  "كان", "كانت", "يكون", "تكون", "صار", "أصبح", "يمكن", "ممكن",
  // Time references (vague)
  "بالأمس", "امس", "اليوم", "غدا", "الآن", "قبل", "بعد", "مؤخرا",
  // Question/request words.
  // The two-word phrase "من فضلك" can never equal a whitespace-split token, so its
  // second word is listed on its own as well ("من" is already a preposition above).
  "لماذا", "كيف", "ماذا", "متى", "أين", "هل", "من فضلك", "فضلك", "فضلا", "ساعد",
]);

const STOP_WORDS_KO = new Set([
  // Particles (조사)
  "은", "는", "이", "가", "을", "를", "의", "에", "에서", "로", "으로", "와", "과", "도", "만",
  "까지", "부터", "한테", "에게", "께", "처럼", "같이", "보다", "마다", "밖에", "대로",
  // Pronouns (대명사)
  "나", "나는", "내가", "나를", "너", "우리", "저", "저희", "그", "그녀", "그들",
  "이것", "저것", "그것", "여기", "저기", "거기",
  // Common verbs / auxiliaries (일반 동사/보조 동사); "보다" is already listed as a particle
  "있다", "없다", "하다", "되다", "이다", "아니다", "주다", "오다", "가다",
  // Nouns (의존 명사 / vague)
  "것", "거", "등", "수", "때", "곳", "중", "분",
  // Adverbs
  "잘", "더", "또", "매우", "정말", "아주", "많이", "너무", "좀",
  // Conjunctions
  "그리고", "하지만", "그래서", "그런데", "그러나", "또는", "그러면",
  // Question words
  "왜", "어떻게", "뭐", "언제", "어디", "누구", "무엇", "어떤",
  // Time (vague)
  "어제", "오늘", "내일", "최근", "지금", "아까", "나중", "전에",
  // Request words
  "제발", "부탁",
]);

// Common Korean trailing particles to strip from words for tokenization.
// Sorted by descending length so longest-match-first is guaranteed.
// The sort is in place on a fresh literal (nothing else references it), which
// avoids depending on the ES2023-only `Array.prototype.toSorted`.
const KO_TRAILING_PARTICLES: readonly string[] = [
  "에서", "으로", "에게", "한테", "처럼", "같이", "보다", "까지", "부터", "마다", "밖에", "대로",
  "은", "는", "이", "가", "을", "를", "의", "에", "로", "와", "과", "도", "만",
].sort((a, b) => b.length - a.length);

/**
 * Remove one trailing Korean particle from a token.
 *
 * @param token - A single (already lower-cased) token.
 * @returns The token without its longest matching trailing particle, or `null`
 *   when no particle matches or the token consists of the particle alone.
 */
function stripKoreanTrailingParticle(token: string): string | null {
  for (const particle of KO_TRAILING_PARTICLES) {
    // Require at least one character to remain after stripping.
    if (token.length > particle.length && token.endsWith(particle)) {
      return token.slice(0, -particle.length);
    }
  }
  return null;
}

/**
 * Decide whether a particle-stripped stem is worth emitting as its own token.
 *
 * @param stem - The stem produced by {@link stripKoreanTrailingParticle}.
 * @returns `true` for Hangul stems of two or more characters and for purely
 *   ASCII alphanumeric stems; `false` otherwise.
 */
function isUsefulKoreanStem(stem: string): boolean {
  // Prevent bogus one-syllable stems from words like "논의" -> "논".
  if (/[\uac00-\ud7af]/.test(stem)) {
    return stem.length >= 2;
  }
  // Keep stripped ASCII stems for mixed tokens like "API를" -> "api".
  return /^[a-z0-9_]+$/i.test(stem);
}

const STOP_WORDS_JA = new Set([
  // Pronouns and references
  "これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
  // Common auxiliaries / vague verbs
  "する", "した", "して", "です", "ます", "いる", "ある", "なる", "できる",
  // Particles / connectors
  "の", "こと", "もの", "ため", "そして", "しかし", "また", "でも", "から", "まで", "より", "だけ",
  // Question words
  "なぜ", "どう", "何", "いつ", "どこ", "誰", "どれ",
  // Time (vague)
  "昨日", "今日", "明日", "最近", "今", "さっき", "前", "後",
]);

const STOP_WORDS_ZH = new Set([
  // Pronouns
  "我", "我们", "你", "你们", "他", "她", "它", "他们", "这", "那", "这个", "那个", "这些", "那些",
  // Auxiliary words
  "的", "了", "着", "过", "得", "地", "吗", "呢", "吧", "啊", "呀", "嘛", "啦",
  // Verbs (common, vague)
  "是", "有", "在", "被", "把", "给", "让", "用", "到", "去", "来", "做", "说", "看", "找",
  "想", "要", "能", "会", "可以",
  // Prepositions and conjunctions
  "和", "与", "或", "但", "但是", "因为", "所以", "如果", "虽然", "而", "也", "都", "就",
  "还", "又", "再", "才", "只",
  // Time (vague)
  "之前", "以前", "之后", "以后", "刚才", "现在", "昨天", "今天", "明天", "最近",
  // Vague references
  "东西", "事情", "事", "什么", "哪个", "哪些", "怎么", "为什么", "多少",
  // Question/request words
  "请", "帮", "帮忙", "告诉",
]);

// Every stop-word set, in the order they are consulted.
const STOP_WORD_SETS: ReadonlyArray<ReadonlySet<string>> = [
  STOP_WORDS_EN,
  STOP_WORDS_ES,
  STOP_WORDS_PT,
  STOP_WORDS_AR,
  STOP_WORDS_ZH,
  STOP_WORDS_KO,
  STOP_WORDS_JA,
];

/** Return `true` when the token is a stop word in any supported language. */
function isStopWord(token: string): boolean {
  return STOP_WORD_SETS.some((words) => words.has(token));
}

// Regexes used by the tokenizer, hoisted so they are not rebuilt per segment.
// The global ones are only used with String.prototype.match, which resets lastIndex.
const SEGMENT_SEPARATOR_RE = /[\s\p{P}]+/u;
const KANA_RE = /[\u3040-\u30ff]/;
const JA_CHUNK_RE = /[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g;
const CJK_RUN_RE = /^[\u4e00-\u9fff]+$/;
const CJK_CHAR_RE = /[\u4e00-\u9fff]/;
const HANGUL_RE = /[\uac00-\ud7af\u3131-\u3163]/;
const ASCII_CHUNK_RE = /[a-z0-9_]+/g;

/**
 * Check if a token looks like a meaningful keyword.
 *
 * @param token - Candidate token.
 * @returns `false` for empty tokens, ASCII-letter-only tokens shorter than three
 *   characters, digit-only tokens, and tokens made only of punctuation/symbols.
 */
function isValidKeyword(token: string): boolean {
  if (!token) {
    return false;
  }
  // Skip very short English words (likely stop words or fragments)
  if (/^[a-zA-Z]+$/.test(token) && token.length < 3) {
    return false;
  }
  // Skip pure numbers (not useful for semantic search)
  if (/^\d+$/.test(token)) {
    return false;
  }
  // Skip tokens that are all punctuation
  if (/^[\p{P}\p{S}]+$/u.test(token)) {
    return false;
  }
  return true;
}

/** Build all adjacent-character bigrams of a character sequence, in order. */
function adjacentBigrams(chars: readonly string[]): string[] {
  const bigrams: string[] = [];
  for (let i = 0; i < chars.length - 1; i++) {
    bigrams.push(`${chars[i]}${chars[i + 1]}`);
  }
  return bigrams;
}

/**
 * Simple tokenizer that handles space-delimited text plus Chinese, Korean, and
 * Japanese text.
 *
 * The input is lower-cased and split on whitespace and Unicode punctuation; each
 * resulting segment is then handled according to the script it contains:
 * - Japanese (contains kana): script-specific chunks; kanji runs also yield bigrams.
 * - Chinese (contains CJK ideographs): unigrams and bigrams, since there is no
 *   proper segmenter, plus any embedded ASCII terms.
 * - Korean (contains Hangul): the word itself unless it is a stop word, plus a
 *   particle-stripped stem when that stem is useful.
 * - Anything else: the segment as a single token.
 *
 * @param text - Raw query text.
 * @returns Tokens in emission order; may contain duplicates and stop words.
 */
function tokenize(text: string): string[] {
  const tokens: string[] = [];
  const normalized = text.toLowerCase().trim();
  // Split into segments (English words, Chinese character sequences, etc.)
  const segments = normalized.split(SEGMENT_SEPARATOR_RE).filter(Boolean);

  for (const segment of segments) {
    if (KANA_RE.test(segment)) {
      // Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
      // Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
      const jpParts = segment.match(JA_CHUNK_RE) ?? [];
      for (const part of jpParts) {
        tokens.push(part);
        if (CJK_RUN_RE.test(part)) {
          tokens.push(...adjacentBigrams(Array.from(part)));
        }
      }
    } else if (CJK_CHAR_RE.test(segment)) {
      // Chinese: extract character n-grams (unigrams and bigrams) over the CJK
      // characters of the segment.
      const chars = Array.from(segment).filter((c) => CJK_CHAR_RE.test(c));
      tokens.push(...chars);
      tokens.push(...adjacentBigrams(chars));
      // Keep ASCII terms embedded in the segment (e.g. "修复bug方案" -> "bug"),
      // matching what the Japanese branch does for mixed-script text.
      tokens.push(...(segment.match(ASCII_CHUNK_RE) ?? []));
    } else if (HANGUL_RE.test(segment)) {
      // For Korean (Hangul syllables and jamo), keep the word as-is unless it is
      // effectively a stop word once trailing particles are removed.
      const stem = stripKoreanTrailingParticle(segment);
      const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
      if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
        tokens.push(segment);
      }
      // Also emit particle-stripped stems when they are useful keywords.
      if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
        tokens.push(stem);
      }
    } else {
      // For non-CJK, keep as single token
      tokens.push(segment);
    }
  }

  return tokens;
}

/**
 * Extract keywords from a conversational query for FTS search.
 *
 * Tokens are lower-cased, stop words (in any supported language) and invalid
 * tokens are dropped, and duplicates are removed while preserving first-seen order.
 *
 * Examples:
 * - "that thing we discussed about the API" → ["discussed", "api"]
 * - "what was the solution for the bug" → ["solution", "bug"]
 * - "之前讨论的那个方案" → includes "讨论" and "方案" (along with the other
 *   non-stop-word unigrams and bigrams produced by the n-gram tokenizer)
 *
 * @param query - User's original query.
 * @returns Unique keywords in the order they were first produced.
 */
export function extractKeywords(query: string): string[] {
  const keywords: string[] = [];
  const seen = new Set<string>();

  for (const token of tokenize(query)) {
    if (isStopWord(token) || !isValidKeyword(token) || seen.has(token)) {
      continue;
    }
    seen.add(token);
    keywords.push(token);
  }

  return keywords;
}

/**
 * Expand a query for FTS search.
 * Returns both the original query and extracted keywords for OR-matching.
 *
 * @param query - User's original query
 * @returns Object with the trimmed original query, the extracted keywords, and
 *   `expanded`: the original followed by each keyword joined with " OR " (or just
 *   the original when no keywords were extracted).
 */
export function expandQueryForFts(query: string): {
  original: string;
  keywords: string[];
  expanded: string;
} {
  const original = query.trim();
  const keywords = extractKeywords(original);

  // Build expanded query: original terms OR extracted keywords.
  // This ensures both exact matches and keyword matches are found.
  const expanded =
    keywords.length > 0 ? `${original} OR ${keywords.join(" OR ")}` : original;

  return { original, keywords, expanded };
}

/**
 * Type for an optional LLM-based query expander.
 * Can be provided to enhance keyword extraction with semantic understanding.
 * Resolves to the list of keywords for the given query.
 */
export type LlmQueryExpander = (query: string) => Promise<string[]>;

/**
 * Expand query with optional LLM assistance.
 * Falls back to local extraction if LLM is unavailable or fails.
 *
 * @param query - User's original query.
 * @param llmExpander - Optional LLM-backed expander, tried first when provided.
 * @returns The expander's keywords when it resolves to a non-empty list;
 *   otherwise the result of {@link extractKeywords}.
 */
export async function expandQueryWithLlm(
  query: string,
  llmExpander?: LlmQueryExpander,
): Promise<string[]> {
  // If LLM expander is provided, try it first
  if (llmExpander) {
    try {
      const llmKeywords = await llmExpander(query);
      if (llmKeywords.length > 0) {
        return llmKeywords;
      }
    } catch {
      // Deliberate best-effort: LLM failed, fall back to local extraction.
    }
  }

  // Fall back to local keyword extraction
  return extractKeywords(query);
}