openclaw/src/memory/query-expansion.ts

/**
 * Query expansion for FTS-only search mode.
 *
 * When no embedding provider is available, we fall back to FTS (full-text search).
 * FTS works best with specific keywords, but users often ask conversational queries
 * like "that thing we discussed yesterday" or "之前讨论的那个方案".
 *
 * This module extracts meaningful keywords from such queries to improve FTS results.
 */

// English stop words: high-frequency terms that add no value to a full-text search.
const STOP_WORDS_EN = new Set([
  // Articles and determiners
  "a", "an", "the", "this", "that", "these", "those",
  // Pronouns
  "i", "me", "my", "we", "our", "you", "your", "he", "she", "it", "they", "them",
  // Common verbs and modals
  "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
  "do", "does", "did", "will", "would", "could", "should", "can", "may", "might",
  // Prepositions
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "about",
  "into", "through", "during", "before", "after", "above", "below", "between", "under", "over",
  // Conjunctions and interrogatives
  "and", "or", "but", "if", "then", "because", "as", "while",
  "when", "where", "what", "which", "who", "how", "why",
  // Vague time references ("before" is already covered by the prepositions above)
  "yesterday", "today", "tomorrow", "earlier", "later", "recently", "ago", "just", "now",
  // Vague references
  "thing", "things", "stuff", "something", "anything", "everything", "nothing",
  // Request words
  "please", "help", "find", "show", "get", "tell", "give",
]);

// Spanish stop words, grouped by grammatical role.
const STOP_WORDS_ES = new Set([
  // Articles and determiners
  "el", "la", "los", "las", "un", "una", "unos", "unas", "este", "esta", "ese", "esa",
  // Pronouns
  "yo", "me", "mi", "nosotros", "nosotras", "tu", "tus", "usted", "ustedes", "ellos", "ellas",
  // Prepositions and conjunctions
  "de", "del", "a", "en", "con", "por", "para", "sobre", "entre",
  "y", "o", "pero", "si", "porque", "como",
  // Common verbs / auxiliaries
  "es", "son", "fue", "fueron", "ser", "estar", "haber", "tener", "hacer",
  // Vague time references (both accented and unaccented spellings of "después")
  "ayer", "hoy", "mañana", "antes", "despues", "después", "ahora", "recientemente",
  // Question and request words
  "que", "qué", "cómo", "cuando", "cuándo", "donde", "dónde", "porqué", "favor", "ayuda",
]);

// Portuguese stop words, grouped by grammatical role.
const STOP_WORDS_PT = new Set([
  // Articles and determiners
  "o", "a", "os", "as", "um", "uma", "uns", "umas", "este", "esta", "esse", "essa",
  // Pronouns
  "eu", "me", "meu", "minha", "nos", "nós", "você", "vocês", "ele", "ela", "eles", "elas",
  // Prepositions and conjunctions
  "de", "do", "da", "em", "com", "por", "para", "sobre", "entre",
  "e", "ou", "mas", "se", "porque", "como",
  // Common verbs / auxiliaries
  "é", "são", "foi", "foram", "ser", "estar", "ter", "fazer",
  // Vague time references
  "ontem", "hoje", "amanhã", "antes", "depois", "agora", "recentemente",
  // Question and request words
  "que", "quê", "quando", "onde", "porquê", "favor", "ajuda",
]);

// Arabic stop words.
//
// Every entry must be a single whitespace-free token: the tokenizer in this file
// splits queries on whitespace, so a multi-word entry could never match a token.
const STOP_WORDS_AR = new Set([
  // Articles and connectors
  "ال",
  "و",
  "أو",
  "لكن",
  "ثم",
  "بل",
  // Pronouns / references
  "أنا",
  "نحن",
  "هو",
  "هي",
  "هم",
  "هذا",
  "هذه",
  "ذلك",
  "تلك",
  "هنا",
  "هناك",
  // Common prepositions
  "من",
  "إلى",
  "الى",
  "في",
  "على",
  "عن",
  "مع",
  "بين",
  "ل",
  "ب",
  "ك",
  // Common auxiliaries / vague verbs
  "كان",
  "كانت",
  "يكون",
  "تكون",
  "صار",
  "أصبح",
  "يمكن",
  "ممكن",
  // Time references (vague)
  "بالأمس",
  "امس",
  "اليوم",
  "غدا",
  "الآن",
  "قبل",
  "بعد",
  "مؤخرا",
  // Question/request words
  "لماذا",
  "كيف",
  "ماذا",
  "متى",
  "أين",
  "هل",
  // Second word of the two-word phrase for "please"; its first word is already
  // listed with the prepositions above, so the whole phrase is filtered token by token.
  "فضلك",
  "فضلا",
  "ساعد",
]);

// Korean stop words, grouped by grammatical role.
const STOP_WORDS_KO = new Set([
  // Particles
  "은", "는", "이", "가", "을", "를", "의", "에", "에서", "로", "으로", "와", "과",
  "도", "만", "까지", "부터", "한테", "에게", "께", "처럼", "같이", "보다", "마다", "밖에", "대로",
  // Pronouns
  "나", "나는", "내가", "나를", "너", "우리", "저", "저희", "그", "그녀", "그들",
  "이것", "저것", "그것", "여기", "저기", "거기",
  // Common verbs / auxiliaries ("보다" is already listed with the particles above)
  "있다", "없다", "하다", "되다", "이다", "아니다", "주다", "오다", "가다",
  // Dependent / vague nouns
  "것", "거", "등", "수", "때", "곳", "중", "분",
  // Adverbs
  "잘", "더", "또", "매우", "정말", "아주", "많이", "너무", "좀",
  // Conjunctions
  "그리고", "하지만", "그래서", "그런데", "그러나", "또는", "그러면",
  // Question words
  "왜", "어떻게", "뭐", "언제", "어디", "누구", "무엇", "어떤",
  // Vague time references
  "어제", "오늘", "내일", "최근", "지금", "아까", "나중", "전에",
  // Request words
  "제발", "부탁",
]);

// Particles that can be attached to the end of a Korean word and are stripped
// during tokenization. The list is ordered by descending length so that a
// first-match scan always tries the longest particle first.
const KO_TRAILING_PARTICLES = [
  // Two-syllable particles
  "에서", "으로", "에게", "한테", "처럼", "같이", "보다", "까지", "부터", "마다", "밖에", "대로",
  // One-syllable particles
  "은", "는", "이", "가", "을", "를", "의", "에", "로", "와", "과", "도", "만",
].toSorted((left, right) => right.length - left.length);

/**
 * Remove one trailing particle from a Korean token.
 *
 * The first entry of KO_TRAILING_PARTICLES that the token ends with is used,
 * provided the token is longer than that particle (so a token that consists of
 * the particle alone is never reduced to an empty stem).
 *
 * @param token - Token to inspect.
 * @returns The token without its trailing particle, or null when no particle applies.
 */
function stripKoreanTrailingParticle(token: string): string | null {
  const suffix = KO_TRAILING_PARTICLES.find(
    (candidate) => token.endsWith(candidate) && token.length > candidate.length,
  );
  return suffix === undefined ? null : token.slice(0, -suffix.length);
}

/**
 * Decide whether a particle-stripped stem is worth emitting as a keyword.
 *
 * - A stem containing Hangul syllables must be at least two characters long;
 *   this rejects bogus one-syllable stems such as "논" derived from "논의".
 * - Any other stem is accepted only when it is purely ASCII letters, digits or
 *   underscores, which keeps stems of mixed tokens such as "API를" -> "api".
 *
 * @param stem - Candidate stem after particle removal.
 * @returns True when the stem should be kept.
 */
function isUsefulKoreanStem(stem: string): boolean {
  const containsHangulSyllable = /[\uac00-\ud7af]/.test(stem);
  return containsHangulSyllable ? stem.length >= 2 : /^[a-z0-9_]+$/i.test(stem);
}

// Japanese stop words, grouped by grammatical role.
const STOP_WORDS_JA = new Set([
  // Pronouns and references
  "これ", "それ", "あれ", "この", "その", "あの", "ここ", "そこ", "あそこ",
  // Common auxiliaries / vague verbs
  "する", "した", "して", "です", "ます", "いる", "ある", "なる", "できる",
  // Particles / connectors
  "の", "こと", "もの", "ため", "そして", "しかし", "また", "でも", "から", "まで", "より", "だけ",
  // Question words
  "なぜ", "どう", "何", "いつ", "どこ", "誰", "どれ",
  // Vague time references
  "昨日", "今日", "明日", "最近", "今", "さっき", "前", "後",
]);

// Chinese stop words, grouped by grammatical role.
const STOP_WORDS_ZH = new Set([
  // Pronouns
  "我", "我们", "你", "你们", "他", "她", "它", "他们", "这", "那", "这个", "那个", "这些", "那些",
  // Auxiliary words
  "的", "了", "着", "过", "得", "地", "吗", "呢", "吧", "啊", "呀", "嘛", "啦",
  // Common, vague verbs
  "是", "有", "在", "被", "把", "给", "让", "用", "到", "去",
  "来", "做", "说", "看", "找", "想", "要", "能", "会", "可以",
  // Prepositions and conjunctions
  "和", "与", "或", "但", "但是", "因为", "所以", "如果", "虽然",
  "而", "也", "都", "就", "还", "又", "再", "才", "只",
  // Vague time references
  "之前", "以前", "之后", "以后", "刚才", "现在", "昨天", "今天", "明天", "最近",
  // Vague references
  "东西", "事情", "事", "什么", "哪个", "哪些", "怎么", "为什么", "多少",
  // Request words
  "请", "帮", "帮忙", "告诉",
]);

/**
 * Decide whether a token is worth keeping as a search keyword.
 *
 * A token is rejected when it is empty, when it consists only of ASCII letters
 * and is shorter than three characters, when it is made up solely of digits, or
 * when it is made up solely of punctuation/symbol characters.
 *
 * @param token - Candidate token.
 * @returns True when the token passes all of the checks above.
 */
function isValidKeyword(token: string): boolean {
  if (!token) {
    return false;
  }
  // Very short ASCII words are most likely stop words or fragments.
  const isShortAsciiWord = token.length < 3 && /^[a-zA-Z]+$/.test(token);
  // Bare numbers carry no meaning on their own.
  const isDigitsOnly = /^\d+$/.test(token);
  // Tokens built purely from punctuation or symbols.
  const isPunctuationOnly = /^[\p{P}\p{S}]+$/u.test(token);
  return !(isShortAsciiWord || isDigitsOnly || isPunctuationOnly);
}

/**
 * Split text into lower-cased candidate tokens for keyword extraction.
 *
 * The input is lower-cased, trimmed, and split on whitespace and Unicode
 * punctuation. Each resulting segment is then handled according to its script:
 * - Segments containing kana (Japanese): ASCII runs, katakana runs, kanji runs and
 *   hiragana runs of two or more characters are emitted; kanji runs additionally
 *   emit their character bigrams.
 * - Segments containing Han characters but no kana (Chinese): each Han run emits
 *   its single characters followed by its bigrams, since no word segmenter is
 *   available. ASCII alphanumeric runs glued to the Han text (e.g. "api" in
 *   "使用api接口") are emitted as their own tokens, and bigrams never span such a gap.
 * - Segments containing Hangul (Korean): the word is emitted unless it, or its
 *   particle-stripped stem, is a Korean stop word; a useful stem is emitted too.
 * - Any other segment is emitted unchanged.
 *
 * The result may contain duplicates and stop words; extractKeywords filters them.
 *
 * @param text - Raw query text.
 * @returns Candidate tokens in order of appearance.
 */
function tokenize(text: string): string[] {
  const tokens: string[] = [];
  const normalized = text.toLowerCase().trim();

  // Split into segments (English words, Chinese character sequences, etc.)
  const segments = normalized.split(/[\s\p{P}]+/u).filter(Boolean);

  for (const segment of segments) {
    // Japanese text often mixes scripts (kanji/kana/ASCII) without spaces.
    // Extract script-specific chunks so technical terms like "API" / "バグ" are retained.
    if (/[\u3040-\u30ff]/.test(segment)) {
      const jpParts =
        segment.match(/[a-z0-9_]+|[\u30a0-\u30ffー]+|[\u4e00-\u9fff]+|[\u3040-\u309f]{2,}/g) ?? [];
      for (const part of jpParts) {
        if (/^[\u4e00-\u9fff]+$/.test(part)) {
          tokens.push(part);
          for (let i = 0; i < part.length - 1; i++) {
            tokens.push(part[i] + part[i + 1]);
          }
        } else {
          tokens.push(part);
        }
      }
    } else if (/[\u4e00-\u9fff]/.test(segment)) {
      // Chinese text frequently embeds ASCII technical terms without spaces.
      // Handle each Han run and each ASCII run on its own so the ASCII term is
      // kept and no bigram is formed from characters that were not adjacent.
      const runs = segment.match(/[a-z0-9_]+|[\u4e00-\u9fff]+/g) ?? [];
      for (const run of runs) {
        if (!/^[\u4e00-\u9fff]+$/.test(run)) {
          tokens.push(run);
          continue;
        }
        const chars = Array.from(run);
        // Individual characters first ...
        tokens.push(...chars);
        // ... then bigrams for better phrase matching.
        for (let i = 0; i < chars.length - 1; i++) {
          tokens.push(chars[i] + chars[i + 1]);
        }
      }
    } else if (/[\uac00-\ud7af\u3131-\u3163]/.test(segment)) {
      // For Korean (Hangul syllables and jamo), keep the word as-is unless it is
      // effectively a stop word once trailing particles are removed.
      const stem = stripKoreanTrailingParticle(segment);
      const stemIsStopWord = stem !== null && STOP_WORDS_KO.has(stem);
      if (!STOP_WORDS_KO.has(segment) && !stemIsStopWord) {
        tokens.push(segment);
      }
      // Also emit particle-stripped stems when they are useful keywords.
      if (stem && !STOP_WORDS_KO.has(stem) && isUsefulKoreanStem(stem)) {
        tokens.push(stem);
      }
    } else {
      // For non-CJK, keep as single token
      tokens.push(segment);
    }
  }

  return tokens;
}

/**
 * Extract search keywords from a conversational query.
 *
 * The query is tokenized, then every token that appears in any of the
 * per-language stop-word sets, or that fails isValidKeyword, is discarded.
 * The remaining tokens are returned without duplicates, in first-seen order.
 * Tokens are lower-cased by the tokenizer.
 *
 * Examples:
 * - "that thing we discussed about the API" → ["discussed", "api"]
 * - "what was the solution for the bug" → ["solution", "bug"]
 * - "之前讨论的那个方案" → single-character and bigram candidates that survive
 *   the stop-word filter, including "讨论" and "方案"
 *
 * @param query - Raw user query.
 * @returns Distinct keywords in order of first appearance.
 */
export function extractKeywords(query: string): string[] {
  const stopWordSets = [
    STOP_WORDS_EN,
    STOP_WORDS_ES,
    STOP_WORDS_PT,
    STOP_WORDS_AR,
    STOP_WORDS_ZH,
    STOP_WORDS_KO,
    STOP_WORDS_JA,
  ];
  // A Set keeps insertion order, so it doubles as the de-duplicated result list.
  const accepted = new Set<string>();

  for (const candidate of tokenize(query)) {
    const isStopWord = stopWordSets.some((words) => words.has(candidate));
    if (isStopWord || !isValidKeyword(candidate)) {
      continue;
    }
    accepted.add(candidate);
  }

  return [...accepted];
}

/**
 * Build an expanded FTS query from a user query.
 *
 * The trimmed query is kept as the first term and every extracted keyword is
 * appended as an additional " OR " alternative, so both matches on the original
 * wording and matches on individual keywords can be found. When no keywords are
 * extracted, the expanded query is just the trimmed original.
 *
 * @param query - User's original query
 * @returns The trimmed query (`original`), the extracted `keywords`, and the
 *   OR-joined `expanded` query string
 */
export function expandQueryForFts(query: string): {
  original: string;
  keywords: string[];
  expanded: string;
} {
  const original = query.trim();
  const keywords = extractKeywords(original);
  // Joining a one-element list yields the original unchanged, which covers the
  // no-keywords case without a separate branch.
  const expanded = [original, ...keywords].join(" OR ");
  return { original, keywords, expanded };
}

/**
 * Signature of an optional, caller-supplied query expander (for example one
 * backed by an LLM).
 *
 * The function receives the query string and resolves to a list of keyword
 * strings. `expandQueryWithLlm` returns a non-empty result as-is and falls back
 * to local keyword extraction when the result is empty or the call fails.
 */
export type LlmQueryExpander = (query: string) => Promise<string[]>;

/**
 * Produce keywords for a query, preferring an optional LLM-based expander.
 *
 * When an expander is supplied and resolves to a non-empty list, that list is
 * returned unchanged. In every other case (no expander, an empty result, or an
 * expander that throws or rejects) the locally extracted keywords are returned.
 *
 * @param query - User's original query
 * @param llmExpander - Optional expander to try before local extraction
 * @returns Keywords from the expander, or from local extraction as a fallback
 */
export async function expandQueryWithLlm(
  query: string,
  llmExpander?: LlmQueryExpander,
): Promise<string[]> {
  if (!llmExpander) {
    return extractKeywords(query);
  }

  let fromLlm: string[] | null = null;
  try {
    const answer = await llmExpander(query);
    // The length check stays inside the try so a malformed result also falls back.
    fromLlm = answer.length > 0 ? answer : null;
  } catch {
    // Expander failed; use local extraction below.
    fromLlm = null;
  }

  return fromLlm ?? extractKeywords(query);
}