#!/usr/bin/env bun
/**
 * Build a vector search index from docs/*.md for the docs-chat RAG pipeline.
 * Usage: bun build-vector-index.ts [--docs path/to/docs] [--base-url https://docs.openclaw.ai]
 *
 * Requires environment variables:
 *   OPENAI_API_KEY - for embeddings
 *
 * Optional (for Upstash cloud store):
 *   UPSTASH_VECTOR_REST_URL - Upstash Vector endpoint
 *   UPSTASH_VECTOR_REST_TOKEN - Upstash Vector auth token
 *
 * If Upstash credentials are not set, falls back to LanceDB (local file store).
 */

import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { randomUUID } from "node:crypto";
import { Embeddings } from "./rag/embeddings.js";
import { createStore, detectStoreMode, type DocsChunk } from "./rag/store-factory.js";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const root = path.resolve(__dirname, "../..");
const defaultDocsDir = path.join(root, "docs");

// Parse CLI arguments
const args = process.argv.slice(2);
let docsDir = defaultDocsDir;
let baseUrl = "https://docs.openclaw.ai";

for (let i = 0; i < args.length; i++) {
  if (args[i] === "--docs" && args[i + 1]) {
    docsDir = path.resolve(args[++i]);
  } else if (args[i] === "--base-url" && args[i + 1]) {
    baseUrl = args[++i].replace(/\/$/, "");
  }
}

// Validate API key
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
  console.error("Error: OPENAI_API_KEY environment variable is required");
  process.exit(1);
}

interface RawChunk {
  path: string;
  title: string;
  content: string;
  url: string;
}

// Chunking configuration for RAG retrieval.
// English text averages roughly 4 chars per token.
const TARGET_CHUNK_CHARS = 2400; // ~600 tokens
const MAX_CHUNK_CHARS = 4000; // ~1000 tokens
const OVERLAP_CHARS = 400; // ~100 tokens of overlap

/**
 * Split a large chunk into smaller pieces with overlap.
 * Splits on paragraph boundaries when possible and falls back to sentence
 * boundaries for oversized paragraphs.
 */
function splitLargeChunk(chunk: RawChunk): RawChunk[] {
  const content = chunk.content;

  // If the chunk is within limits, return it as-is
  if (content.length <= MAX_CHUNK_CHARS) {
    return [chunk];
  }

  const results: RawChunk[] = [];
  const paragraphs = content.split(/\n\n+/);

  let currentContent = "";
  let partIndex = 0;

  const flushChunk = (text: string) => {
    const trimmed = text.trim();
    if (!trimmed) return;
    partIndex++;
    results.push({
      ...chunk,
      title: partIndex > 1 ? `${chunk.title} (part ${partIndex})` : chunk.title,
      content: trimmed,
    });
  };

  for (const para of paragraphs) {
    // If adding this paragraph would exceed the max, flush current and start new
    if (
      currentContent.length > 0 &&
      currentContent.length + para.length + 2 > MAX_CHUNK_CHARS
    ) {
      flushChunk(currentContent);

      // Start the new chunk with overlap from the end of the previous one
      const overlapStart = Math.max(0, currentContent.length - OVERLAP_CHARS);
      // Find a good break point (sentence boundary) within the overlap window
      let overlapText = currentContent.slice(overlapStart);
      const sentenceBreak = overlapText.search(/[.!?]\s+/);
      if (sentenceBreak > 0) {
        overlapText = overlapText.slice(sentenceBreak + 1).trim();
      }
      currentContent = overlapText;
    }

    // If a single paragraph exceeds the max, split it further
    if (para.length > MAX_CHUNK_CHARS) {
      // Flush any accumulated content first
      if (currentContent.length > 0) {
        flushChunk(currentContent);
        currentContent = "";
      }

      // Split the long paragraph on sentence boundaries
      const sentences = para.split(/(?<=[.!?])\s+/);
      let sentenceBuffer = "";

      for (const sentence of sentences) {
        if (
          sentenceBuffer.length > 0 &&
          sentenceBuffer.length + sentence.length + 1 > MAX_CHUNK_CHARS
        ) {
          flushChunk(sentenceBuffer);
          // Carry overlap from the previous sentence buffer
          const overlapStart = Math.max(0, sentenceBuffer.length - OVERLAP_CHARS);
          sentenceBuffer = sentenceBuffer.slice(overlapStart).trim();
        }
        sentenceBuffer += (sentenceBuffer ? " " : "") + sentence;
      }

      if (sentenceBuffer) {
        currentContent = sentenceBuffer;
      }
    } else {
      currentContent += (currentContent ? "\n\n" : "") + para;
    }
  }

  // Flush any remaining content
  if (currentContent.trim()) {
    flushChunk(currentContent);
  }

  return results;
}

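/**
 * Strip a leading YAML frontmatter block (delimited by `---` lines) from a
 * markdown file. Returns the content unchanged if no frontmatter is present
 * or the closing delimiter is missing.
 */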
function stripFrontmatter(content: string): string {
  if (!content.startsWith("---")) return content;
  const end = content.indexOf("\n---", 3);
  if (end === -1) return content;
  return content.slice(end + 4);
}

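/**
 * Recursively collect markdown (.md/.mdx) files under a directory, skipping
 * hidden directories, i18n scaffolding, and non-English translations.
 */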
function walk(dir: string): string[] {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  const files: string[] = [];
  for (const entry of entries) {
    const full = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Skip hidden dirs, i18n, and non-English content
      if (
        entry.name === ".i18n" ||
        entry.name === "zh-CN" ||
        entry.name.startsWith(".")
      ) {
        continue;
      }
      files.push(...walk(full));
    } else if (entry.isFile() && /\.mdx?$/.test(entry.name)) {
      files.push(full);
    }
  }
  return files;
}

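/**
 * Split one markdown file into section-level chunks on `##` headings. Each
 * chunk records its source path, section title (falling back to the file
 * basename), body text, and public docs URL derived from baseUrl.
 */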
function extractChunks(filePath: string, content: string): RawChunk[] {
  const chunks: RawChunk[] = [];
  const lines = content.split(/\r?\n/);
  let currentTitle = "";
  let currentLines: string[] = [];

  const flush = (title: string, body: string) => {
    const text = body.trim();
    if (!text) return;
    const rel = path.relative(docsDir, filePath).replace(/\\/g, "/");
    const urlPath = rel.replace(/\.mdx?$/, "").replace(/^\/+/, "");
    chunks.push({
      path: rel,
      title: title || path.basename(rel, path.extname(rel)),
      content: text,
      url: `${baseUrl}/${urlPath}`,
    });
  };

  for (const line of lines) {
    const heading = line.match(/^##\s+(.+)$/);
    if (heading) {
      flush(currentTitle, currentLines.join("\n"));
      currentTitle = heading[1].trim();
      currentLines = [];
    } else {
      currentLines.push(line);
    }
  }
  flush(currentTitle, currentLines.join("\n"));
  return chunks;
}

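/**
 * Pipeline: walk the docs tree, chunk each file by heading, split oversized
 * chunks, embed title + content for each chunk, then replace the contents of
 * the vector store (Upstash or LanceDB, auto-detected from the environment).
 */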
async function main() {
  console.error(`Scanning docs at: ${docsDir}`);

  // Collect all raw chunks from the docs
  const rawChunks: RawChunk[] = [];
  for (const filePath of walk(docsDir)) {
    const raw = fs.readFileSync(filePath, "utf8");
    const body = stripFrontmatter(raw);
    rawChunks.push(...extractChunks(filePath, body));
  }

  console.error(`Found ${rawChunks.length} sections from docs`);

  // Split large chunks to stay within embedding model limits and improve retrieval
  const allRawChunks: RawChunk[] = [];
  for (const chunk of rawChunks) {
    allRawChunks.push(...splitLargeChunk(chunk));
  }

  console.error(
    `Split into ${allRawChunks.length} chunks (target: ~${TARGET_CHUNK_CHARS} chars, max: ${MAX_CHUNK_CHARS} chars, overlap: ${OVERLAP_CHARS} chars)`,
  );

  if (allRawChunks.length === 0) {
    console.error("No chunks found, exiting.");
    process.exit(0);
  }

  // Initialize embeddings with text-embedding-3-large for better retrieval quality
  const embeddings = new Embeddings(apiKey!);
  console.error(`Generating embeddings with model: text-embedding-3-large`);

  // Generate embeddings for all chunks.
  // Embed title + content together for a better semantic representation.
  const textsToEmbed = allRawChunks.map(
    (chunk) => `${chunk.title}\n${chunk.content}`,
  );

  console.error(`Embedding ${textsToEmbed.length} chunks in batches...`);
  const vectors = await embeddings.embedBatch(textsToEmbed);

  // Create DocsChunk objects with embeddings
  const docsChunks: DocsChunk[] = allRawChunks.map((chunk, i) => ({
    id: randomUUID(),
    path: chunk.path,
    title: chunk.title,
    content: chunk.content,
    url: chunk.url,
    vector: vectors[i],
  }));

  // Store in the vector database (auto-detects Upstash or LanceDB)
  const storeMode = detectStoreMode();
  console.error(
    `Storing in ${storeMode === "upstash" ? "Upstash Vector" : "LanceDB (local)"}...`,
  );
  const { store, mode } = await createStore();
  await store.replaceAll(docsChunks);

  const count = await store.count();
  console.error(
    `Done! Stored ${count} chunks in ${mode === "upstash" ? "Upstash Vector" : "LanceDB"}.`,
  );
}

main().catch((err) => {
  console.error("Error:", err);
  process.exit(1);
});
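
// Example invocation (a sketch; the key and token values below are
// placeholders, and the Upstash variables are optional; without them the
// script falls back to the local LanceDB store):
//
//   OPENAI_API_KEY=sk-... \
//   UPSTASH_VECTOR_REST_URL=https://example-index.upstash.io \
//   UPSTASH_VECTOR_REST_TOKEN=... \
//   bun build-vector-index.ts --docs ./docs --base-url https://docs.openclaw.ai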