feat: implement rate limiting and error handling in chat API

- Added rate limiting functionality to the chat API, allowing a maximum number of requests per IP within a specified time window.
- Implemented error handling for rate limit exceeded responses, including appropriate headers and retry instructions.
- Enhanced error handling for other API errors, providing user-friendly messages for various failure scenarios.
- Updated README to include new environment variables for rate limiting configuration.
This commit is contained in:
Buns Enchantress
2026-02-03 03:20:41 -06:00
parent 1de7b5c5c8
commit 4a543d15d1
3 changed files with 135 additions and 3 deletions

View File

@@ -23,7 +23,15 @@ This generates embeddings for all doc chunks and stores them in
OPENAI_API_KEY=sk-... pnpm docs:chat:serve:vector
```
Defaults to `http://localhost:3001`. Health check:
Defaults to `http://localhost:3001`. Optional environment variables:
| Variable | Default | Description |
|----------|---------|-------------|
| `PORT` | `3001` | Server port |
| `RATE_LIMIT` | `20` | Max requests per window per IP |
| `RATE_WINDOW_MS` | `60000` | Rate limit window in milliseconds |
Health check:
```bash
curl http://localhost:3001/health

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env bun
/**
* Docs-chat API with RAG (vector search).
* Env: OPENAI_API_KEY, DOCS_CHAT_DB, PORT
* Env: OPENAI_API_KEY, DOCS_CHAT_DB, PORT, RATE_LIMIT, RATE_WINDOW_MS
*/
import path from "node:path";
import { fileURLToPath } from "node:url";
@@ -15,6 +15,62 @@ const defaultDbPath = path.join(__dirname, ".lance-db");
const dbPath = process.env.DOCS_CHAT_DB || defaultDbPath;
const port = Number(process.env.PORT || 3001);
// Rate limiting configuration
/**
 * Parse a positive integer from the environment, falling back when the
 * variable is unset or invalid. The previous `Number(env || fallback)`
 * pattern produced NaN for non-numeric values (e.g. RATE_LIMIT=abc) and 0
 * for "0", which silently disabled or broke rate limiting.
 */
function envPositiveInt(name: string, fallback: number): number {
  const raw = process.env[name];
  const parsed = raw === undefined ? NaN : Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : fallback;
}
const RATE_LIMIT = envPositiveInt("RATE_LIMIT", 20); // requests per window
const RATE_WINDOW_MS = envPositiveInt("RATE_WINDOW_MS", 60_000); // 1 minute
const MAX_MESSAGE_LENGTH = 2000; // characters
const MAX_BODY_SIZE = 8192; // bytes
// In-memory rate limit store (IP -> { count, resetAt })
const rateLimitStore = new Map<string, { count: number; resetAt: number }>();
// Periodically clean up expired entries to prevent unbounded memory growth.
const cleanupTimer = setInterval(() => {
  const now = Date.now();
  for (const [ip, record] of rateLimitStore) {
    if (now > record.resetAt) {
      rateLimitStore.delete(ip);
    }
  }
}, RATE_WINDOW_MS);
// Don't let the cleanup timer alone keep the process alive (matters for
// graceful shutdown and test runs). The optional call is a no-op in
// runtimes whose timer handles lack unref().
(cleanupTimer as unknown as { unref?: () => void }).unref?.();
/**
 * Consume one request slot for `ip` in its current rate-limit window.
 *
 * Returns an object (not a number — the old comment claiming "remaining
 * requests or -1 if blocked" was stale): `allowed` is false once the IP has
 * used RATE_LIMIT requests in the window, `remaining` is the number of
 * slots left (0 when blocked), and `resetAt` is the window end as a ms
 * epoch timestamp, used for the X-RateLimit-* / Retry-After headers.
 */
function checkRateLimit(ip: string): { allowed: boolean; remaining: number; resetAt: number } {
  const now = Date.now();
  const record = rateLimitStore.get(ip);
  if (!record || now > record.resetAt) {
    // Start a fresh window for this IP (also reclaims an expired record).
    const resetAt = now + RATE_WINDOW_MS;
    rateLimitStore.set(ip, { count: 1, resetAt });
    return { allowed: true, remaining: RATE_LIMIT - 1, resetAt };
  }
  if (record.count >= RATE_LIMIT) {
    // Window exhausted: do not increment, just report when it reopens.
    return { allowed: false, remaining: 0, resetAt: record.resetAt };
  }
  record.count++;
  return { allowed: true, remaining: RATE_LIMIT - record.count, resetAt: record.resetAt };
}
/**
 * Extract the client IP for rate-limit bucketing.
 *
 * Checks `X-Forwarded-For` (first non-empty hop) and `X-Real-IP` before
 * falling back to the socket address; returns "unknown" when none are
 * available. NOTE(review): these headers are client-controlled — trust
 * them only when the server sits behind a proxy that overwrites them.
 */
function getClientIP(req: http.IncomingMessage): string {
  // Node types headers as string | string[] | undefined; normalize arrays.
  const forwarded = req.headers["x-forwarded-for"];
  const forwardedStr = Array.isArray(forwarded) ? forwarded[0] : forwarded;
  if (typeof forwardedStr === "string") {
    // Take the first non-empty entry so a malformed header ("" or
    // ", 1.2.3.4") doesn't collapse many clients into a single "" bucket.
    const first = forwardedStr
      .split(",")
      .map((part) => part.trim())
      .find((part) => part.length > 0);
    if (first) return first;
  }
  const realIp = req.headers["x-real-ip"];
  const realIpStr = Array.isArray(realIp) ? realIp[0] : realIp;
  if (typeof realIpStr === "string" && realIpStr.trim().length > 0) {
    return realIpStr.trim();
  }
  return req.socket.remoteAddress || "unknown";
}
// Validate API key
const apiKey = process.env.OPENAI_API_KEY;
if (!apiKey) {
@@ -91,8 +147,17 @@ async function streamOpenAI(
}
async function handleChat(req: http.IncomingMessage, res: http.ServerResponse) {
// Read body with size limit to prevent memory exhaustion
let body = "";
for await (const chunk of req) body += chunk;
let bodySize = 0;
for await (const chunk of req) {
bodySize += chunk.length;
if (bodySize > MAX_BODY_SIZE) {
sendJson(res, 413, { error: "Request too large" });
return;
}
body += chunk;
}
let message = "";
try {
@@ -107,6 +172,20 @@ async function handleChat(req: http.IncomingMessage, res: http.ServerResponse) {
return;
}
// Validate message length to prevent token stuffing
const trimmedMessage = message.trim();
if (!trimmedMessage) {
sendJson(res, 400, { error: "message required" });
return;
}
if (trimmedMessage.length > MAX_MESSAGE_LENGTH) {
sendJson(res, 400, {
error: `Message too long (max ${MAX_MESSAGE_LENGTH} characters)`,
});
return;
}
message = trimmedMessage;
// Use RAG retriever instead of keyword matching
const results = await retriever.retrieve(message, 8);
@@ -167,6 +246,25 @@ const server = http.createServer(async (req, res) => {
}
if (req.method === "POST" && req.url === "/chat") {
// Apply rate limiting
const clientIP = getClientIP(req);
const rateCheck = checkRateLimit(clientIP);
// Add rate limit headers
res.setHeader("X-RateLimit-Limit", RATE_LIMIT);
res.setHeader("X-RateLimit-Remaining", Math.max(0, rateCheck.remaining));
res.setHeader("X-RateLimit-Reset", Math.ceil(rateCheck.resetAt / 1000));
if (!rateCheck.allowed) {
const retryAfter = Math.ceil((rateCheck.resetAt - Date.now()) / 1000);
res.setHeader("Retry-After", retryAfter);
sendJson(res, 429, {
error: "Too many requests. Please wait before trying again.",
retryAfter,
});
return;
}
await handleChat(req, res);
return;
}
@@ -179,4 +277,7 @@ server.listen(port, async () => {
console.error(
`docs-chat API (RAG) running at http://localhost:${port} (chunks: ${count})`,
);
console.error(
`Rate limit: ${RATE_LIMIT} requests per ${RATE_WINDOW_MS / 1000}s window`,
);
});