openclaw/packages/memory-host-sdk/src/host/embedding-chunk-limits.test.ts

import { describe, expect, it } from "vitest";
import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
import { estimateUtf8Bytes } from "./embedding-input-limits.js";
import type { EmbeddingProvider } from "./embeddings.js";

function createProvider(maxInputTokens: number): EmbeddingProvider {
  return {
    id: "mock",
    model: "mock-embed",
    maxInputTokens,
    embedQuery: async () => [0],
    embedBatch: async () => [[0]],
  };
}

function createProviderWithoutMaxInputTokens(params: {
  id: string;
  model: string;
}): EmbeddingProvider {
  return {
    id: params.id,
    model: params.model,
    embedQuery: async () => [0],
    embedBatch: async () => [[0]],
  };
}

type EmbeddingChunks = ReturnType<typeof enforceEmbeddingMaxInputTokens>;

function expectChunksWithinUtf8Bytes(chunks: EmbeddingChunks, maxBytes: number) {
  const oversized = chunks
    .map((chunk, index) => ({ index, bytes: estimateUtf8Bytes(chunk.text) }))
    .filter((entry) => entry.bytes > maxBytes);
  expect(oversized).toEqual([]);
}

function expectChunksLineRange(chunks: EmbeddingChunks, startLine: number, endLine: number) {
  expect(chunks.map((chunk) => ({ startLine: chunk.startLine, endLine: chunk.endLine }))).toEqual(
    chunks.map(() => ({ startLine, endLine })),
  );
}

function expectChunksHaveHashes(chunks: EmbeddingChunks) {
  const invalidHashes = chunks
    .map((chunk, index) => ({ index, hash: chunk.hash }))
    .filter((entry) => typeof entry.hash !== "string" || entry.hash.length === 0);
  expect(invalidHashes).toEqual([]);
}

describe("embedding chunk limits", () => {
  it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
    const provider = createProvider(8192);
    const input = {
      startLine: 1,
      endLine: 1,
      text: "x".repeat(9000),
      hash: "ignored",
    };

    const out = enforceEmbeddingMaxInputTokens(provider, [input]);
    expect(out.length).toBeGreaterThan(1);
    expect(out.map((chunk) => chunk.text).join("")).toBe(input.text);
    expectChunksWithinUtf8Bytes(out, 8192);
    expectChunksLineRange(out, 1, 1);
    expectChunksHaveHashes(out);
  });

  it("does not split inside surrogate pairs (emoji)", () => {
    const provider = createProvider(8192);
    const emoji = "😀";
    const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;

    const out = enforceEmbeddingMaxInputTokens(provider, [
      { startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
    ]);

    expect(out.length).toBeGreaterThan(1);
    expect(out.map((chunk) => chunk.text).join("")).toBe(inputText);
    expectChunksWithinUtf8Bytes(out, 8192);

    // If we split inside surrogate pairs we'd likely end up with replacement chars.
    expect(out.map((chunk) => chunk.text).join("")).not.toContain("\uFFFD");
  });

  it("uses conservative fallback limits for local providers without declared maxInputTokens", () => {
    const provider = createProviderWithoutMaxInputTokens({
      id: "local",
      model: "unknown-local-embedding",
    });

    const out = enforceEmbeddingMaxInputTokens(provider, [
      {
        startLine: 1,
        endLine: 1,
        text: "x".repeat(3000),
        hash: "ignored",
      },
    ]);

    expect(out.length).toBeGreaterThan(1);
    expectChunksWithinUtf8Bytes(out, 2048);
  });

  it("honors hard safety caps lower than provider maxInputTokens", () => {
    const provider = createProvider(8192);
    const out = enforceEmbeddingMaxInputTokens(
      provider,
      [
        {
          startLine: 1,
          endLine: 1,
          text: "x".repeat(8100),
          hash: "ignored",
        },
      ],
      8000,
    );

    expect(out.length).toBeGreaterThan(1);
    expectChunksWithinUtf8Bytes(out, 8000);
  });
});