ci: gate releases on live cache floors

This commit is contained in:
Peter Steinberger
2026-04-04 15:44:21 +09:00
parent be4eb269fc
commit 6e6b4f6004
8 changed files with 634 additions and 8 deletions

View File

@@ -0,0 +1,79 @@
/**
 * Threshold entry for one provider/lane cache baseline.
 *
 * The `observed*` fields record what a reference live run produced when the
 * baseline was captured; in this file they are informational only. The `min*`
 * fields are floors a live run must meet, and the `max*` fields are ceilings
 * used by lanes where caching is expected to be off.
 */
export type LiveCacheFloor = {
  // Reference values captured when the baseline was recorded (not enforced here).
  observedCacheRead?: number;
  observedCacheWrite?: number;
  observedHitRate?: number;
  // Enforced floors: a live run below any of these is a regression.
  minCacheRead?: number;
  minCacheWrite?: number;
  minHitRate?: number;
  // Enforced ceilings: a cache-disabled run above these is a regression.
  maxCacheRead?: number;
  maxCacheWrite?: number;
};
/**
 * Stored cache baselines, keyed by provider then lane.
 *
 * The enforced `min*` / `max*` bounds are set with headroom relative to the
 * `observed*` values — presumably so routine provider-side jitter does not
 * trip the release gate; re-capture when prompts or models change.
 */
export const LIVE_CACHE_REGRESSION_BASELINE = {
  anthropic: {
    // Lane run with cache retention "none": reads/writes must stay near zero.
    disabled: {
      observedCacheRead: 0,
      observedCacheWrite: 0,
      maxCacheRead: 32,
      maxCacheWrite: 32,
    },
    // Conversation history that includes an inline PNG image turn.
    image: {
      observedCacheRead: 5_660,
      observedCacheWrite: 85,
      observedHitRate: 0.985,
      minCacheRead: 4_500,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Tool round-trip lane using an MCP-style tool name.
    mcp: {
      observedCacheRead: 6_240,
      observedCacheWrite: 113,
      observedHitRate: 0.982,
      minCacheRead: 5_800,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Plain single-turn conversation under the stable system prefix.
    stable: {
      observedCacheRead: 5_660,
      observedCacheWrite: 18,
      observedHitRate: 0.996,
      minCacheRead: 5_400,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
    // Tool round-trip lane using the plain noop tool.
    tool: {
      observedCacheRead: 6_223,
      observedCacheWrite: 97,
      observedHitRate: 0.984,
      minCacheRead: 5_000,
      minCacheWrite: 1,
      minHitRate: 0.97,
    },
  },
  // OpenAI entries carry no cacheWrite floors — only reads and hit rate are
  // gated for this provider.
  openai: {
    image: {
      observedCacheRead: 4_864,
      observedHitRate: 0.954,
      minCacheRead: 3_840,
      minHitRate: 0.82,
    },
    mcp: {
      observedCacheRead: 4_608,
      observedHitRate: 0.891,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
    stable: {
      observedCacheRead: 4_864,
      observedHitRate: 0.966,
      minCacheRead: 4_608,
      minHitRate: 0.9,
    },
    tool: {
      observedCacheRead: 4_608,
      observedHitRate: 0.896,
      minCacheRead: 4_096,
      minHitRate: 0.85,
    },
  },
  // `satisfies` validates the shape while keeping literal provider/lane keys.
} as const satisfies Record<string, Record<string, LiveCacheFloor>>;

View File

@@ -0,0 +1,472 @@
import fs from "node:fs/promises";
import type { AssistantMessage, Message, Tool } from "@mariozechner/pi-ai";
import { Type } from "@sinclair/typebox";
import { LIVE_CACHE_REGRESSION_BASELINE } from "./live-cache-regression-baseline.js";
import {
buildAssistantHistoryTurn,
buildStableCachePrefix,
completeSimpleWithLiveTimeout,
computeCacheHitRate,
extractAssistantText,
logLiveCache,
resolveLiveDirectModel,
} from "./live-cache-test-support.js";
// Per-request wall-clock budgets for live provider calls (currently equal).
const OPENAI_TIMEOUT_MS = 120_000;
const ANTHROPIC_TIMEOUT_MS = 120_000;
// Stable system-prompt prefixes; presumably the byte-identical prefix across
// runs is what makes provider-side prompt caching effective — see
// buildStableCachePrefix in live-cache-test-support.
const OPENAI_PREFIX = buildStableCachePrefix("openai");
const OPENAI_MCP_PREFIX = buildStableCachePrefix("openai-mcp-style");
const ANTHROPIC_PREFIX = buildStableCachePrefix("anthropic");
// A real PNG from the repo, used as a byte-stable image attachment for the
// image lane.
const LIVE_TEST_PNG_URL = new URL(
  "../../apps/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png",
  import.meta.url,
);
// Resolved model fixture (model handle + API key) from live-cache-test-support.
type LiveResolvedModel = Awaited<ReturnType<typeof resolveLiveDirectModel>>;
type ProviderKey = keyof typeof LIVE_CACHE_REGRESSION_BASELINE;
type CacheLane = "image" | "mcp" | "stable" | "tool";
// One completed probe: echoed suffix, assistant text, raw usage, derived hit rate.
type CacheRun = {
  hitRate: number;
  suffix: string;
  text: string;
  usage: AssistantMessage["usage"];
};
// Outcome of a lane; which fields are present depends on the lane kind.
type LaneResult = {
  best?: CacheRun;
  disabled?: CacheRun;
  warmup?: CacheRun;
};
export type LiveCacheRegressionResult = {
  regressions: string[];
  summary: Record<string, Record<string, unknown>>;
};
// Minimal no-argument tool for the plain tool lane.
const NOOP_TOOL: Tool = {
  name: "noop",
  description: "Return ok.",
  parameters: Type.Object({}, { additionalProperties: false }),
};
// Same shape but named like an MCP bundle tool ("server__tool") for the mcp lane.
const MCP_TOOL: Tool = {
  name: "bundleProbe__bundle_probe",
  description: "Return bundle MCP probe text.",
  parameters: Type.Object({}, { additionalProperties: false }),
};
/** Wrap user-role content in a Message envelope stamped with the current time. */
function makeUserTurn(content: Extract<Message, { role: "user" }>["content"]): Message {
  const timestamp = Date.now();
  return { content, role: "user", timestamp };
}
/** Build a user turn carrying a short text note plus an inline base64 PNG block. */
function makeImageUserTurn(text: string, pngBase64: string): Message {
  const blocks: Extract<Message, { role: "user" }>["content"] = [
    { type: "text", text },
    { type: "image", mimeType: "image/png", data: pngBase64 },
  ];
  return makeUserTurn(blocks);
}
/**
 * Construct a successful (non-error) toolResult message that echoes `text`
 * back to the model as a single text block.
 */
function makeToolResultMessage(
  toolCallId: string,
  toolName: string,
  text: string,
): Extract<Message, { role: "toolResult" }> {
  return {
    content: [{ type: "text", text }],
    isError: false,
    role: "toolResult",
    timestamp: Date.now(),
    toolCallId,
    toolName,
  };
}
/** Return the first toolCall content block of an assistant message, or undefined. */
function extractFirstToolCall(message: AssistantMessage) {
  for (const block of message.content) {
    if (block.type === "toolCall") {
      return block;
    }
  }
  return undefined;
}
/**
 * Narrowing assertion: throws an Error with `message` when `condition` is
 * falsy; on return the compiler treats `condition` as established.
 */
function assert(condition: unknown, message: string): asserts condition {
  if (!condition) {
    throw new Error(message);
  }
}
/**
 * Drive the model to produce a tool-only turn: exactly one call to
 * `params.tool` and no assistant prose.
 *
 * Retries up to twice with a stricter prompt when the response either lacks a
 * tool call or contains extra text, then hard-asserts the final turn is
 * tool-only. Returns the prompt used, the raw assistant response, and the
 * extracted tool-call block so callers can replay the turn as history.
 */
async function runToolOnlyTurn(params: {
  apiKey: string;
  cacheRetention: "none" | "short" | "long";
  model: LiveResolvedModel["model"];
  providerTag: "anthropic" | "openai";
  sessionId: string;
  systemPrompt: string;
  tool: Tool;
}) {
  // Budgets are currently equal; the branch keeps the per-provider knobs separate.
  const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
  const options = {
    apiKey: params.apiKey,
    cacheRetention: params.cacheRetention,
    sessionId: params.sessionId,
    maxTokens: 128,
    temperature: 0,
    // NOTE(review): double-cast (`as unknown as never`) silences a type
    // mismatch on the OpenAI `reasoning` option — confirm against the pi-ai
    // completion option types rather than keeping the cast.
    ...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
  };
  let prompt = `Call the tool \`${params.tool.name}\` with {}. IMPORTANT: respond ONLY with the tool call and no other text.`;
  let response = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: params.systemPrompt,
      messages: [makeUserTurn(prompt)],
      tools: [params.tool],
    },
    options,
    `${params.providerTag} ${params.tool.name} tool-only turn`,
    timeoutMs,
  );
  let toolCall = extractFirstToolCall(response);
  let text = extractAssistantText(response);
  // Retry (at most twice) while the response is not tool-only.
  for (let attempt = 0; attempt < 2 && (!toolCall || text.length > 0); attempt += 1) {
    prompt = `Return only a tool call for \`${params.tool.name}\` with {}. No text.`;
    response = await completeSimpleWithLiveTimeout(
      params.model,
      {
        systemPrompt: params.systemPrompt,
        messages: [makeUserTurn(prompt)],
        tools: [params.tool],
      },
      options,
      `${params.providerTag} ${params.tool.name} tool-only retry ${attempt + 1}`,
      timeoutMs,
    );
    toolCall = extractFirstToolCall(response);
    text = extractAssistantText(response);
  }
  // Fail the run outright if the model never produced a clean tool-only turn.
  assert(toolCall, `expected tool call for ${params.tool.name}`);
  assert(
    text.length === 0,
    `expected tool-only response for ${params.tool.name}, got ${JSON.stringify(text)}`,
  );
  assert(toolCall.type === "toolCall", `expected toolCall block for ${params.tool.name}`);
  // Callers replay { prompt, response } as conversation history and feed
  // toolCall.id into a toolResult message.
  return {
    prompt,
    response,
    toolCall,
  };
}
/**
 * Send one cache-probe completion and return the echoed suffix, assistant
 * text, raw usage, and derived cache hit rate.
 *
 * The prompts instruct the model to reply "CACHE-OK <suffix>", so the probe
 * asserts the response actually contains `params.suffix` (case-insensitive)
 * to guard against a reply that ignored the instruction.
 */
async function completeCacheProbe(params: {
  apiKey: string;
  cacheRetention: "none" | "short" | "long";
  messages: Message[];
  model: LiveResolvedModel["model"];
  providerTag: "anthropic" | "openai";
  sessionId: string;
  suffix: string;
  systemPrompt: string;
  tools?: Tool[];
  maxTokens?: number;
}): Promise<CacheRun> {
  const timeoutMs = params.providerTag === "openai" ? OPENAI_TIMEOUT_MS : ANTHROPIC_TIMEOUT_MS;
  const response = await completeSimpleWithLiveTimeout(
    params.model,
    {
      systemPrompt: params.systemPrompt,
      messages: params.messages,
      // Only pass a tools array when the lane supplies one; presumably the
      // request shape affects what gets cached.
      ...(params.tools ? { tools: params.tools } : {}),
    },
    {
      apiKey: params.apiKey,
      cacheRetention: params.cacheRetention,
      sessionId: params.sessionId,
      maxTokens: params.maxTokens ?? 64,
      temperature: 0,
      // NOTE(review): same `reasoning` double-cast as runToolOnlyTurn — confirm
      // against the pi-ai option types.
      ...(params.providerTag === "openai" ? { reasoning: "none" as unknown as never } : {}),
    },
    `${params.providerTag} cache lane ${params.suffix}`,
    timeoutMs,
  );
  const text = extractAssistantText(response);
  assert(
    text.toLowerCase().includes(params.suffix.toLowerCase()),
    `expected response to contain ${params.suffix}, got ${JSON.stringify(text)}`,
  );
  return {
    suffix: params.suffix,
    text,
    usage: response.usage,
    hitRate: computeCacheHitRate(response.usage),
  };
}
/**
 * Run one cache lane three times — a warmup followed by two "hit" probes —
 * and return the warmup plus whichever hit observed the larger cacheRead.
 *
 * Lane shapes:
 *  - "stable": single user turn under the stable system prefix.
 *  - "image":  history containing an inline PNG turn plus filler turns.
 *  - "tool" / "mcp": a real tool-call round trip replayed as history
 *    ("mcp" uses the MCP-style tool name and bundle-probe output text).
 *
 * The system prompt embeds `runToken`, making the cacheable prefix unique per
 * invocation — presumably so earlier runs cannot satisfy this run's warmup.
 */
async function runRepeatedLane(params: {
  lane: CacheLane;
  providerTag: "anthropic" | "openai";
  fixture: LiveResolvedModel;
  runToken: string;
  sessionId: string;
  pngBase64: string;
}): Promise<LaneResult> {
  const suffixBase = `${params.providerTag}-${params.lane}`;
  // OpenAI's mcp lane gets its own prefix; all Anthropic lanes share one.
  const systemPromptBase =
    params.providerTag === "openai"
      ? params.lane === "mcp"
        ? OPENAI_MCP_PREFIX
        : OPENAI_PREFIX
      : ANTHROPIC_PREFIX;
  const systemPrompt = `${systemPromptBase}\nRun token: ${params.runToken}\nLane: ${params.providerTag}-${params.lane}\n`;
  // `run` captures the lane-specific probe so warmup and hits are identical
  // apart from the echoed suffix.
  const run =
    params.lane === "stable"
      ? (suffix: string) =>
          completeCacheProbe({
            apiKey: params.fixture.apiKey,
            cacheRetention: "short",
            messages: [makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`)],
            model: params.fixture.model,
            providerTag: params.providerTag,
            sessionId: params.sessionId,
            suffix,
            systemPrompt,
            maxTokens: 32,
          })
      : params.lane === "image"
        ? (suffix: string) =>
            completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeImageUserTurn(
                  "An image is attached. Ignore image semantics but keep the bytes in history.",
                  params.pngBase64,
                ),
                buildAssistantHistoryTurn("IMAGE HISTORY ACKNOWLEDGED", params.fixture.model),
                makeUserTurn("Keep the earlier image turn stable in context."),
                buildAssistantHistoryTurn("IMAGE HISTORY PRESERVED", params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
            })
        : async (suffix: string) => {
            // tool / mcp lanes: first elicit a live tool-only turn, then replay
            // that turn (plus its toolResult) as stable history for the probe.
            const tool = params.lane === "mcp" ? MCP_TOOL : NOOP_TOOL;
            const toolText = params.lane === "mcp" ? "FROM-BUNDLE" : "ok";
            const historyPrefix = params.lane === "mcp" ? "MCP TOOL HISTORY" : "TOOL HISTORY";
            const toolTurn = await runToolOnlyTurn({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              systemPrompt,
              tool,
            });
            return await completeCacheProbe({
              apiKey: params.fixture.apiKey,
              cacheRetention: "short",
              messages: [
                makeUserTurn(toolTurn.prompt),
                toolTurn.response,
                makeToolResultMessage(toolTurn.toolCall.id, tool.name, toolText),
                buildAssistantHistoryTurn(`${historyPrefix} ACKNOWLEDGED`, params.fixture.model),
                makeUserTurn(
                  params.lane === "mcp"
                    ? "Keep the MCP tool output stable in history."
                    : "Keep the tool output stable in history.",
                ),
                buildAssistantHistoryTurn(`${historyPrefix} PRESERVED`, params.fixture.model),
                makeUserTurn(`Reply with exactly CACHE-OK ${suffix}.`),
              ],
              model: params.fixture.model,
              providerTag: params.providerTag,
              sessionId: params.sessionId,
              suffix,
              systemPrompt,
              tools: [tool],
            });
          };
  const warmup = await run(`${suffixBase}-warmup`);
  const hitA = await run(`${suffixBase}-hit-a`);
  const hitB = await run(`${suffixBase}-hit-b`);
  // Keep the stronger of the two hit runs to reduce flake from partial cache hits.
  const best = (hitA.usage.cacheRead ?? 0) >= (hitB.usage.cacheRead ?? 0) ? hitA : hitB;
  return { best, warmup };
}
/**
 * Probe the Anthropic model with cacheRetention "none" so the baseline's
 * `max*` ceilings can verify that no cache reads/writes happen when caching
 * is disabled. Returns only a `disabled` run (no warmup/best).
 */
async function runAnthropicDisabledLane(params: {
  fixture: LiveResolvedModel;
  runToken: string;
  sessionId: string;
}): Promise<LaneResult> {
  const disabled = await completeCacheProbe({
    apiKey: params.fixture.apiKey,
    cacheRetention: "none",
    messages: [makeUserTurn("Reply with exactly CACHE-OK anthropic-disabled.")],
    model: params.fixture.model,
    providerTag: "anthropic",
    sessionId: params.sessionId,
    suffix: "anthropic-disabled",
    systemPrompt: `${ANTHROPIC_PREFIX}\nRun token: ${params.runToken}\nLane: anthropic-disabled\n`,
    maxTokens: 32,
  });
  return { disabled };
}
/** Render cache/input token counts as a compact single-line log summary. */
function formatUsage(usage: AssistantMessage["usage"]) {
  const read = usage.cacheRead ?? 0;
  const write = usage.cacheWrite ?? 0;
  const input = usage.input ?? 0;
  return `cacheRead=${read} cacheWrite=${write} input=${input}`;
}
/**
 * Compare one lane's live results against the stored baseline and append a
 * human-readable message to `params.regressions` for every violated bound.
 *
 * - the `best` run is checked against the cacheRead and hit-rate floors;
 * - the `warmup` run is checked against the cacheWrite floor;
 * - the `disabled` run is checked against the cacheRead/cacheWrite ceilings.
 * A missing baseline entry is itself reported as a regression.
 */
function assertAgainstBaseline(params: {
  lane: string;
  provider: ProviderKey;
  result: LaneResult;
  regressions: string[];
}) {
  const { lane, provider, result, regressions } = params;
  const providerBaseline = LIVE_CACHE_REGRESSION_BASELINE[provider];
  const floor = providerBaseline[lane as keyof typeof providerBaseline];
  if (!floor) {
    regressions.push(`${provider}:${lane} missing baseline entry`);
    return;
  }
  const { best, disabled, warmup } = result;
  if (best) {
    const read = best.usage.cacheRead ?? 0;
    if (read < (floor.minCacheRead ?? 0)) {
      regressions.push(`${provider}:${lane} cacheRead=${read} < min=${floor.minCacheRead}`);
    }
    if (best.hitRate < (floor.minHitRate ?? 0)) {
      regressions.push(
        `${provider}:${lane} hitRate=${best.hitRate.toFixed(3)} < min=${floor.minHitRate?.toFixed(3)}`,
      );
    }
  }
  if (warmup) {
    const write = warmup.usage.cacheWrite ?? 0;
    if (write < (floor.minCacheWrite ?? 0)) {
      regressions.push(
        `${provider}:${lane} warmup cacheWrite=${write} < min=${floor.minCacheWrite}`,
      );
    }
  }
  if (disabled) {
    const read = disabled.usage.cacheRead ?? 0;
    if (read > (floor.maxCacheRead ?? Number.POSITIVE_INFINITY)) {
      regressions.push(`${provider}:${lane} cacheRead=${read} > max=${floor.maxCacheRead}`);
    }
    const write = disabled.usage.cacheWrite ?? 0;
    if (write > (floor.maxCacheWrite ?? Number.POSITIVE_INFINITY)) {
      regressions.push(`${provider}:${lane} cacheWrite=${write} > max=${floor.maxCacheWrite}`);
    }
  }
}
/**
 * Entry point: run every cache lane against live OpenAI and Anthropic models
 * and compare the results with LIVE_CACHE_REGRESSION_BASELINE.
 *
 * Lanes run sequentially with session ids derived from a per-run token.
 * Returns the accumulated regression messages (empty on success) and a
 * per-provider usage summary for logging; it does not throw on regressions —
 * the caller (the vitest spec) asserts the list is empty.
 */
export async function runLiveCacheRegression(): Promise<LiveCacheRegressionResult> {
  // Byte-stable image payload for the image lane.
  const pngBase64 = (await fs.readFile(LIVE_TEST_PNG_URL)).toString("base64");
  // Unique per-run token, embedded in prompts and session ids.
  const runToken = `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 8)}`;
  const openai = await resolveLiveDirectModel({
    provider: "openai",
    api: "openai-responses",
    envVar: "OPENCLAW_LIVE_OPENAI_CACHE_MODEL",
    preferredModelIds: ["gpt-5.4-mini", "gpt-5.4", "gpt-5.2"],
  });
  const anthropic = await resolveLiveDirectModel({
    provider: "anthropic",
    api: "anthropic-messages",
    envVar: "OPENCLAW_LIVE_ANTHROPIC_CACHE_MODEL",
    preferredModelIds: ["claude-sonnet-4-6", "claude-sonnet-4-5", "claude-haiku-3-5"],
  });
  const regressions: string[] = [];
  const summary: Record<string, Record<string, unknown>> = {
    anthropic: {},
    openai: {},
  };
  for (const lane of ["stable", "tool", "image", "mcp"] as const) {
    const openaiResult = await runRepeatedLane({
      lane,
      providerTag: "openai",
      fixture: openai,
      runToken,
      sessionId: `live-cache-regression-${runToken}-openai-${lane}`,
      pngBase64,
    });
    // NOTE(review): `?? {}` feeds an empty object into formatUsage when a run
    // is absent — this leans on formatUsage's `?? 0` defaults; confirm `{}` is
    // assignable to AssistantMessage["usage"].
    logLiveCache(
      `openai ${lane} warmup ${formatUsage(openaiResult.warmup?.usage ?? {})} rate=${openaiResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    logLiveCache(
      `openai ${lane} best ${formatUsage(openaiResult.best?.usage ?? {})} rate=${openaiResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    summary.openai[lane] = {
      best: openaiResult.best?.usage,
      hitRate: openaiResult.best?.hitRate,
      warmup: openaiResult.warmup?.usage,
    };
    assertAgainstBaseline({
      lane,
      provider: "openai",
      result: openaiResult,
      regressions,
    });
    const anthropicResult = await runRepeatedLane({
      lane,
      providerTag: "anthropic",
      fixture: anthropic,
      runToken,
      sessionId: `live-cache-regression-${runToken}-anthropic-${lane}`,
      pngBase64,
    });
    logLiveCache(
      `anthropic ${lane} warmup ${formatUsage(anthropicResult.warmup?.usage ?? {})} rate=${anthropicResult.warmup?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    logLiveCache(
      `anthropic ${lane} best ${formatUsage(anthropicResult.best?.usage ?? {})} rate=${anthropicResult.best?.hitRate.toFixed(3) ?? "0.000"}`,
    );
    summary.anthropic[lane] = {
      best: anthropicResult.best?.usage,
      hitRate: anthropicResult.best?.hitRate,
      warmup: anthropicResult.warmup?.usage,
    };
    assertAgainstBaseline({
      lane,
      provider: "anthropic",
      result: anthropicResult,
      regressions,
    });
  }
  // Anthropic-only negative lane: caching off must stay within the max ceilings.
  const disabled = await runAnthropicDisabledLane({
    fixture: anthropic,
    runToken,
    sessionId: `live-cache-regression-${runToken}-anthropic-disabled`,
  });
  logLiveCache(`anthropic disabled ${formatUsage(disabled.disabled?.usage ?? {})}`);
  summary.anthropic.disabled = {
    disabled: disabled.disabled?.usage,
  };
  assertAgainstBaseline({
    lane: "disabled",
    provider: "anthropic",
    result: disabled,
    regressions,
  });
  logLiveCache(`cache regression summary ${JSON.stringify(summary)}`);
  return { regressions, summary };
}

View File

@@ -0,0 +1,16 @@
import { describe, expect, it } from "vitest";
import { runLiveCacheRegression } from "./live-cache-regression-runner.js";
import { LIVE_CACHE_TEST_ENABLED } from "./live-cache-test-support.js";

// Live-provider suite: skipped entirely unless the live cache test gate
// (LIVE_CACHE_TEST_ENABLED) is on, so normal CI never makes provider calls.
const describeCacheLive = LIVE_CACHE_TEST_ENABLED ? describe : describe.skip;

describeCacheLive("live cache regression", () => {
  it(
    "matches the stored provider cache baselines",
    async () => {
      const result = await runLiveCacheRegression();
      // Each entry is a formatted bound-violation message from the runner.
      expect(result.regressions).toEqual([]);
    },
    // 30-minute vitest timeout: the full lane matrix makes many live model calls.
    30 * 60_000,
  );
});