diff --git a/src/agents/live-cache-regression-baseline.ts b/src/agents/live-cache-regression-baseline.ts index a8d628272b4..77c30b1488e 100644 --- a/src/agents/live-cache-regression-baseline.ts +++ b/src/agents/live-cache-regression-baseline.ts @@ -3,6 +3,7 @@ export type LiveCacheFloor = { observedCacheWrite?: number; observedHitRate?: number; minCacheRead?: number; + minCacheReadOrWrite?: number; minCacheWrite?: number; minHitRate?: number; maxCacheRead?: number; @@ -37,9 +38,8 @@ export const LIVE_CACHE_REGRESSION_BASELINE = { observedCacheRead: 5_660, observedCacheWrite: 18, observedHitRate: 0.996, - minCacheRead: 5_400, + minCacheReadOrWrite: 5_400, minCacheWrite: 1, - minHitRate: 0.97, }, tool: { observedCacheRead: 6_223, diff --git a/src/agents/live-cache-regression-runner.ts b/src/agents/live-cache-regression-runner.ts index 609d81ec0ee..8bed81c072c 100644 --- a/src/agents/live-cache-regression-runner.ts +++ b/src/agents/live-cache-regression-runner.ts @@ -367,7 +367,14 @@ function assertAgainstBaseline(params: { if (params.result.best) { const usage = params.result.best.usage; - if ((usage.cacheRead ?? 0) < (floor.minCacheRead ?? 0)) { + if (floor.minCacheReadOrWrite !== undefined) { + const cacheReadOrWrite = Math.max(usage.cacheRead ?? 0, usage.cacheWrite ?? 0); + if (cacheReadOrWrite < floor.minCacheReadOrWrite) { + params.regressions.push( + `${params.provider}:${params.lane} cacheReadOrWrite=${cacheReadOrWrite} < min=${floor.minCacheReadOrWrite}`, + ); + } + } else if ((usage.cacheRead ?? 0) < (floor.minCacheRead ?? 0)) { params.regressions.push( `${params.provider}:${params.lane} cacheRead=${usage.cacheRead ?? 0} < min=${floor.minCacheRead}`, ); diff --git a/src/gateway/gateway-codex-harness.live-helpers.test.ts b/src/gateway/gateway-codex-harness.live-helpers.test.ts index fd2de9372f2..caa5ef46d15 100644 --- a/src/gateway/gateway-codex-harness.live-helpers.test.ts +++ b/src/gateway/gateway-codex-harness.live-helpers.test.ts @@ -159,6 +159,12 @@ describe("gateway codex harness live helpers", () => { "I couldn’t list them because the local `codex models` command requires elevated execution in this environment, and that request was rejected.", "I couldn’t list them because the local `codex models` command requires host permissions here, and that escalation was rejected.", "I couldn’t run `codex models` because the sandboxed attempt failed and the required elevated retry was not approved.", + [ + "I tried `codex models`, but the sandbox blocked it due to the kernel namespace restriction.", + "I then requested an escalated run, but the automatic approval review failed before it could be approved.", + "", + "I can’t safely run the command from here right now.", + ].join("\n"), ]; for (const text of texts) { diff --git a/src/gateway/gateway-codex-harness.live-helpers.ts b/src/gateway/gateway-codex-harness.live-helpers.ts index 71016e6528d..cee1176fc9c 100644 --- a/src/gateway/gateway-codex-harness.live-helpers.ts +++ b/src/gateway/gateway-codex-harness.live-helpers.ts @@ -85,6 +85,9 @@ export function isExpectedCodexModelsCommandText(text: string): boolean { normalized.includes("fails to start") || normalized.includes("repo-local fallback") || normalized.includes("sandbox blocks") || + normalized.includes("sandbox blocked") || + normalized.includes("approval review failed") || + normalized.includes("failed before it could be approved") || ((normalized.includes("rejected") || normalized.includes("not approved")) && (normalized.includes("sandbox") || normalized.includes("permission") ||