Tighten parity proof heuristics

This commit is contained in:
Eva
2026-04-11 12:30:15 +07:00
committed by Peter Steinberger
parent fd45ea2bf1
commit 17252df122
4 changed files with 32 additions and 4 deletions

View File

@@ -130,6 +130,27 @@ describe("qa agentic parity report", () => {
);
});
// Regression case: a scenario reported as "pass" whose details contain
// "Failed:" and "Blocked:" section headings followed by neutral bullet text
// must not be counted as a fake success.
// NOTE(review): presumably this guards the suspicious-pass heuristics against
// matching bare report headings — confirm against computeQaAgenticParityMetrics.
it("ignores neutral Failed and Blocked headings in passing protocol reports", () => {
// Fixture: one passing scenario; its details use the
// Worked / Failed / Blocked / Follow-up report layout.
const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Source and docs discovery report",
status: "pass",
details: `Worked:
- Read the seeded QA material.
Failed:
- None observed.
Blocked:
- No live provider evidence in this lane.
Follow-up:
- Re-run with a real provider if needed.`,
},
],
};
// The headings alone must leave fakeSuccessCount at zero.
expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(0);
});
it("renders a readable markdown parity report", () => {
const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",

View File

@@ -68,9 +68,11 @@ const SUSPICIOUS_PASS_PATTERNS = [
/incomplete turn/i,
/\btimed out\b/i,
/\btimeout\b/i,
/\bblocked\b/i,
/\berror\b/i,
/\bfailed\b/i,
/\bfailed to\b/i,
/\bcould not\b/i,
/\bunable to\b/i,
/did not continue/i,
] as const;
function normalizeScenarioStatus(status: string | undefined): "pass" | "fail" | "skip" {

View File

@@ -228,7 +228,7 @@ describe("qa mock openai server", () => {
},
{
type: "function_call_output",
output: "Replay safety: unsafe after write.\n",
output: "Successfully wrote 41 bytes to compaction-retry-summary.txt.",
},
],
}),

View File

@@ -453,7 +453,12 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
}
if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
if (toolOutput.includes("Replay safety: unsafe after write.")) {
if (
toolOutput.includes("Replay safety: unsafe after write.") ||
/compaction-retry-summary\.txt/i.test(toolOutput) ||
/successfully (?:wrote|replaced)/i.test(toolOutput) ||
/\bwrote\b.*\bcompaction-retry-summary\.txt\b/i.test(toolOutput)
) {
return "Protocol note: replay unsafe after write.";
}
return "";