From fd45ea2bf17dba49a4dacd88e5c9580ce65e590a Mon Sep 17 00:00:00 2001
From: Eva <eva@100yen.org>
Date: Sat, 11 Apr 2026 05:35:08 +0700
Subject: [PATCH] test(qa): add compaction retry parity scenario

---
 .../gpt54-codex-agentic-parity-maintainers.md | 11 +++
 docs/help/gpt54-codex-agentic-parity.md       | 29 ++++--
 .../qa-lab/src/agentic-parity-report.test.ts  |  2 +
 extensions/qa-lab/src/agentic-parity.ts       |  4 +
 extensions/qa-lab/src/cli.runtime.test.ts     |  1 +
 .../qa-lab/src/mock-openai-server.test.ts     | 71 ++++++++++++++
 extensions/qa-lab/src/mock-openai-server.ts   | 17 ++++
 qa/frontier-harness-plan.md                   |  5 +-
 .../compaction-retry-mutating-tool.md         | 98 +++++++++++++++++++
 9 files changed, 230 insertions(+), 8 deletions(-)
 create mode 100644 qa/scenarios/compaction-retry-mutating-tool.md

diff --git a/docs/help/gpt54-codex-agentic-parity-maintainers.md b/docs/help/gpt54-codex-agentic-parity-maintainers.md
index 279dde7ce2f..a602cfe1ca7 100644
--- a/docs/help/gpt54-codex-agentic-parity-maintainers.md
+++ b/docs/help/gpt54-codex-agentic-parity-maintainers.md
@@ -105,6 +105,7 @@ PR D is the proof layer. It should not be the reason runtime-correctness PRs are
 ### PR D
 
 - the scenario pack is understandable and reproducible
+- the pack includes a mutating replay-safety lane, not only read-only flows
 - reports are readable by humans and automation
 - parity claims are evidence-backed, not anecdotal
 
@@ -142,6 +143,16 @@ The parity harness is not the only evidence source. Keep this split explicit in
 - PR D owns the scenario-based GPT-5.4 vs Opus 4.6 comparison
 - PR B deterministic suites still own auth/proxy/DNS and full-access truthfulness evidence
 
+## Goal-to-evidence map
+
+| Completion gate item                     | Primary owner | Review artifact                                                     |
+| ---------------------------------------- | ------------- | ------------------------------------------------------------------- |
+| No plan-only stalls                      | PR A          | strict-agentic runtime tests and `approval-turn-tool-followthrough` |
+| No fake progress or fake tool completion | PR A + PR D   | parity fake-success count plus scenario-level report details        |
+| No false `/elevated full` guidance       | PR B          | deterministic runtime-truthfulness suites                           |
+| Replay/liveness failures remain explicit | PR C + PR D   | lifecycle/replay suites plus `compaction-retry-mutating-tool`       |
+| GPT-5.4 matches or beats Opus 4.6        | PR D          | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json`  |
+
 ## Reviewer shorthand: before vs after
 
 | User-visible problem before                                 | Review signal after                                                                     |
diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt54-codex-agentic-parity.md
index 62ac1d40b5c..1ee8dd84a16 100644
--- a/docs/help/gpt54-codex-agentic-parity.md
+++ b/docs/help/gpt54-codex-agentic-parity.md
@@ -129,7 +129,7 @@ flowchart LR
 
 ## Scenario pack
 
-The first-wave parity pack currently covers four scenarios:
+The first-wave parity pack currently covers five scenarios:
 
 ### `approval-turn-tool-followthrough`
 
@@ -147,14 +147,19 @@ Checks that the model can read source and docs, synthesize findings, and continu
 
 Checks that mixed-mode tasks involving attachments remain actionable and do not collapse into vague narration.
 
+### `compaction-retry-mutating-tool`
+
+Checks that a task with a real mutating write keeps replay-unsafety explicit instead of quietly looking replay-safe if the run compacts, retries, or loses reply state under pressure.
+
 ## Scenario matrix
 
-| Scenario                           | What it tests                          | Good GPT-5.4 behavior                                                         | Failure signal                                                                |
-| ---------------------------------- | -------------------------------------- | ----------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
-| `approval-turn-tool-followthrough` | Short approval turns after a plan      | Starts the first concrete tool action immediately instead of restating intent | plan-only follow-up, no tool activity, or blocked turn without a real blocker |
-| `model-switch-tool-continuity`     | Runtime/model switching under tool use | Preserves task context and continues acting coherently                        | resets into commentary, loses tool context, or stops after switch             |
-| `source-docs-discovery-report`     | Source reading + synthesis + action    | Finds sources, uses tools, and produces a useful report without stalling      | thin summary, missing tool work, or incomplete-turn stop                      |
-| `image-understanding-attachment`   | Attachment-driven agentic work         | Interprets the attachment, connects it to tools, and continues the task       | vague narration, attachment ignored, or no concrete next action               |
+| Scenario                           | What it tests                           | Good GPT-5.4 behavior                                                          | Failure signal                                                                 |
+| ---------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ |
+| `approval-turn-tool-followthrough` | Short approval turns after a plan       | Starts the first concrete tool action immediately instead of restating intent  | plan-only follow-up, no tool activity, or blocked turn without a real blocker  |
+| `model-switch-tool-continuity`     | Runtime/model switching under tool use  | Preserves task context and continues acting coherently                         | resets into commentary, loses tool context, or stops after switch              |
+| `source-docs-discovery-report`     | Source reading + synthesis + action     | Finds sources, uses tools, and produces a useful report without stalling       | thin summary, missing tool work, or incomplete-turn stop                       |
+| `image-understanding-attachment`   | Attachment-driven agentic work          | Interprets the attachment, connects it to tools, and continues the task        | vague narration, attachment ignored, or no concrete next action                |
+| `compaction-retry-mutating-tool`   | Mutating work under compaction pressure | Performs a real write and keeps replay-unsafety explicit after the side effect | mutating write happens but replay safety is implied, missing, or contradictory |
 
 ## Release gate
 
@@ -180,6 +185,16 @@ Parity evidence is intentionally split across two layers:
 - PR D proves same-scenario GPT-5.4 vs Opus 4.6 behavior with QA-lab
 - PR B deterministic suites prove auth, proxy, DNS, and `/elevated full` truthfulness outside the harness
 
+## Goal-to-evidence matrix
+
+| Completion gate item                                     | Owning PR   | Evidence source                                                    | Pass signal                                                                              |
+| -------------------------------------------------------- | ----------- | ------------------------------------------------------------------ | ---------------------------------------------------------------------------------------- |
+| GPT-5.4 no longer stalls after planning                  | PR A        | `approval-turn-tool-followthrough` plus PR A runtime suites        | approval turns trigger real work or an explicit blocked state                            |
+| GPT-5.4 no longer fakes progress or tool completion      | PR A + PR D | parity report scenario outcomes and fake-success count             | no suspicious pass results and no commentary-only completion                             |
+| GPT-5.4 no longer gives false `/elevated full` guidance  | PR B        | deterministic truthfulness suites                                  | blocked reasons and full-access hints stay runtime-accurate                              |
+| Replay/liveness failures stay explicit                   | PR C + PR D | PR C lifecycle/replay suites plus `compaction-retry-mutating-tool` | mutating work keeps replay-unsafety explicit instead of silently disappearing            |
+| GPT-5.4 matches or beats Opus 4.6 on the agreed metrics  | PR D        | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use |
+
 ## How to read the parity verdict
 
 Use the verdict in `qa-agentic-parity-summary.json` as the final machine-readable decision for the first-wave parity pack.
diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts
index f71df411641..21ee3d80f00 100644
--- a/extensions/qa-lab/src/agentic-parity-report.test.ts
+++ b/extensions/qa-lab/src/agentic-parity-report.test.ts
@@ -137,6 +137,7 @@ describe("qa agentic parity report", () => {
       candidateSummary: {
         scenarios: [
           { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Compaction retry after mutating tool", status: "pass" },
           { name: "Model switch with tool continuity", status: "pass" },
           { name: "Source and docs discovery report", status: "pass" },
           { name: "Image understanding from attachment", status: "pass" },
@@ -145,6 +146,7 @@ describe("qa agentic parity report", () => {
       baselineSummary: {
         scenarios: [
           { name: "Approval turn tool followthrough", status: "pass" },
+          { name: "Compaction retry after mutating tool", status: "pass" },
           { name: "Model switch with tool continuity", status: "pass" },
           { name: "Source and docs discovery report", status: "pass" },
           { name: "Image understanding from attachment", status: "pass" },
diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts
index 73a59080360..e2972c92e17 100644
--- a/extensions/qa-lab/src/agentic-parity.ts
+++ b/extensions/qa-lab/src/agentic-parity.ts
@@ -17,6 +17,10 @@ export const QA_AGENTIC_PARITY_SCENARIOS = [
     id: "image-understanding-attachment",
     title: "Image understanding from attachment",
   },
+  {
+    id: "compaction-retry-mutating-tool",
+    title: "Compaction retry after mutating tool",
+  },
 ] as const;
 
 export const QA_AGENTIC_PARITY_SCENARIO_IDS = QA_AGENTIC_PARITY_SCENARIOS.map(({ id }) => id);
diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts
index 8b7b5012db7..9f4651816f0 100644
--- a/extensions/qa-lab/src/cli.runtime.test.ts
+++ b/extensions/qa-lab/src/cli.runtime.test.ts
@@ -334,6 +334,7 @@ describe("qa cli runtime", () => {
           "model-switch-tool-continuity",
           "source-docs-discovery-report",
           "image-understanding-attachment",
+          "compaction-retry-mutating-tool",
         ],
       }),
     );
diff --git a/extensions/qa-lab/src/mock-openai-server.test.ts b/extensions/qa-lab/src/mock-openai-server.test.ts
index 2101bcbab8b..4648ff6859e 100644
--- a/extensions/qa-lab/src/mock-openai-server.test.ts
+++ b/extensions/qa-lab/src/mock-openai-server.test.ts
@@ -169,6 +169,77 @@ describe("qa mock openai server", () => {
     ]);
   });
 
+  it("drives the compaction retry mutating tool parity flow", async () => {
+    const server = await startQaMockOpenAiServer({
+      host: "127.0.0.1",
+      port: 0,
+    });
+    cleanups.push(async () => {
+      await server.stop();
+    }); // registers a callback that stops this mock server
+
+    const writePlan = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: true, // streamed request: the response body is read as raw text below
+        model: "gpt-5.4",
+        input: [
+          {
+            role: "user",
+            content: [
+              {
+                type: "input_text",
+                text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.",
+              },
+            ],
+          },
+          {
+            type: "function_call_output",
+            output: "compaction retry evidence block 0000\ncompaction retry evidence block 0001", // contains "compaction retry evidence", which the mock uses to plan the write
+          },
+        ],
+      }),
+    });
+    expect(writePlan.status).toBe(200);
+    const writePlanBody = await writePlan.text();
+    expect(writePlanBody).toContain('"name":"write"'); // the planned tool call must be a write...
+    expect(writePlanBody).toContain("compaction-retry-summary.txt"); // ...targeting the summary file
+
+    const finalReply = await fetch(`${server.baseUrl}/v1/responses`, {
+      method: "POST",
+      headers: {
+        "content-type": "application/json",
+      },
+      body: JSON.stringify({
+        stream: false, // non-streaming request: the response body is parsed as JSON below
+        model: "gpt-5.4",
+        input: [
+          {
+            role: "user",
+            content: [
+              {
+                type: "input_text",
+                text: "Compaction retry mutating tool check: read COMPACTION_RETRY_CONTEXT.md, then create compaction-retry-summary.txt and keep replay safety explicit.",
+              },
+            ],
+          },
+          {
+            type: "function_call_output",
+            output: "Replay safety: unsafe after write.\n", // NOTE(review): assumes the write tool result echoes the written content; the mock replies with an empty string otherwise — confirm against the real write tool output
+          },
+        ],
+      }),
+    });
+    expect(finalReply.status).toBe(200);
+    const finalPayload = (await finalReply.json()) as {
+      output?: Array<{ content?: Array<{ text?: string }> }>;
+    };
+    expect(finalPayload.output?.[0]?.content?.[0]?.text).toContain("replay unsafe after write"); // phrase the mock returns once the tool output carries the replay marker
+  });
+
   it("supports exact reply memory prompts and embeddings requests", async () => {
     const server = await startQaMockOpenAiServer({
       host: "127.0.0.1",
diff --git a/extensions/qa-lab/src/mock-openai-server.ts b/extensions/qa-lab/src/mock-openai-server.ts
index b44db8424ff..b013d9e420c 100644
--- a/extensions/qa-lab/src/mock-openai-server.ts
+++ b/extensions/qa-lab/src/mock-openai-server.ts
@@ -452,6 +452,12 @@ function buildAssistantText(input: ResponsesInputItem[], body: Record<string, un
     }
     return `Protocol note: Lobster Invaders built at lobster-invaders.html.`;
   }
+  if (toolOutput && /compaction retry mutating tool check/i.test(prompt)) {
+    if (toolOutput.includes("Replay safety: unsafe after write.")) {
+      return "Protocol note: replay unsafe after write.";
+    }
+    return "";
+  }
   if (toolOutput) {
     const snippet = toolOutput.replace(/\s+/g, " ").trim().slice(0, 220);
     return `Protocol note: I reviewed the requested material. Evidence snippet: ${snippet || "no content"}`;
@@ -541,6 +547,17 @@ async function buildResponsesPayload(body: Record<string, unknown>) {
       });
     }
   }
+  if (/compaction retry mutating tool check/i.test(prompt)) {
+    if (!toolOutput) {
+      return buildToolCallEventsWithArgs("read", { path: "COMPACTION_RETRY_CONTEXT.md" });
+    }
+    if (toolOutput.includes("compaction retry evidence")) {
+      return buildToolCallEventsWithArgs("write", {
+        path: "compaction-retry-summary.txt",
+        content: "Replay safety: unsafe after write.\n",
+      });
+    }
+  }
   if (/memory tools check/i.test(prompt)) {
     if (!toolOutput) {
       return buildToolCallEventsWithArgs("memory_search", {
diff --git a/qa/frontier-harness-plan.md b/qa/frontier-harness-plan.md
index 164816f0a7b..b31c7ddbe33 100644
--- a/qa/frontier-harness-plan.md
+++ b/qa/frontier-harness-plan.md
@@ -7,6 +7,7 @@ Use this when tuning the harness on frontier models before the small-model pass.
 - verify tool-first behavior on short approval turns
 - verify model switching does not kill tool use
 - verify repo-reading / discovery still finishes with a concrete report
+- verify mutating work keeps replay-unsafety explicit under compaction pressure
 - collect manual notes on personality without letting style hide execution regressions
 
 ## Frontier subset
@@ -19,6 +20,7 @@ Run this subset first on every harness tweak:
 
 Longer spot-check after that:
 
+- `compaction-retry-mutating-tool`
 - `subagent-handoff`
 
 ## Baseline order
@@ -84,6 +86,7 @@ Use the QA Lab runner catalog or `openclaw models list --all` to pick the curren
 - empty-promise rate
 - tool continuity after model switch
 - discovery report completeness and specificity
+- replay-safety truth after a mutating write
 - scope drift: unrelated scenario updates, grand wrap-ups, or invented completion tallies
 - latency / obvious stall behavior
 - token cost notes if a change makes the prompt materially heavier
@@ -126,4 +129,4 @@ Score it on:
 
 ## Deferred
 
-- post-compaction next-action continuity should become an executable lane once we have a deterministic compaction trigger in QA
+- deterministic mock compaction triggering is still deferred; the current replay-safety lane is a live-frontier-first executable scenario
diff --git a/qa/scenarios/compaction-retry-mutating-tool.md b/qa/scenarios/compaction-retry-mutating-tool.md
new file mode 100644
index 00000000000..54c33702acb
--- /dev/null
+++ b/qa/scenarios/compaction-retry-mutating-tool.md
@@ -0,0 +1,98 @@
+# Compaction retry after mutating tool
+
+```yaml qa-scenario
+id: compaction-retry-mutating-tool
+title: Compaction retry after mutating tool
+surface: runtime
+objective: Verify a real mutating tool step keeps replay-unsafety explicit instead of disappearing into a clean-looking success if the run compacts or retries.
+successCriteria:
+  - Agent reads the seeded large context before it writes.
+  - Agent performs a real mutating write in the workspace.
+  - Final reply explicitly says replay is unsafe after the write instead of implying replay-safe completion.
+  - Scenario details preserve the observed compaction count for review context.
+docsRefs:
+  - docs/help/testing.md
+  - docs/help/gpt54-codex-agentic-parity.md
+codeRefs:
+  - extensions/qa-lab/src/suite.ts
+  - extensions/qa-lab/src/mock-openai-server.ts
+  - src/agents/pi-embedded-subscribe.ts
+  - src/agents/pi-embedded-subscribe.handlers.lifecycle.ts
+execution:
+  kind: flow
+  summary: Verify a mutating tool step keeps replay-unsafety explicit through compaction or retry pressure.
+  config:
+    contextFile: COMPACTION_RETRY_CONTEXT.md
+    outputFile: compaction-retry-summary.txt
+    promptSnippet: Compaction retry mutating tool check
+    replayUnsafeNeedle: replay unsafe after write
+    prompt: |-
+      Compaction retry mutating tool check: read `COMPACTION_RETRY_CONTEXT.md`, then create `compaction-retry-summary.txt` in the workspace containing exactly this line:
+      Replay safety: unsafe after write.
+
+      After the write, reply with one short sentence that includes the exact phrase `replay unsafe after write`.
+      If the run gets compacted or retried, keep that replay-safety truth explicit and do not claim the task stayed replay-safe.
+```
+
+```yaml qa-flow
+steps:
+  - name: keeps replay-unsafety explicit after a mutating write under compaction pressure
+    actions:
+      - call: waitForGatewayHealthy
+        args:
+          - ref: env
+          - 60000
+      - call: reset
+      - call: fs.writeFile
+        args:
+          - expr: "path.join(env.gateway.workspaceDir, config.contextFile)"
+          - expr: "Array.from({ length: env.providerMode === 'mock-openai' ? 160 : 2200 }, (_, index) => `compaction retry evidence block ${String(index).padStart(4, '0')} :: keep replay truth explicit after side effects`).join('\\n')"
+          - utf8
+      - set: sessionKey
+        value:
+          expr: "`agent:qa:compaction-retry:${randomUUID().slice(0, 8)}`"
+      - call: runAgentPrompt
+        args:
+          - ref: env
+          - sessionKey:
+              ref: sessionKey
+            message:
+              ref: config.prompt
+            timeoutMs:
+              expr: liveTurnTimeoutMs(env, 90000)
+      - call: waitForCondition
+        saveAs: outbound
+        args:
+          - lambda:
+              expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && normalizeLowercaseStringOrEmpty(candidate.text).includes(config.replayUnsafeNeedle)).at(-1)"
+          - expr: liveTurnTimeoutMs(env, 45000)
+          - expr: "env.providerMode === 'mock-openai' ? 100 : 250"
+      - call: fs.readFile
+        saveAs: writtenSummary
+        args:
+          - expr: "path.join(env.gateway.workspaceDir, config.outputFile)"
+          - utf8
+      - assert:
+          expr: "writtenSummary.includes('Replay safety: unsafe after write.')"
+          message:
+            expr: "`summary file missed replay marker: ${writtenSummary}`"
+      - if:
+          expr: "Boolean(env.mock)"
+          then:
+            - assert:
+                expr: "!env.mock || ([...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].toReversed().find((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && String(request.toolOutput ?? '').includes('compaction retry evidence block'))?.plannedToolName === 'write')"
+                message:
+                  expr: "`expected write after seeded context read, got ${String(([...(await fetchJson(`${env.mock.baseUrl}/debug/requests`))].toReversed().find((request) => String(request.allInputText ?? '').includes(config.promptSnippet) && String(request.toolOutput ?? '').includes('compaction retry evidence block'))?.plannedToolName ?? '')}`"
+      - call: readRawQaSessionStore
+        saveAs: store
+        args:
+          - ref: env
+      - set: sessionEntry
+        value:
+          expr: "store[sessionKey]"
+      - assert:
+          expr: "Boolean(sessionEntry)"
+          message:
+            expr: "`missing QA session entry for ${sessionKey}`"
+    detailsExpr: "`${outbound.text}\\ncompactionCount=${String(sessionEntry?.compactionCount ?? 0)}\\nstatus=${String(sessionEntry?.status ?? 'unknown')}`"
+```