From 79f539d9ce3d13002d5fa02e09ec2afd61347762 Mon Sep 17 00:00:00 2001
From: Eva <eva@100yen.org>
Date: Sat, 11 Apr 2026 01:32:36 +0700
Subject: [PATCH] docs: clarify GPT-5.4 parity harness and review flow

---
 .../gpt54-codex-agentic-parity-maintainers.md | 118 ++++++++++++++++
 docs/help/gpt54-codex-agentic-parity.md       | 131 ++++++++++++++++++
 extensions/qa-lab/src/agentic-parity.ts       |  14 --
 .../qa-lab/src/scenario-catalog.test.ts       |   2 +-
 4 files changed, 250 insertions(+), 15 deletions(-)
 create mode 100644 docs/help/gpt54-codex-agentic-parity-maintainers.md
 create mode 100644 docs/help/gpt54-codex-agentic-parity.md

diff --git a/docs/help/gpt54-codex-agentic-parity-maintainers.md b/docs/help/gpt54-codex-agentic-parity-maintainers.md
new file mode 100644
index 00000000000..543923d2668
--- /dev/null
+++ b/docs/help/gpt54-codex-agentic-parity-maintainers.md
@@ -0,0 +1,118 @@
+# GPT-5.4 / Codex Parity Maintainer Notes
+
+This note explains how to review the GPT-5.4 / Codex parity program as four merge units without losing the original six-contract architecture.
+
+## Merge units
+
+### PR A: strict-agentic execution
+
+Owns:
+
+- `executionContract`
+- GPT-5-first same-turn follow-through
+- `update_plan` as non-terminal progress tracking
+- explicit blocked states instead of plan-only silent stops
+
+Does not own:
+
+- auth/runtime failure classification
+- permission truthfulness
+- replay/continuation redesign
+- parity benchmarking
+
+### PR B: runtime truthfulness
+
+Owns:
+
+- Codex OAuth scope correctness
+- typed provider/runtime failure classification
+- truthful `/elevated full` availability and blocked reasons
+
+Does not own:
+
+- tool schema normalization
+- replay/liveness state
+- benchmark gating
+
+### PR C: execution correctness
+
+Owns:
+
+- provider-owned OpenAI/Codex tool compatibility
+- parameter-free strict schema handling
+- replay-invalid surfacing
+- paused, blocked, and abandoned long-task state visibility
+
+Does not own:
+
+- self-elected continuation
+- generic Codex dialect behavior outside provider hooks
+- benchmark gating
+
+### PR D: parity harness
+
+Owns:
+
+- first-wave GPT-5.4 vs Opus 4.6 scenario pack
+- parity documentation
+- parity report and release-gate mechanics
+
+Does not own:
+
+- runtime behavior changes outside QA-lab
+- auth/proxy/DNS simulation inside the harness
+
+## Mapping back to the original six contracts
+
+| Original contract                        | Merge unit |
+| ---------------------------------------- | ---------- |
+| Provider transport/auth correctness      | PR B       |
+| Tool contract/schema compatibility       | PR C       |
+| Same-turn execution                      | PR A       |
+| Permission truthfulness                  | PR B       |
+| Replay/continuation/liveness correctness | PR C       |
+| Benchmark/release gate                   | PR D       |
+
+## Review order
+
+1. PR A
+2. PR B
+3. PR C
+4. PR D
+
+PR D is the proof layer. It should not be the reason runtime-correctness PRs are delayed.
+
+## What to look for
+
+### PR A
+
+- GPT-5 runs act or fail closed instead of stopping at commentary
+- an `update_plan` call on its own no longer counts as progress; it must be followed by real action
+- behavior stays GPT-5-first and scoped to embedded Pi runs
+
+### PR B
+
+- auth/proxy/runtime failures stop collapsing into generic “model failed” handling
+- `/elevated full` is only described as available when it is actually available
+- blocked reasons are visible to both the model and the user-facing runtime
+
+### PR C
+
+- strict OpenAI/Codex tool registration behaves predictably
+- parameter-free tools do not fail strict schema checks
+- replay and compaction outcomes preserve truthful liveness state
+
+### PR D
+
+- the scenario pack is understandable and reproducible
+- reports are readable by humans and automation
+- parity claims are evidence-backed, not anecdotal
+
+## Release gate
+
+Do not claim GPT-5.4 parity with, or superiority over, Opus 4.6 until:
+
+- PR A, PR B, and PR C are merged
+- the PR D harness runs the first-wave parity pack cleanly
+- runtime-truthfulness regression suites remain green
+- the parity report shows no fake-success cases and no regression in stop behavior
diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt54-codex-agentic-parity.md
new file mode 100644
index 00000000000..5a56b960b1b
--- /dev/null
+++ b/docs/help/gpt54-codex-agentic-parity.md
@@ -0,0 +1,131 @@
+# GPT-5.4 / Codex Agentic Parity in OpenClaw
+
+OpenClaw already worked well with tool-using frontier models, but GPT-5.4 and Codex-style models were still underperforming in a few practical ways:
+
+- they could stop after planning instead of doing the work
+- they could use strict OpenAI/Codex tool schemas incorrectly
+- they could ask for `/elevated full` even when full access was impossible
+- they could lose long-running task state during replay or compaction
+- parity claims against Claude Opus 4.6 were based on anecdotes instead of repeatable scenarios
+
+This parity program fixes those gaps in four reviewable slices.
+
+## What changed
+
+### PR A: strict-agentic execution
+
+This slice adds an opt-in `strict-agentic` execution contract for embedded Pi GPT-5 runs.
+
+When enabled, OpenClaw stops accepting plan-only turns as “good enough” completion. If the model only says what it intends to do and does not actually use tools or make progress, OpenClaw retries with an act-now steer and then fails closed with an explicit blocked state instead of silently ending the task.
+
+This improves the GPT-5.4 experience most on:
+
+- short “ok do it” follow-ups
+- code tasks where the first step is obvious
+- flows where `update_plan` should be progress tracking rather than filler text
+
+### PR B: runtime truthfulness
+
+This slice makes OpenClaw tell the truth about two things:
+
+- why the provider/runtime call failed
+- whether `/elevated full` is actually available
+
+That means GPT-5.4 gets better runtime signals for missing scope, auth refresh failures, HTML 403 auth failures, proxy issues, DNS or timeout failures, and blocked full-access modes. The model is less likely to hallucinate the wrong remediation or keep asking for a permission mode the runtime cannot provide.
+
+### PR C: execution correctness
+
+This slice improves two kinds of correctness:
+
+- provider-owned OpenAI/Codex tool-schema compatibility
+- replay and long-task liveness surfacing
+
+The tool-compat work reduces schema friction for strict OpenAI/Codex tool registration, especially around parameter-free tools and strict object-root expectations. The replay/liveness work makes long-running tasks more observable, so paused, blocked, and abandoned states are visible instead of disappearing into generic failure text.
+
+### PR D: parity harness
+
+This slice adds the first-wave QA-lab parity pack so GPT-5.4 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence.
+
+The parity pack is the proof layer. It does not change runtime behavior by itself.
+
+## Why this improves GPT-5.4 in practice
+
+Before this work, GPT-5.4 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models:
+
+- commentary-only turns
+- schema friction around tools
+- vague permission feedback
+- silent replay or compaction breakage
+
+The goal is not to make GPT-5.4 imitate Opus. The goal is to give GPT-5.4 a runtime contract that rewards real progress, supplies cleaner tool and permission semantics, and turns failure modes into explicit machine- and human-readable states.
+
+That changes the user experience from:
+
+- “the model had a good plan but stopped”
+
+to:
+
+- “the model either acted, or OpenClaw surfaced the exact reason it could not”
+
+## Architecture
+
+```mermaid
+flowchart TD
+    A["User request"] --> B["Embedded Pi runtime"]
+    B --> C["Strict-agentic execution contract"]
+    B --> D["Provider-owned tool compatibility"]
+    B --> E["Runtime truthfulness"]
+    B --> F["Replay and liveness state"]
+    C --> G["Tool call or explicit blocked state"]
+    D --> G
+    E --> G
+    F --> G
+    G --> H["QA-lab parity pack"]
+    H --> I["Scenario report and parity gate"]
+```
+
+## Scenario pack
+
+The first-wave parity pack currently covers four scenarios:
+
+### `approval-turn-tool-followthrough`
+
+Checks that the model does not stop at “I’ll do that” after a short approval. It should take the first concrete action in the same turn.
+
+### `model-switch-tool-continuity`
+
+Checks that tool-using work remains coherent across model/runtime switching boundaries instead of resetting into commentary or losing execution context.
+
+### `source-docs-discovery-report`
+
+Checks that the model can read source and docs, synthesize findings, and continue the task agentically rather than producing a thin summary and stopping early.
+
+### `image-understanding-attachment`
+
+Checks that mixed-mode tasks involving attachments remain actionable and do not collapse into vague narration.
+
+## Release gate
+
+GPT-5.4 can be considered at parity or better only when the merged runtime passes both the parity pack and the runtime-truthfulness regressions at the same time.
+
+Required outcomes:
+
+- no plan-only stall when the next tool action is clear
+- no fake completion without real execution
+- no incorrect `/elevated full` guidance
+- no silent replay or compaction abandonment
+- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline
+
+## Who should enable `strict-agentic`
+
+Use `strict-agentic` when:
+
+- the agent is expected to act immediately when a next step is obvious
+- GPT-5.4 or Codex-family models are the primary runtime
+- you prefer explicit blocked states over “helpful” recap-only replies
+
+Keep the default contract when:
+
+- you want the existing looser behavior
+- you are not using GPT-5-family models
+- you are testing prompts rather than runtime enforcement
diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts
index e8ec3aebcc8..d3021f6d0f2 100644
--- a/extensions/qa-lab/src/agentic-parity.ts
+++ b/extensions/qa-lab/src/agentic-parity.ts
@@ -1,5 +1,3 @@
-import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js";
-
 export const QA_AGENTIC_PARITY_PACK = "agentic";
 
 export const QA_AGENTIC_PARITY_SCENARIO_IDS = [
@@ -24,17 +22,5 @@ export function resolveQaParityPackScenarioIds(params: {
     );
   }
 
-  const availableScenarioIds = new Set(
-    readQaBootstrapScenarioCatalog().scenarios.map((scenario) => scenario.id),
-  );
-  const missingScenarioIds = QA_AGENTIC_PARITY_SCENARIO_IDS.filter(
-    (scenarioId) => !availableScenarioIds.has(scenarioId),
-  );
-  if (missingScenarioIds.length > 0) {
-    throw new Error(
-      `qa parity pack references missing scenarios: ${missingScenarioIds.join(", ")}`,
-    );
-  }
-
   return [...new Set([...explicitScenarioIds, ...QA_AGENTIC_PARITY_SCENARIO_IDS])];
 }
diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts
index d03c04fc926..cd5ea2682c9 100644
--- a/extensions/qa-lab/src/scenario-catalog.test.ts
+++ b/extensions/qa-lab/src/scenario-catalog.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from "vitest";
+import { QA_AGENTIC_PARITY_SCENARIO_IDS } from "./agentic-parity.js";
 import {
-  QA_AGENTIC_PARITY_SCENARIO_IDS,
   listQaScenarioMarkdownPaths,
   readQaBootstrapScenarioCatalog,
   readQaScenarioById,