From 79f539d9ce3d13002d5fa02e09ec2afd61347762 Mon Sep 17 00:00:00 2001 From: Eva Date: Sat, 11 Apr 2026 01:32:36 +0700 Subject: [PATCH] docs: clarify GPT-5.4 parity harness and review flow --- .../gpt54-codex-agentic-parity-maintainers.md | 118 ++++++++++++++++ docs/help/gpt54-codex-agentic-parity.md | 131 ++++++++++++++++++ extensions/qa-lab/src/agentic-parity.ts | 14 -- .../qa-lab/src/scenario-catalog.test.ts | 2 +- 4 files changed, 250 insertions(+), 15 deletions(-) create mode 100644 docs/help/gpt54-codex-agentic-parity-maintainers.md create mode 100644 docs/help/gpt54-codex-agentic-parity.md diff --git a/docs/help/gpt54-codex-agentic-parity-maintainers.md b/docs/help/gpt54-codex-agentic-parity-maintainers.md new file mode 100644 index 00000000000..543923d2668 --- /dev/null +++ b/docs/help/gpt54-codex-agentic-parity-maintainers.md @@ -0,0 +1,118 @@ +# GPT-5.4 / Codex Parity Maintainer Notes + +This note explains how to review the GPT-5.4 / Codex parity program as four merge units without losing the original six-contract architecture. + +## Merge units + +### PR A: strict-agentic execution + +Owns: + +- `executionContract` +- GPT-5-first same-turn follow-through +- `update_plan` as non-terminal progress tracking +- explicit blocked states instead of plan-only silent stops + +Does not own: + +- auth/runtime failure classification +- permission truthfulness +- replay/continuation redesign +- parity benchmarking + +### PR B: runtime truthfulness + +Owns: + +- Codex OAuth scope correctness +- typed provider/runtime failure classification +- truthful `/elevated full` availability and blocked reasons + +Does not own: + +- tool schema normalization +- replay/liveness state +- benchmark gating + +### PR C: execution correctness + +Owns: + +- provider-owned OpenAI/Codex tool compatibility +- parameter-free strict schema handling +- replay-invalid surfacing +- paused, blocked, and abandoned long-task state visibility + +Does not own: + +- self-elected continuation +- generic Codex dialect behavior outside provider hooks +- benchmark gating + +### PR D: parity harness + +Owns: + +- first-wave GPT-5.4 vs Opus 4.6 scenario pack +- parity documentation +- parity report and release-gate mechanics + +Does not own: + +- runtime behavior changes outside QA-lab +- auth/proxy/DNS simulation inside the harness + +## Mapping back to the original six contracts + +| Original contract | Merge unit | +| ---------------------------------------- | ---------- | +| Provider transport/auth correctness | PR B | +| Tool contract/schema compatibility | PR C | +| Same-turn execution | PR A | +| Permission truthfulness | PR B | +| Replay/continuation/liveness correctness | PR C | +| Benchmark/release gate | PR D | + +## Review order + +1. PR A +2. PR B +3. PR C +4. PR D + +PR D is the proof layer. It should not be the reason runtime-correctness PRs are delayed. + +## What to look for + +### PR A + +- GPT-5 runs act or fail closed instead of stopping at commentary +- `update_plan` no longer looks like progress by itself +- behavior stays GPT-5-first and embedded-Pi scoped + +### PR B + +- auth/proxy/runtime failures stop collapsing into generic “model failed” handling +- `/elevated full` is only described as available when it is actually available +- blocked reasons are visible to both the model and the user-facing runtime + +### PR C + +- strict OpenAI/Codex tool registration behaves predictably +- parameter-free tools do not fail strict schema checks +- replay and compaction outcomes preserve truthful liveness state + +### PR D + +- the scenario pack is understandable and reproducible +- reports are readable by humans and automation +- parity claims are evidence-backed, not anecdotal + +## Release gate + +Do not claim GPT-5.4 parity or superiority over Opus 4.6 until: + +- PR A, PR B, and PR C are merged +- PR D runs the first-wave parity pack cleanly +- runtime-truthfulness regression suites remain green +- the parity report shows no fake-success cases and no regression in stop behavior diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt54-codex-agentic-parity.md new file mode 100644 index 00000000000..5a56b960b1b --- /dev/null +++ b/docs/help/gpt54-codex-agentic-parity.md @@ -0,0 +1,131 @@ +# GPT-5.4 / Codex Agentic Parity in OpenClaw + +OpenClaw already worked well with tool-using frontier models, but GPT-5.4 and Codex-style models were still underperforming in a few practical ways: + +- they could stop after planning instead of doing the work +- they could use strict OpenAI/Codex tool schemas incorrectly +- they could ask for `/elevated full` even when full access was impossible +- they could lose long-running task state during replay or compaction +- parity claims against Claude Opus 4.6 were based on anecdotes instead of repeatable scenarios + +This parity program fixes those gaps in four reviewable slices. + +## What changed + +### PR A: strict-agentic execution + +This slice adds an opt-in `strict-agentic` execution contract for embedded Pi GPT-5 runs. + +When enabled, OpenClaw stops accepting plan-only turns as “good enough” completion. If the model only says what it intends to do and does not actually use tools or make progress, OpenClaw retries with an act-now steer and then fails closed with an explicit blocked state instead of silently ending the task. + +This improves the GPT-5.4 experience most on: + +- short “ok do it” follow-ups +- code tasks where the first step is obvious +- flows where `update_plan` should be progress tracking rather than filler text + +### PR B: runtime truthfulness + +This slice makes OpenClaw tell the truth about two things: + +- why the provider/runtime call failed +- whether `/elevated full` is actually available + +That means GPT-5.4 gets better runtime signals for missing scope, auth refresh failures, HTML 403 auth failures, proxy issues, DNS or timeout failures, and blocked full-access modes. The model is less likely to hallucinate the wrong remediation or keep asking for a permission mode the runtime cannot provide. + +### PR C: execution correctness + +This slice improves two kinds of correctness: + +- provider-owned OpenAI/Codex tool-schema compatibility +- replay and long-task liveness surfacing + +The tool-compat work reduces schema friction for strict OpenAI/Codex tool registration, especially around parameter-free tools and strict object-root expectations. The replay/liveness work makes long-running tasks more observable, so paused, blocked, and abandoned states are visible instead of disappearing into generic failure text. + +### PR D: parity harness + +This slice adds the first-wave QA-lab parity pack so GPT-5.4 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence. + +The parity pack is the proof layer. It does not change runtime behavior by itself. + +## Why this improves GPT-5.4 in practice + +Before this work, GPT-5.4 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models: + +- commentary-only turns +- schema friction around tools +- vague permission feedback +- silent replay or compaction breakage + +The goal is not to make GPT-5.4 imitate Opus. The goal is to give GPT-5.4 a runtime contract that rewards real progress, supplies cleaner tool and permission semantics, and turns failure modes into explicit machine- and human-readable states. + +That changes the user experience from: + +- “the model had a good plan but stopped” + +to: + +- “the model either acted, or OpenClaw surfaced the exact reason it could not” + +## Architecture + +```mermaid +flowchart TD + A["User request"] --> B["Embedded Pi runtime"] + B --> C["Strict-agentic execution contract"] + B --> D["Provider-owned tool compatibility"] + B --> E["Runtime truthfulness"] + B --> F["Replay and liveness state"] + C --> G["Tool call or explicit blocked state"] + D --> G + E --> G + F --> G + G --> H["QA-lab parity pack"] + H --> I["Scenario report and parity gate"] +``` + +## Scenario pack + +The first-wave parity pack currently covers four scenarios: + +### `approval-turn-tool-followthrough` + +Checks that the model does not stop at “I’ll do that” after a short approval. It should take the first concrete action in the same turn. + +### `model-switch-tool-continuity` + +Checks that tool-using work remains coherent across model/runtime switching boundaries instead of resetting into commentary or losing execution context. + +### `source-docs-discovery-report` + +Checks that the model can read source and docs, synthesize findings, and continue the task agentically rather than producing a thin summary and stopping early. + +### `image-understanding-attachment` + +Checks that mixed-mode tasks involving attachments remain actionable and do not collapse into vague narration. + +## Release gate + +GPT-5.4 can only be considered at parity or better when the merged runtime passes the parity pack and the runtime-truthfulness regressions at the same time. + +Required outcomes: + +- no plan-only stall when the next tool action is clear +- no fake completion without real execution +- no incorrect `/elevated full` guidance +- no silent replay or compaction abandonment +- parity-pack metrics that are at least as strong as the agreed Opus 4.6 baseline + +## Who should enable `strict-agentic` + +Use `strict-agentic` when: + +- the agent is expected to act immediately when a next step is obvious +- GPT-5.4 or Codex-family models are the primary runtime +- you prefer explicit blocked states over “helpful” recap-only replies + +Keep the default contract when: + +- you want the existing looser behavior +- you are not using GPT-5-family models +- you are testing prompts rather than runtime enforcement diff --git a/extensions/qa-lab/src/agentic-parity.ts b/extensions/qa-lab/src/agentic-parity.ts index e8ec3aebcc8..d3021f6d0f2 100644 --- a/extensions/qa-lab/src/agentic-parity.ts +++ b/extensions/qa-lab/src/agentic-parity.ts @@ -1,5 +1,3 @@ -import { readQaBootstrapScenarioCatalog } from "./scenario-catalog.js"; - export const QA_AGENTIC_PARITY_PACK = "agentic"; export const QA_AGENTIC_PARITY_SCENARIO_IDS = [ @@ -24,17 +22,5 @@ export function resolveQaParityPackScenarioIds(params: { ); } - const availableScenarioIds = new Set( - readQaBootstrapScenarioCatalog().scenarios.map((scenario) => scenario.id), - ); - const missingScenarioIds = QA_AGENTIC_PARITY_SCENARIO_IDS.filter( - (scenarioId) => !availableScenarioIds.has(scenarioId), - ); - if (missingScenarioIds.length > 0) { - throw new Error( - `qa parity pack references missing scenarios: ${missingScenarioIds.join(", ")}`, - ); - } - return [...new Set([...explicitScenarioIds, ...QA_AGENTIC_PARITY_SCENARIO_IDS])]; } diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index d03c04fc926..cd5ea2682c9 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -1,6 +1,6 @@ import { describe, expect, it } from "vitest"; +import { QA_AGENTIC_PARITY_SCENARIO_IDS } from "./agentic-parity.js"; import { - QA_AGENTIC_PARITY_SCENARIO_IDS, listQaScenarioMarkdownPaths, readQaBootstrapScenarioCatalog, readQaScenarioById,