diff --git a/src/agents/execution-contract.test.ts b/src/agents/execution-contract.test.ts new file mode 100644 index 00000000000..fbdff9e4f6f --- /dev/null +++ b/src/agents/execution-contract.test.ts @@ -0,0 +1,205 @@ +import { describe, expect, it } from "vitest"; +import type { OpenClawConfig } from "../config/types.openclaw.js"; +import { + isStrictAgenticExecutionContractActive, + resolveEffectiveExecutionContract, +} from "./execution-contract.js"; + +describe("resolveEffectiveExecutionContract", () => { + const supportedProvider = "openai"; + const unsupportedProvider = "anthropic"; + const emptyConfig: OpenClawConfig = {}; + + describe("supported provider + model detection", () => { + it("auto-activates on bare gpt-5 model ids", () => { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId: "gpt-5.4", + }), + ).toBe("strict-agentic"); + }); + + it("auto-activates on gpt-5o and variants without a separator", () => { + for (const modelId of ["gpt-5", "gpt-5o", "gpt-5o-mini"]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("strict-agentic"); + } + }); + + it("auto-activates on dot-separated variants", () => { + for (const modelId of ["gpt-5.0", "gpt-5.4", "gpt-5.4-alt", "gpt-5.99"]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("strict-agentic"); + } + }); + + it("auto-activates on dash-separated variants", () => { + for (const modelId of ["gpt-5-preview", "gpt-5-turbo", "gpt-5-2025-03"]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("strict-agentic"); + } + }); + + it("auto-activates on prefixed model ids (openai/gpt-5.4, openai:gpt-5.4)", () => { + // Regression for the adversarial review finding: prefixed model ids + // must strip the provider prefix before matching the regex. + for (const modelId of [ + "openai/gpt-5.4", + "openai:gpt-5.4", + "openai/gpt-5o-mini", + "openai-codex/gpt-5.4", + "openai-codex:gpt-5.4", + " openai/gpt-5.4 ", + " OPENAI:GPT-5.4 ", + ]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("strict-agentic"); + } + }); + + it("is case-insensitive", () => { + for (const modelId of ["GPT-5.4", "Gpt-5O", "OPENAI/GPT-5.4"]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("strict-agentic"); + } + }); + + it("does not match non-gpt-5 family ids", () => { + for (const modelId of [ + "gpt-4.5", + "gpt-4o", + "gpt-6", + "gpt-50", + "claude-opus-4-6", + "llama-3-70b", + "mistral-large", + ]) { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: supportedProvider, + modelId, + }), + ).toBe("default"); + } + }); + + it("collapses to default on unsupported providers even with gpt-5 model ids", () => { + expect( + resolveEffectiveExecutionContract({ + config: emptyConfig, + provider: unsupportedProvider, + modelId: "gpt-5.4", + }), + ).toBe("default"); + }); + }); + + describe("explicit override behavior", () => { + it("honors explicit strict-agentic on the supported lane", () => { + const config: OpenClawConfig = { + agents: { + defaults: { + embeddedPi: { + executionContract: "strict-agentic", + }, + }, + }, + }; + expect( + resolveEffectiveExecutionContract({ + config, + provider: supportedProvider, + modelId: "gpt-5.4", + }), + ).toBe("strict-agentic"); + }); + + it("honors explicit default opt-out even on the supported lane", () => { + const config: OpenClawConfig = { + agents: { + defaults: { + embeddedPi: { + executionContract: "default", + }, + }, + }, + }; + expect( + resolveEffectiveExecutionContract({ + config, + provider: supportedProvider, + modelId: "gpt-5.4", + }), + ).toBe("default"); + }); + + it("collapses explicit strict-agentic to default on an unsupported lane", () => { + const config: OpenClawConfig = { + agents: { + defaults: { + embeddedPi: { + executionContract: "strict-agentic", + }, + }, + }, + }; + expect( + resolveEffectiveExecutionContract({ + config, + provider: unsupportedProvider, + modelId: "claude-opus-4-6", + }), + ).toBe("default"); + }); + }); + + describe("active flag helper", () => { + it("returns true when the effective contract is strict-agentic", () => { + expect( + isStrictAgenticExecutionContractActive({ + config: emptyConfig, + provider: supportedProvider, + modelId: "openai/gpt-5.4", + }), + ).toBe(true); + }); + + it("returns false when the effective contract is default", () => { + expect( + isStrictAgenticExecutionContractActive({ + config: emptyConfig, + provider: supportedProvider, + modelId: "gpt-4.5", + }), + ).toBe(false); + }); + }); +}); diff --git a/src/agents/execution-contract.ts b/src/agents/execution-contract.ts index 9ab7023a18f..f7bc1624bfd 100644 --- a/src/agents/execution-contract.ts +++ b/src/agents/execution-contract.ts @@ -2,6 +2,114 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import { normalizeLowercaseStringOrEmpty } from "../shared/string-coerce.js"; import { resolveAgentExecutionContract, resolveSessionAgentIds } from "./agent-scope.js"; +/** + * Strip any leading `provider/` or `provider:` prefix from a model id so the + * bare-name regex matching below works against `openai/gpt-5.4` and + * `openai:gpt-5.4` the same way it does against `gpt-5.4`. Returns the bare + * model id lowercased for comparison. + * + * Without this, auto-activation silently failed on prefixed model ids — a + * user who configured `model: "openai/gpt-5.4"` in their agent config would + * get the pre-PR-H looser default behavior because the regex only matched + * bare names. The adversarial review in #64227 flagged this as a quality + * gap on completion-gate criterion 1. + */ +function stripProviderPrefix(modelId: string): string { + const normalizedModelId = modelId.trim(); + const match = /^([^/:]+)[/:](.+)$/.exec(normalizedModelId); + return (match?.[2] ?? normalizedModelId).toLowerCase(); +} + +/** + * Regex that matches the full set of GPT-5 variants the strict-agentic + * contract should auto-activate for. Intentionally permissive: every + * model id in the gpt-5 family should opt in by default, not just the + * canonical `gpt-5.4`. + * + * Covers: + * - `gpt-5`, `gpt-5o`, `gpt-5o-mini` (no separator after `5`) + * - `gpt-5.4`, `gpt-5.4-alt`, `gpt-5.0` (dot separator) + * - `gpt-5-preview`, `gpt-5-turbo`, `gpt-5-2025-03` (dash separator) + * + * Does NOT cover `gpt-4.5`, `gpt-6`, or any non-gpt-5 family member. + */ +const STRICT_AGENTIC_MODEL_ID_PATTERN = /^gpt-5(?:[.o-]|$)/i; + +/** + * Supported provider + model combinations where strict-agentic is the intended + * runtime contract. Kept as a narrow helper so both the execution-contract + * resolver and the `update_plan` auto-enable gate converge on the same + * definition of "GPT-5-family openai/openai-codex run". + */ +export function isStrictAgenticSupportedProviderModel(params: { + provider?: string | null; + modelId?: string | null; +}): boolean { + const provider = normalizeLowercaseStringOrEmpty(params.provider ?? ""); + if (provider !== "openai" && provider !== "openai-codex") { + return false; + } + const modelId = typeof params.modelId === "string" ? params.modelId : ""; + const bareModelId = stripProviderPrefix(modelId); + return STRICT_AGENTIC_MODEL_ID_PATTERN.test(bareModelId); +} + +/** + * Returns the effective execution contract for an embedded Pi run. + * + * strict-agentic is a GPT-5-family openai/openai-codex-only runtime contract, + * so an unsupported provider/model pair always collapses to `"default"` + * regardless of what the caller passed or what config says — the contract + * is inert off-provider. Within the supported lane, the behavior matrix is: + * + * - Supported provider/model + explicit `"strict-agentic"` in config + * (defaults or per-agent override) ⇒ `"strict-agentic"`. + * - Supported provider/model + explicit `"default"` in config ⇒ `"default"` + * (opt-out honored). + * - Supported provider/model + unspecified ⇒ `"strict-agentic"` so the + * no-stall completion-gate criterion applies to out-of-the-box GPT-5 runs + * without requiring every user to set the flag. + * - Unsupported provider/model (anything that is not openai or openai-codex + * with a gpt-5-family model id) ⇒ `"default"`, even when the config + * explicitly sets `"strict-agentic"`. The retry guard and blocked-exit + * helpers all check this lane again, so an explicit `"strict-agentic"` + * on an unsupported lane is a no-op rather than a hard failure. + * + * This means explicit opt-out still works, but the gate criterion + * "GPT-5.4 no longer stalls after planning" now covers unconfigured + * installations, not only users who opted in manually. + */ +export function resolveEffectiveExecutionContract(params: { + config?: OpenClawConfig; + sessionKey?: string; + agentId?: string | null; + provider?: string | null; + modelId?: string | null; +}): "default" | "strict-agentic" { + const { sessionAgentId } = resolveSessionAgentIds({ + sessionKey: params.sessionKey, + config: params.config, + agentId: params.agentId ?? undefined, + }); + const explicit = resolveAgentExecutionContract(params.config, sessionAgentId); + // strict-agentic is a GPT-5-family openai/openai-codex runtime contract + // regardless of whether it was set explicitly or auto-activated. On an + // unsupported provider/model pair the contract is inert either way, so + // the effective value collapses to "default". + const supported = isStrictAgenticSupportedProviderModel({ + provider: params.provider, + modelId: params.modelId, + }); + if (!supported) { + return "default"; + } + if (explicit === "default") { + return "default"; + } + // Explicit strict-agentic OR unspecified-but-supported → strict-agentic. + return "strict-agentic"; +} + export function isStrictAgenticExecutionContractActive(params: { config?: OpenClawConfig; sessionKey?: string; @@ -9,17 +117,5 @@ export function isStrictAgenticExecutionContractActive(params: { provider?: string | null; modelId?: string | null; }): boolean { - const { sessionAgentId } = resolveSessionAgentIds({ - sessionKey: params.sessionKey, - config: params.config, - agentId: params.agentId ?? undefined, - }); - if (resolveAgentExecutionContract(params.config, sessionAgentId) !== "strict-agentic") { - return false; - } - const provider = normalizeLowercaseStringOrEmpty(params.provider ?? ""); - if (provider !== "openai" && provider !== "openai-codex") { - return false; - } - return /^gpt-5(?:[.-]|$)/i.test(params.modelId?.trim() ?? ""); + return resolveEffectiveExecutionContract(params) === "strict-agentic"; } diff --git a/src/agents/openclaw-tools.update-plan.test.ts b/src/agents/openclaw-tools.update-plan.test.ts index 68912c9b713..9853b92dc64 100644 --- a/src/agents/openclaw-tools.update-plan.test.ts +++ b/src/agents/openclaw-tools.update-plan.test.ts @@ -29,7 +29,12 @@ describe("openclaw-tools update_plan gating", () => { expect(createUpdatePlanTool().displaySummary).toBe("Track a short structured work plan."); }); - it("does not auto-enable update_plan outside strict-agentic mode", () => { + it("auto-enables update_plan for unconfigured GPT-5 openai runs", () => { + // Criterion 1 of the GPT-5.4 parity gate ("no stalls after planning") is + // universal, not opt-in. Unspecified executionContract on a supported + // provider/model auto-activates strict-agentic so unconfigured installs + // get the same behavior as explicit opt-in. Explicit "default" still + // opts out (see "respects explicit default contract opt-out" below). const cfg = { agents: { list: [{ id: "main" }], @@ -43,6 +48,63 @@ describe("openclaw-tools update_plan gating", () => { modelProvider: "openai", modelId: "gpt-5.4", }), + ).toBe(true); + expect( + isUpdatePlanToolEnabledForOpenClawTools({ + config: cfg, + agentSessionKey: "agent:main:main", + modelProvider: "openai-codex", + modelId: "gpt-5.4", + }), + ).toBe(true); + }); + + it("respects explicit default contract opt-out on GPT-5 runs", () => { + // Users who explicitly set executionContract: "default" are saying they + // want the old pre-parity-program behavior. Honor that opt-out. + const cfg = { + agents: { + defaults: { + embeddedPi: { + executionContract: "default", + }, + }, + list: [{ id: "main" }], + }, + } as OpenClawConfig; + + expect( + isUpdatePlanToolEnabledForOpenClawTools({ + config: cfg, + agentSessionKey: "agent:main:main", + modelProvider: "openai", + modelId: "gpt-5.4", + }), + ).toBe(false); + }); + + it("does not auto-enable update_plan for non-openai providers even when unconfigured", () => { + const cfg = { + agents: { + list: [{ id: "main" }], + }, + } as OpenClawConfig; + + expect( + isUpdatePlanToolEnabledForOpenClawTools({ + config: cfg, + agentSessionKey: "agent:main:main", + modelProvider: "anthropic", + modelId: "claude-sonnet-4-6", + }), + ).toBe(false); + expect( + isUpdatePlanToolEnabledForOpenClawTools({ + config: cfg, + agentSessionKey: "agent:main:main", + modelProvider: "openai", + modelId: "gpt-4.1", + }), ).toBe(false); }); @@ -85,7 +147,7 @@ describe("openclaw-tools update_plan gating", () => { config: cfg, agentSessionKey: "agent:main:main", modelProvider: "anthropic", - modelId: "claude-opus-4-6", + modelId: "claude-sonnet-4-6", }), ).toBe(false); expect( diff --git a/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts b/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts index 8d05e4e6e6d..9093aee5a3e 100644 --- a/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts +++ b/src/agents/pi-embedded-runner/run.incomplete-turn.test.ts @@ -104,6 +104,117 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => { ]); }); + it("emits explicit replayInvalid + blocked liveness state at the strict-agentic blocked exit", async () => { + // Criterion 4 of the GPT-5.4 parity gate requires every terminal exit path + // to emit explicit replayInvalid + livenessState. The strict-agentic + // blocked exit is the exact place where strict-agentic is supposed to be + // loudest; it must not fall through to "silent disappearance". + mockedClassifyFailoverReason.mockReturnValue(null); + mockedRunEmbeddedAttempt.mockResolvedValue( + makeAttemptResult({ + assistantTexts: ["I'll inspect the code, make the change, and run the checks."], + }), + ); + + const result = await runEmbeddedPiAgent({ + ...overflowBaseRunParams, + provider: "openai", + model: "gpt-5.4", + runId: "run-strict-agentic-blocked-liveness", + config: { + agents: { + defaults: { + embeddedPi: { + executionContract: "strict-agentic", + }, + }, + list: [{ id: "main" }], + }, + } as OpenClawConfig, + }); + + expect(result.payloads).toEqual([ + { + text: STRICT_AGENTIC_BLOCKED_TEXT, + isError: true, + }, + ]); + expect(result.meta.livenessState).toBe("blocked"); + expect(result.meta.replayInvalid).toBe(false); + }); + + it("auto-activates strict-agentic for unconfigured GPT-5 openai runs and surfaces the blocked state", async () => { + // Criterion 1 of the GPT-5.4 parity gate ("no stalls after planning") must + // cover out-of-the-box installs, not only users who opted in. An + // unconfigured GPT-5.4 openai run should receive the strict-agentic retry + // + blocked-state treatment automatically. + mockedClassifyFailoverReason.mockReturnValue(null); + mockedRunEmbeddedAttempt.mockResolvedValue( + makeAttemptResult({ + assistantTexts: ["I'll inspect the code, make the change, and run the checks."], + }), + ); + + const result = await runEmbeddedPiAgent({ + ...overflowBaseRunParams, + provider: "openai", + model: "gpt-5.4", + runId: "run-strict-agentic-auto-activated", + config: { + agents: { + list: [{ id: "main" }], + }, + } as OpenClawConfig, + }); + + // Two retries (strict-agentic retry cap) plus the original attempt = 3 calls. + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(3); + expect(result.payloads).toEqual([ + { + text: STRICT_AGENTIC_BLOCKED_TEXT, + isError: true, + }, + ]); + expect(result.meta.livenessState).toBe("blocked"); + }); + + it("respects explicit default contract opt-out on GPT-5 openai runs", async () => { + // Users who explicitly set executionContract: "default" opt out of + // auto-activated strict-agentic. They keep the old pre-parity-program + // behavior (1 retry, then fall through to the normal completion path). + mockedClassifyFailoverReason.mockReturnValue(null); + mockedRunEmbeddedAttempt.mockResolvedValue( + makeAttemptResult({ + assistantTexts: ["I'll inspect the code, make the change, and run the checks."], + }), + ); + + const result = await runEmbeddedPiAgent({ + ...overflowBaseRunParams, + provider: "openai", + model: "gpt-5.4", + runId: "run-strict-agentic-explicit-default-optout", + config: { + agents: { + defaults: { + embeddedPi: { + executionContract: "default", + }, + }, + list: [{ id: "main" }], + }, + } as OpenClawConfig, + }); + + // Default contract: 1 retry then falls through. Should NOT surface the + // strict-agentic blocked payload. + expect(mockedRunEmbeddedAttempt).toHaveBeenCalledTimes(2); + const payloadTexts = (result.payloads ?? []).map((payload) => payload.text ?? ""); + for (const text of payloadTexts) { + expect(text).not.toContain("plan-only turns"); + } + }); + it("detects replay-safe planning-only GPT turns", () => { const retryInstruction = resolvePlanningOnlyRetryInstruction({ provider: "openai", @@ -249,6 +360,30 @@ describe("runEmbeddedPiAgent incomplete-turn safety", () => { expect(instruction).toContain("Do not recap or restate the plan"); }); + it("applies the planning-only retry guard to prefixed GPT-5 ids", () => { + const retryInstruction = resolvePlanningOnlyRetryInstruction({ + provider: "openai", + modelId: " openai/gpt-5.4 ", + aborted: false, + timedOut: false, + attempt: makeAttemptResult({ + assistantTexts: ["I'll inspect the code, make the change, and run the checks."], + }), + }); + + expect(retryInstruction).toContain("Do not restate the plan"); + }); + + it("applies the ack-turn fast path to broadened GPT-5-family ids", () => { + const instruction = resolveAckExecutionFastPathInstruction({ + provider: "openai", + modelId: "gpt-5o-mini", + prompt: "go ahead", + }); + + expect(instruction).toContain("Do not recap or restate the plan"); + }); + it("extracts structured steps from planning-only narration", () => { expect( extractPlanningOnlyPlanDetails( diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index f5cd5590171..01b8bc48548 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -112,7 +112,11 @@ import { sessionLikelyHasOversizedToolResults, truncateOversizedToolResultsInSession, } from "./tool-result-truncation.js"; -import type { EmbeddedPiAgentMeta, EmbeddedPiRunResult } from "./types.js"; +import type { + EmbeddedPiAgentMeta, + EmbeddedPiRunResult, + EmbeddedRunLivenessState, +} from "./types.js"; import { createUsageAccumulator, mergeUsageIntoAccumulator } from "./usage-accumulator.js"; type ApiKeyInfo = ResolvedProviderAuth; @@ -1624,6 +1628,24 @@ export async function runEmbeddedPiAgent( `strict-agentic run exhausted planning-only retries: runId=${params.runId} sessionId=${params.sessionId} ` + `provider=${provider}/${modelId} configured=${configuredExecutionContract} — surfacing blocked state`, ); + // Criterion 4 of the GPT-5.4 parity gate requires every terminal + // exit path to emit an explicit livenessState + replayInvalid so + // downstream observers never see "silent disappearance". Every + // other hard-error terminal branch in this file uses "blocked" + // for its livenessState (role ordering, image size, schema + // error, compaction timeout, aborted-with-no-payloads). Match + // that convention here so lifecycle consumers treat an + // isError:true strict-agentic-blocked payload the same way they + // treat any other error-terminal payload. Replay validity is + // delegated to the shared resolver because the plan-only + // transcript itself is replay-safe even though the run is + // terminal. + const replayInvalid = resolveReplayInvalidForAttempt(null); + const livenessState: EmbeddedRunLivenessState = "blocked"; + attempt.setTerminalLifecycleMeta?.({ + replayInvalid, + livenessState, + }); return { payloads: [ { @@ -1637,6 +1659,8 @@ export async function runEmbeddedPiAgent( aborted, systemPromptReport: attempt.systemPromptReport, finalAssistantVisibleText, + replayInvalid, + livenessState, }, didSendViaMessagingTool: attempt.didSendViaMessagingTool, didSendDeterministicApprovalPrompt: attempt.didSendDeterministicApprovalPrompt, diff --git a/src/agents/pi-embedded-runner/run/incomplete-turn.ts b/src/agents/pi-embedded-runner/run/incomplete-turn.ts index 1ab460b8305..06962c7e66d 100644 --- a/src/agents/pi-embedded-runner/run/incomplete-turn.ts +++ b/src/agents/pi-embedded-runner/run/incomplete-turn.ts @@ -1,5 +1,6 @@ import type { EmbeddedPiExecutionContract } from "../../../config/types.agent-defaults.js"; import { normalizeLowercaseStringOrEmpty } from "../../../shared/string-coerce.js"; +import { isStrictAgenticSupportedProviderModel } from "../../execution-contract.js"; import { isLikelyMutatingToolName } from "../../tool-mutation.js"; import type { EmbeddedRunLivenessState } from "../types.js"; import type { EmbeddedRunAttemptResult } from "./types.js"; @@ -194,11 +195,10 @@ function shouldApplyPlanningOnlyRetryGuard(params: { provider?: string; modelId?: string; }): boolean { - const provider = normalizeLowercaseStringOrEmpty(params.provider); - if (provider !== "openai" && provider !== "openai-codex") { - return false; - } - return /^gpt-5(?:[.-]|$)/i.test(params.modelId ?? ""); + return isStrictAgenticSupportedProviderModel({ + provider: params.provider, + modelId: params.modelId, + }); } function normalizeAckPrompt(text: string): string {