From 77a1b7625d34042c48c064acb75b7352fcce60e5 Mon Sep 17 00:00:00 2001 From: "clawsweeper[bot]" <274271284+clawsweeper[bot]@users.noreply.github.com> Date: Fri, 22 May 2026 11:21:57 +0000 Subject: [PATCH] fix: preserve Google Gemini 3 cron thinking (#85300) Summary: - The branch adds a Google provider thinking-policy resolver and opt-in profile flag, updates shared thinking validation and cron/proof-policy tests, and adjusts ClawSweeper proof parsing. - Reproducibility: yes. source-reproducible: current main applies the generic off-only profile before provider ... figured thinking through that resolver. I did not execute a live systemd cron run in this read-only review. Automerge notes: - PR branch already contained follow-up commit before automerge: fix: preserve Google Gemini 3 cron thinking Validation: - ClawSweeper review passed for head a6cd2e826e8d02027fa6e37e9da4816e1438a2d3. - Required merge gates passed before the squash merge. Prepared head SHA: a6cd2e826e8d02027fa6e37e9da4816e1438a2d3 Review: https://github.com/openclaw/openclaw/pull/85300#issuecomment-4517662575 Co-authored-by: Neerav Makwana <261249544+neeravmakwana@users.noreply.github.com> Co-authored-by: Cursor Co-authored-by: clawsweeper <274271284+clawsweeper[bot]@users.noreply.github.com> Co-authored-by: clawsweeper[bot] <274271284+clawsweeper[bot]@users.noreply.github.com> Approved-by: takhoffman Co-authored-by: takhoffman <781889+takhoffman@users.noreply.github.com> --- CHANGELOG.md | 1 + extensions/google/provider-hooks.ts | 18 +-- extensions/google/provider-policy-api.test.ts | 71 ++++++++++- extensions/google/provider-policy-api.ts | 7 +- extensions/google/provider-policy.ts | 32 +++++ scripts/github/real-behavior-proof-policy.mjs | 27 ++++- src/auto-reply/thinking.test.ts | 32 +++++ src/auto-reply/thinking.ts | 11 +- .../isolated-agent.model-overrides.test.ts | 53 +++++++- src/plugins/plugin-metadata-snapshot.types.ts | 3 +- src/plugins/plugin-registry-snapshot.ts | 4 +- src/plugins/plugin-registry-snapshot.types.ts | 1 + src/plugins/provider-thinking.types.ts | 6 + .../real-behavior-proof-policy.test.ts | 114 +++++++++++++++++- 14 files changed, 346 insertions(+), 34 deletions(-) create mode 100644 src/plugins/plugin-registry-snapshot.types.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index c96bf8c1c60..474352250d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -55,6 +55,7 @@ Docs: https://docs.openclaw.ai - Control UI: keep the chat session picker from hiding older or cross-agent configured conversations while preserving the bounded configured-agent refresh. (#85211) Thanks @amknight. - Agents/Anthropic: preserve unsafe integer tool-call input values in streamed Anthropic tool-use JSON, preventing Discord-style IDs from being rounded before dispatch. Fixes #47229. (#83063) Thanks @leno23. - Agents/hooks: wait for local one-shot CLI and Codex `agent_end` plugin hooks before process cleanup so terminal observability flushes reliably. (#85007) +- Providers/Google: preserve Gemini 3 cron `thinkingDefault: "low"` when stale catalog metadata says `reasoning:false`, so scheduled runs keep provider-supported thinking instead of downgrading to off. (#85185) Thanks @neeravmakwana. - CLI/agents: allow `openclaw agent --session-key` to target explicit session keys, including agent-scoped legacy keys. (#85121) Thanks @Kaspre. - Auto-reply/ACP: wait for same-channel block reply delivery before starting tool work, while still honoring ACP dispatch aborts so stopped turns do not wait on slow channel sends. (#83722) Thanks @IWhatsskill. - Codex/ACP: mark required child-run completions that only report progress, omit a final deliverable, or fail requester delivery as blocked while preserving real final reports. (#85110) Thanks @IWhatsskill. diff --git a/extensions/google/provider-hooks.ts b/extensions/google/provider-hooks.ts index 219d1041512..7b09c48bdde 100644 --- a/extensions/google/provider-hooks.ts +++ b/extensions/google/provider-hooks.ts @@ -4,25 +4,15 @@ import type { } from "openclaw/plugin-sdk/core"; import { buildProviderReplayFamilyHooks } from "openclaw/plugin-sdk/provider-model-shared"; import { buildProviderToolCompatFamilyHooks } from "openclaw/plugin-sdk/provider-tools"; -import { createGoogleThinkingStreamWrapper, isGoogleGemini3ProModel } from "./thinking-api.js"; +import { resolveGoogleThinkingProfile } from "./provider-policy.js"; +import { createGoogleThinkingStreamWrapper } from "./thinking-api.js"; export const GOOGLE_GEMINI_PROVIDER_HOOKS = { ...buildProviderReplayFamilyHooks({ family: "google-gemini", }), ...buildProviderToolCompatFamilyHooks("gemini"), - resolveThinkingProfile: ({ modelId }: ProviderDefaultThinkingPolicyContext) => - ({ - levels: isGoogleGemini3ProModel(modelId) - ? [{ id: "off" }, { id: "low" }, { id: "adaptive" }, { id: "high" }] - : [ - { id: "off" }, - { id: "minimal" }, - { id: "low" }, - { id: "medium" }, - { id: "adaptive" }, - { id: "high" }, - ], - }) satisfies ProviderThinkingProfile, + resolveThinkingProfile: (context: ProviderDefaultThinkingPolicyContext) => + resolveGoogleThinkingProfile(context) satisfies ProviderThinkingProfile | undefined, wrapStreamFn: createGoogleThinkingStreamWrapper, }; diff --git a/extensions/google/provider-policy-api.test.ts b/extensions/google/provider-policy-api.test.ts index 309bd52db90..7b232be6b10 100644 --- a/extensions/google/provider-policy-api.test.ts +++ b/extensions/google/provider-policy-api.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import { normalizeConfig } from "./provider-policy-api.js"; +import { normalizeConfig, resolveThinkingProfile } from "./provider-policy-api.js"; describe("google provider policy public artifact", () => { it("normalizes Google provider config without loading the full provider plugin", () => { @@ -129,4 +129,73 @@ describe("google provider policy public artifact", () => { ], }); }); + + it("preserves Gemini 3 thinking levels when catalog reasoning metadata is stale", () => { + expect( + resolveThinkingProfile({ + provider: "google", + modelId: "gemini-3-flash-preview", + reasoning: false, + }), + ).toEqual({ + levels: [ + { id: "off" }, + { id: "minimal" }, + { id: "low" }, + { id: "medium" }, + { id: "adaptive" }, + { id: "high" }, + ], + preserveWhenCatalogReasoningFalse: true, + }); + }); + + it("preserves provider-prefixed Gemini 3 thinking levels when catalog reasoning metadata is stale", () => { + expect( + resolveThinkingProfile({ + provider: "google", + modelId: "google/gemini-3-flash-preview", + reasoning: false, + }), + ).toMatchObject({ + levels: expect.arrayContaining([{ id: "low" }, { id: "medium" }, { id: "adaptive" }]), + preserveWhenCatalogReasoningFalse: true, + }); + }); + + it("preserves normalized Gemini 3 aliases when catalog reasoning metadata is stale", () => { + expect( + resolveThinkingProfile({ + provider: "google", + modelId: "google/gemini-3-pro", + reasoning: false, + }), + ).toEqual({ + levels: [{ id: "off" }, { id: "low" }, { id: "adaptive" }, { id: "high" }], + preserveWhenCatalogReasoningFalse: true, + }); + }); + + it("preserves Gemini 3 Pro thinking levels when catalog reasoning metadata is stale", () => { + expect( + resolveThinkingProfile({ + provider: "google", + modelId: "gemini-3.1-pro-preview", + reasoning: false, + }), + ).toEqual({ + levels: [{ id: "off" }, { id: "low" }, { id: "adaptive" }, { id: "high" }], + preserveWhenCatalogReasoningFalse: true, + }); + }); + + it("honors catalog reasoning=false for non-Gemini 3 Google models", () => { + expect( + resolveThinkingProfile({ + provider: "google", + modelId: "gemma-4-26b-a4b-it", + reasoning: false, + }), + ).toBeUndefined(); + }); }); diff --git a/extensions/google/provider-policy-api.ts b/extensions/google/provider-policy-api.ts index 3da6b425b3a..bf9a7ef42ac 100644 --- a/extensions/google/provider-policy-api.ts +++ b/extensions/google/provider-policy-api.ts @@ -1,6 +1,11 @@ +import type { ProviderDefaultThinkingPolicyContext } from "openclaw/plugin-sdk/core"; import type { ModelProviderConfig } from "openclaw/plugin-sdk/provider-model-types"; -import { normalizeGoogleProviderConfig } from "./provider-policy.js"; +import { normalizeGoogleProviderConfig, resolveGoogleThinkingProfile } from "./provider-policy.js"; export function normalizeConfig(params: { provider: string; providerConfig: ModelProviderConfig }) { return normalizeGoogleProviderConfig(params.provider, params.providerConfig); } + +export function resolveThinkingProfile(context: ProviderDefaultThinkingPolicyContext) { + return resolveGoogleThinkingProfile(context); +} diff --git a/extensions/google/provider-policy.ts b/extensions/google/provider-policy.ts index 1e7e978d946..d9b8ff91f54 100644 --- a/extensions/google/provider-policy.ts +++ b/extensions/google/provider-policy.ts @@ -1,5 +1,10 @@ +import type { + ProviderDefaultThinkingPolicyContext, + ProviderThinkingProfile, +} from "openclaw/plugin-sdk/core"; import type { ModelProviderConfig } from "openclaw/plugin-sdk/provider-model-types"; import { normalizeAntigravityModelId, normalizeGoogleModelId } from "./model-id.js"; +import { isGoogleGemini3ProModel, isGoogleGemini3ThinkingLevelModel } from "./thinking-api.js"; type GoogleApiCarrier = { api?: string | null; @@ -174,3 +179,30 @@ export function normalizeGoogleProviderConfig( return nextProvider; } + +export function resolveGoogleThinkingProfile({ + modelId, + reasoning, +}: ProviderDefaultThinkingPolicyContext): ProviderThinkingProfile | undefined { + const normalizedModelId = normalizeGoogleModelId(modelId); + const isGemini3ThinkingModel = isGoogleGemini3ThinkingLevelModel(normalizedModelId); + if (reasoning === false && !isGemini3ThinkingModel) { + return undefined; + } + + const levels: ProviderThinkingProfile["levels"] = isGoogleGemini3ProModel(normalizedModelId) + ? [{ id: "off" }, { id: "low" }, { id: "adaptive" }, { id: "high" }] + : [ + { id: "off" }, + { id: "minimal" }, + { id: "low" }, + { id: "medium" }, + { id: "adaptive" }, + { id: "high" }, + ]; + + return { + levels, + ...(isGemini3ThinkingModel ? { preserveWhenCatalogReasoningFalse: true } : {}), + }; +} diff --git a/scripts/github/real-behavior-proof-policy.mjs b/scripts/github/real-behavior-proof-policy.mjs index c76ab5b9bf1..b81cb8b05e7 100644 --- a/scripts/github/real-behavior-proof-policy.mjs +++ b/scripts/github/real-behavior-proof-policy.mjs @@ -6,6 +6,7 @@ export const MOCK_ONLY_PROOF_LABEL = "triage: mock-only-proof"; export const MAINTAINER_TEAM_SLUG = "maintainer"; export const CLAWSWEEPER_PROOF_VERDICT_STATUS = "clawsweeper_exact_head_pass"; +const CLAWSWEEPER_BOT_LOGINS = new Set(["clawsweeper[bot]", "openclaw-clawsweeper[bot]"]); const privilegedAuthorAssociations = new Set(["OWNER", "MEMBER", "COLLABORATOR"]); @@ -142,11 +143,10 @@ export async function isMaintainerTeamMember({ return body?.state === "active"; } -export function extractRealBehaviorProofSection(body = "") { +function extractMarkdownSection(headingRegex, body = "") { // Normalize CRLF → LF so regexes and section slicing see GitHub web-editor PR // bodies the same way as locally-authored Markdown. const normalizedBody = normalizeLineEndings(body); - const headingRegex = /^#{2,6}\s+real behavior proof\b[^\n]*$/gim; const match = headingRegex.exec(normalizedBody); if (!match) { return ""; @@ -157,6 +157,14 @@ export function extractRealBehaviorProofSection(body = "") { return (nextHeading ? rest.slice(0, nextHeading.index) : rest).trim(); } +export function extractRealBehaviorProofSection(body = "") { + return extractMarkdownSection(/^#{2,6}\s+real behavior proof\b[^\n]*$/im, body); +} + +function extractOutOfScopeFollowUpsSection(body = "") { + return extractMarkdownSection(/^#{2,6}\s+out-of-scope follow-ups\b[^\n]*$/im, body); +} + function fieldLineRegex(name) { return new RegExp( `^\\s*(?:[-*]\\s*)?(?:\\*\\*)?${escapeRegex(name)}(?:\\s*\\([^)]*\\))?(?:\\*\\*)?\\s*:\\s*(.*)$`, @@ -246,7 +254,14 @@ function isTrustedClawSweeperComment(comment) { const appSlug = String( comment?.performed_via_github_app?.slug ?? comment?.performedViaGithubApp?.slug ?? "", ).toLowerCase(); - return appSlug === "clawsweeper"; + if (appSlug === "clawsweeper") { + return true; + } + // GitHub can omit performed_via_github_app on issue comments while still + // returning a reserved ClawSweeper App bot identity. + const login = String(comment?.user?.login ?? "").toLowerCase(); + const userType = String(comment?.user?.type ?? ""); + return CLAWSWEEPER_BOT_LOGINS.has(login) && userType === "Bot"; } export function hasClawSweeperExactHeadProof({ pullRequest, comments = [] } = {}) { @@ -292,7 +307,8 @@ export function evaluateRealBehaviorProof({ pullRequest, labels } = {}) { return result("skipped", "Maintainer, collaborator, or bot PRs do not require this gate."); } - const section = extractRealBehaviorProofSection(pullRequest?.body ?? ""); + const body = pullRequest?.body ?? ""; + const section = extractRealBehaviorProofSection(body); if (!section) { return result( "missing", @@ -303,6 +319,9 @@ export function evaluateRealBehaviorProof({ pullRequest, labels } = {}) { const fields = Object.fromEntries( requiredProofFields.map((field) => [field.key, extractFieldValue(section, field)]), ); + if (!fields.notTested) { + fields.notTested = extractOutOfScopeFollowUpsSection(body); + } const missingFields = requiredProofFields .filter((field) => isMissingValue(fields[field.key] ?? "", field)) .map((field) => field.key); diff --git a/src/auto-reply/thinking.test.ts b/src/auto-reply/thinking.test.ts index aa07e77e6cf..680fd9f3e64 100644 --- a/src/auto-reply/thinking.test.ts +++ b/src/auto-reply/thinking.test.ts @@ -192,6 +192,38 @@ describe("listThinkingLevels", () => { ).toBe("off"); }); + it("preserves provider-authoritative thinking profiles over stale catalog reasoning", () => { + providerRuntimeMocks.resolveProviderThinkingProfile.mockReturnValue({ + levels: [{ id: "off" }, { id: "minimal" }, { id: "low" }, { id: "medium" }], + preserveWhenCatalogReasoningFalse: true, + }); + const catalog = [ + { + provider: "google", + id: "gemini-3-flash-preview", + name: "Gemini 3 Flash Preview", + reasoning: false, + }, + ]; + + expect( + isThinkingLevelSupported({ + provider: "google", + model: "gemini-3-flash-preview", + level: "low", + catalog, + }), + ).toBe(true); + expect( + resolveSupportedThinkingLevel({ + provider: "google", + model: "gemini-3-flash-preview", + level: "low", + catalog, + }), + ).toBe("low"); + }); + it("passes catalog reasoning into provider thinking profiles for support checks", () => { providerRuntimeMocks.resolveProviderThinkingProfile.mockImplementation(({ context }) => ({ levels: diff --git a/src/auto-reply/thinking.ts b/src/auto-reply/thinking.ts index e486121f973..fa3c00e1b59 100644 --- a/src/auto-reply/thinking.ts +++ b/src/auto-reply/thinking.ts @@ -166,19 +166,22 @@ export function resolveThinkingProfile(params: { modelId: context.modelId, reasoning: context.reasoning, }; - if (context.reasoning === false) { - return buildOffOnlyThinkingProfile(); - } const pluginProfile = resolveProviderThinkingProfile({ provider: context.normalizedProvider, context: providerContext, }); if (pluginProfile) { const normalized = normalizeThinkingProfile(pluginProfile); - if (normalized.levels.length > 0) { + if ( + normalized.levels.length > 0 && + (context.reasoning !== false || pluginProfile.preserveWhenCatalogReasoningFalse === true) + ) { return normalized; } } + if (context.reasoning === false) { + return buildOffOnlyThinkingProfile(); + } const defaultLevel = resolveProviderDefaultThinkingLevel({ provider: context.normalizedProvider, diff --git a/src/cron/isolated-agent.model-overrides.test.ts b/src/cron/isolated-agent.model-overrides.test.ts index 0cf67de28ac..7d3d889c0ae 100644 --- a/src/cron/isolated-agent.model-overrides.test.ts +++ b/src/cron/isolated-agent.model-overrides.test.ts @@ -1,4 +1,5 @@ import "./isolated-agent.mocks.js"; +import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import { loadModelCatalog } from "../agents/model-catalog.js"; import { runEmbeddedPiAgent } from "../agents/pi-embedded.js"; @@ -21,7 +22,7 @@ import * as isolatedAgentRunRuntime from "./isolated-agent/run.runtime.js"; function installThinkingTestProviders() { const registry = createTestRegistry(); - registry.providers = ["anthropic", "openai", "openrouter"].map( + registry.providers = ["anthropic", "google", "openai", "openrouter"].map( (providerId): PluginProviderRegistration => ({ pluginId: providerId, source: "test", @@ -29,10 +30,18 @@ function installThinkingTestProviders() { id: providerId, label: providerId, auth: [], - resolveThinkingProfile: () => ({ - levels: BASE_THINKING_LEVELS.map((id) => ({ id })), - defaultLevel: "off", - }), + resolveThinkingProfile: ({ modelId }) => + providerId === "google" && modelId === "gemini-3-flash-preview" + ? { + levels: (["off", "minimal", "low", "medium", "adaptive", "high"] as const).map( + (id) => ({ id }), + ), + preserveWhenCatalogReasoningFalse: true, + } + : { + levels: BASE_THINKING_LEVELS.map((id) => ({ id })), + defaultLevel: "off", + }, }, }), ); @@ -253,4 +262,38 @@ describe("runCronIsolatedAgentTurn model overrides", () => { expect(callArgs?.thinkLevel).toBe("low"); }); }); + + it("keeps configured Gemini 3 cron thinking when catalog reasoning metadata is stale", async () => { + await withTempHome(async (home) => { + vi.mocked(isolatedAgentRunRuntime.resolveThinkingDefault).mockReturnValueOnce("low"); + vi.mocked(loadModelCatalog).mockResolvedValueOnce([ + { + id: "gemini-3-flash-preview", + name: "Gemini 3 Flash Preview", + provider: "google", + reasoning: false, + }, + ]); + + await runCronTurn(home, { + cfgOverrides: { + agents: { + defaults: { + model: "google/gemini-3-flash-preview", + workspace: path.join(home, "openclaw"), + thinkingDefault: "low", + }, + }, + }, + jobPayload: DEFAULT_AGENT_TURN_PAYLOAD, + mockTexts: ["done"], + }); + + const calls = vi.mocked(runEmbeddedPiAgent).mock.calls; + const callArgs = calls[calls.length - 1]?.[0]; + expect(callArgs?.provider).toBe("google"); + expect(callArgs?.model).toBe("gemini-3-flash-preview"); + expect(callArgs?.thinkLevel).toBe("low"); + }); + }); }); diff --git a/src/plugins/plugin-metadata-snapshot.types.ts b/src/plugins/plugin-metadata-snapshot.types.ts index d5526b58894..bb085963795 100644 --- a/src/plugins/plugin-metadata-snapshot.types.ts +++ b/src/plugins/plugin-metadata-snapshot.types.ts @@ -2,8 +2,7 @@ import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { InstalledPluginIndex } from "./installed-plugin-index.js"; import type { PluginManifestRecord, PluginManifestRegistry } from "./manifest-registry.js"; import type { PluginDiagnostic } from "./manifest-types.js"; - -export type PluginRegistrySnapshotSource = "provided" | "persisted" | "derived"; +import type { PluginRegistrySnapshotSource } from "./plugin-registry-snapshot.types.js"; export type PluginMetadataSnapshotOwnerMaps = { channels: ReadonlyMap; diff --git a/src/plugins/plugin-registry-snapshot.ts b/src/plugins/plugin-registry-snapshot.ts index 91a8c5c01bc..63440da270a 100644 --- a/src/plugins/plugin-registry-snapshot.ts +++ b/src/plugins/plugin-registry-snapshot.ts @@ -26,12 +26,12 @@ import { type LoadInstalledPluginIndexParams, type RefreshInstalledPluginIndexParams, } from "./installed-plugin-index.js"; -import type { PluginRegistrySnapshotSource } from "./plugin-metadata-snapshot.types.js"; +import type { PluginRegistrySnapshotSource } from "./plugin-registry-snapshot.types.js"; export type PluginRegistrySnapshot = InstalledPluginIndex; export type PluginRegistryRecord = InstalledPluginIndexRecord; export type PluginRegistryInspection = InstalledPluginIndexStoreInspection; -export type { PluginRegistrySnapshotSource }; +export type { PluginRegistrySnapshotSource } from "./plugin-registry-snapshot.types.js"; export type PluginRegistrySnapshotDiagnosticCode = | "persisted-registry-disabled" | "persisted-registry-missing" diff --git a/src/plugins/plugin-registry-snapshot.types.ts b/src/plugins/plugin-registry-snapshot.types.ts new file mode 100644 index 00000000000..d822a63a06b --- /dev/null +++ b/src/plugins/plugin-registry-snapshot.types.ts @@ -0,0 +1 @@ +export type PluginRegistrySnapshotSource = "provided" | "persisted" | "derived"; diff --git a/src/plugins/provider-thinking.types.ts b/src/plugins/provider-thinking.types.ts index 8d28e3324d2..eab175e4608 100644 --- a/src/plugins/provider-thinking.types.ts +++ b/src/plugins/provider-thinking.types.ts @@ -49,4 +49,10 @@ export type ProviderThinkingLevel = { export type ProviderThinkingProfile = { levels: ProviderThinkingLevel[] | ReadonlyArray; defaultLevel?: ProviderThinkingLevelId | null; + /** + * Some bundled providers have model-specific thinking contracts that are more + * current than cached generic catalog metadata. Keep this opt-in so + * `reasoning: false` remains authoritative for ordinary catalog entries. + */ + preserveWhenCatalogReasoningFalse?: boolean; }; diff --git a/test/scripts/real-behavior-proof-policy.test.ts b/test/scripts/real-behavior-proof-policy.test.ts index 01286df72cb..489b76bb34c 100644 --- a/test/scripts/real-behavior-proof-policy.test.ts +++ b/test/scripts/real-behavior-proof-policy.test.ts @@ -85,6 +85,76 @@ describe("real-behavior-proof-policy", () => { expect(labelsForRealBehaviorProof(evaluation)).toEqual([PROOF_SUPPLIED_LABEL]); }); + it("accepts out-of-scope follow-ups as not-tested proof detail", () => { + const body = [ + "## Real behavior proof", + "", + "- Behavior addressed: Cron validation keeps Google Gemini 3 low thinking.", + "- Real environment tested: Local macOS source checkout, Node 24.", + "- Exact steps or command run after this patch:", + " 1. Built the local checkout with `node scripts/build-all.mjs`.", + " 2. Ran a redacted behavior probe for `provider=google`, `model=gemini-3-flash-preview`, and `catalogReasoning=false`.", + '- Evidence after fix: `.artifacts/behavior-85156/after-installed.json` recorded `lowSupported: true` and `fallbackFromLow: "low"`.', + "- Observed result after fix:", + " - `levels: off, minimal, low, medium, adaptive, high`", + " - `lowSupported: true`", + " - `fallbackFromLow: low`", + " - `local command version: OpenClaw 2026.5.21`", + "", + "## Out-of-scope Follow-ups", + "- No live systemd cron schedule was tested.", + "- No real Google provider request was sent.", + ].join("\n"); + const evaluation = evaluateRealBehaviorProof({ + pullRequest: externalPr(body), + }); + + expect(evaluation.status).toBe("passed"); + expect(evaluation.fields?.notTested).toBe( + "- No live systemd cron schedule was tested.\n- No real Google provider request was sent.", + ); + expect(labelsForRealBehaviorProof(evaluation)).toEqual([PROOF_SUPPLIED_LABEL]); + }); + + it("accepts source PR proof when explicit gaps live in out-of-scope follow-ups", () => { + const body = [ + "## Real behavior proof", + "", + '- Behavior addressed: Cron/provider thinking validation no longer downgrades `google/gemini-3-flash-preview` `thinkingDefault: "low"` to `"off"` when cached catalog metadata says `reasoning:false` but the Google provider policy says Gemini 3 supports low thinking.', + "- Real environment tested: Local macOS source checkout, Node v24.8.0, OpenClaw 2026.5.21 (c8a35c4), local `openclaw` shim pointed at the freshly built checkout. No channel credentials or provider API keys were used.", + "- Exact steps or command run after this patch:", + " 1. Built the local checkout with `node scripts/build-all.mjs`.", + " 2. Updated `/Users/example/.local/bin/openclaw` to run this checkout's `openclaw.mjs` and verified `/Users/example/.local/bin/openclaw --version`.", + " 3. Ran a redacted behavior probe for the reported cron validation decision with `provider=google`, `model=gemini-3-flash-preview`, `configuredThinkingDefault=low`, and `catalogReasoning=false`.", + '- Evidence after fix: `.artifacts/behavior-85156/after-installed.json` from the local checkout recorded `lowSupported: true` and `fallbackFromLow: "low"`.', + "- Observed result after fix:", + " - `levels: off, minimal, low, medium, adaptive, high`", + " - `lowSupported: true`", + " - `fallbackFromLow: low`", + " - `local command version: OpenClaw 2026.5.21 (c8a35c4)`", + "", + "## Out-of-scope Follow-ups", + "- No live systemd cron schedule is added in this PR.", + "- No real Google provider request is sent in this PR.", + "- No catalog refresh or provider model-list behavior is changed in this PR.", + "- No channel, gateway allowlist, credential, or auth-profile behavior is changed in this PR.", + ].join("\n"); + const evaluation = evaluateRealBehaviorProof({ + pullRequest: externalPr(body), + }); + + expect(evaluation.status).toBe("passed"); + expect(evaluation.fields?.notTested).toBe( + [ + "- No live systemd cron schedule is added in this PR.", + "- No real Google provider request is sent in this PR.", + "- No catalog refresh or provider model-list behavior is changed in this PR.", + "- No channel, gateway allowlist, credential, or auth-profile behavior is changed in this PR.", + ].join("\n"), + ); + expect(labelsForRealBehaviorProof(evaluation)).toEqual([PROOF_SUPPLIED_LABEL]); + }); + it("fails external PRs without a real behavior proof section", () => { const evaluation = evaluateRealBehaviorProof({ pullRequest: externalPr("## Summary\n\n- Fixed startup."), @@ -234,7 +304,7 @@ describe("real-behavior-proof-policy", () => { expect(evaluateClawSweeperExactHeadProof({ pullRequest, comments }).passed).toBe(false); }); - it("rejects bot-shaped ClawSweeper pass verdict markers without the GitHub App source", () => { + it("accepts exact ClawSweeper bot pass verdict markers when GitHub omits the app source", () => { const pullRequest = { number: 83581, head: { @@ -251,6 +321,48 @@ describe("real-behavior-proof-policy", () => { }, ]; + expect(hasClawSweeperExactHeadProof({ pullRequest, comments })).toBe(true); + expect(evaluateClawSweeperExactHeadProof({ pullRequest, comments }).passed).toBe(true); + }); + + it("accepts exact OpenClaw ClawSweeper bot pass verdict markers when GitHub omits the app source", () => { + const pullRequest = { + number: 83581, + head: { + sha: "06ee95df6608d29a395c52ba8ab53fdd93a9dc4f", + }, + }; + const comments = [ + { + user: { + login: "openclaw-clawsweeper[bot]", + type: "Bot", + }, + body: "", + }, + ]; + + expect(hasClawSweeperExactHeadProof({ pullRequest, comments })).toBe(true); + expect(evaluateClawSweeperExactHeadProof({ pullRequest, comments }).passed).toBe(true); + }); + + it("rejects bot-shaped pass verdict markers from other bot users", () => { + const pullRequest = { + number: 83581, + head: { + sha: "06ee95df6608d29a395c52ba8ab53fdd93a9dc4f", + }, + }; + const comments = [ + { + user: { + login: "not-clawsweeper[bot]", + type: "Bot", + }, + body: "", + }, + ]; + expect(hasClawSweeperExactHeadProof({ pullRequest, comments })).toBe(false); expect(evaluateClawSweeperExactHeadProof({ pullRequest, comments }).passed).toBe(false); });