diff --git a/.github/workflows/real-behavior-proof.yml b/.github/workflows/real-behavior-proof.yml index 7d14fb2d5a6..a51a1a173af 100644 --- a/.github/workflows/real-behavior-proof.yml +++ b/.github/workflows/real-behavior-proof.yml @@ -18,6 +18,7 @@ jobs: name: Real behavior proof permissions: contents: read + issues: read pull-requests: read runs-on: ubuntu-24.04 steps: diff --git a/scripts/github/barnacle-auto-response.mjs b/scripts/github/barnacle-auto-response.mjs index d96098d82e2..5c4177b0560 100644 --- a/scripts/github/barnacle-auto-response.mjs +++ b/scripts/github/barnacle-auto-response.mjs @@ -7,6 +7,7 @@ import { PROOF_SUFFICIENT_LABEL, PROOF_SUPPLIED_LABEL, evaluateRealBehaviorProof, + hasClawSweeperExactHeadProof, labelsForRealBehaviorProof, } from "./real-behavior-proof-policy.mjs"; @@ -767,6 +768,15 @@ async function listPullRequestFiles(github, context, pullRequest) { }); } +async function listIssueComments(github, context, issueNumber) { + return github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + per_page: 100, + }); +} + async function addMissingLabels(github, context, core, issueNumber, labels, labelSet) { const missingLabels = labels.filter((label) => !labelSet.has(label)); if (missingLabels.length === 0) { @@ -784,7 +794,10 @@ async function addMissingLabels(github, context, core, issueNumber, labels, labe core.info(`Added candidate labels to #${issueNumber}: ${missingLabels.join(", ")}`); } -function shouldRemoveProofSufficientLabel(context, proofEvaluation) { +function shouldRemoveProofSufficientLabel(context, proofEvaluation, hasExactHeadClawSweeperProof) { + if (hasExactHeadClawSweeperProof) { + return false; + } if (proofEvaluation.status !== "passed") { return true; } @@ -793,6 +806,12 @@ function shouldRemoveProofSufficientLabel(context, proofEvaluation) { async function applyPullRequestCandidateLabels(github, context, core, pullRequest, labelSet) { const files = await listPullRequestFiles(github, context, pullRequest); + const hasExactHeadClawSweeperProof = + labelSet.has(PROOF_SUFFICIENT_LABEL) && + hasClawSweeperExactHeadProof({ + pullRequest, + comments: await listIssueComments(github, context, pullRequest.number), + }); const proofEvaluation = evaluateRealBehaviorProof({ pullRequest: { ...pullRequest, @@ -811,7 +830,7 @@ async function applyPullRequestCandidateLabels(github, context, core, pullReques ); if ( labelSet.has(PROOF_SUFFICIENT_LABEL) && - shouldRemoveProofSufficientLabel(context, proofEvaluation) + shouldRemoveProofSufficientLabel(context, proofEvaluation, hasExactHeadClawSweeperProof) ) { staleProofLabels.push(PROOF_SUFFICIENT_LABEL); } diff --git a/scripts/github/real-behavior-proof-check.mjs b/scripts/github/real-behavior-proof-check.mjs index fce26ac30d9..a6cdffb6815 100644 --- a/scripts/github/real-behavior-proof-check.mjs +++ b/scripts/github/real-behavior-proof-check.mjs @@ -1,6 +1,7 @@ #!/usr/bin/env node import { readFileSync } from "node:fs"; import { + evaluateClawSweeperExactHeadProof, evaluateRealBehaviorProof, isMaintainerTeamMember, } from "./real-behavior-proof-policy.mjs"; @@ -26,12 +27,12 @@ if (!pullRequest) { process.exit(0); } -const token = process.env.GH_APP_TOKEN; +const appToken = process.env.GH_APP_TOKEN; const org = event.repository?.owner?.login; const authorLogin = pullRequest.user?.login; -if (token && org && authorLogin) { +if (appToken && org && authorLogin) { try { - if (await isMaintainerTeamMember({ token, org, login: authorLogin })) { + if (await isMaintainerTeamMember({ token: appToken, org, login: authorLogin })) { console.log( `PR author @${authorLogin} is an active member of the ${org}/maintainer team; skipping real behavior proof gate.`, ); @@ -50,6 +51,44 @@ if (evaluation.passed) { process.exit(0); } +const token = appToken || process.env.GITHUB_TOKEN; +const repository = process.env.GITHUB_REPOSITORY; +if (token && repository && pullRequest.number) { + const [owner, repo] = repository.split("/"); + const comments = []; + for (let page = 1; page <= 10; page += 1) { + const url = new URL( + `https://api.github.com/repos/${owner}/${repo}/issues/${pullRequest.number}/comments`, + ); + url.searchParams.set("per_page", "100"); + url.searchParams.set("page", String(page)); + const response = await fetch(url, { + headers: { + Accept: "application/vnd.github+json", + Authorization: `Bearer ${token}`, + "X-GitHub-Api-Version": "2022-11-28", + }, + }); + if (!response.ok) { + throw new Error(`Failed to fetch PR comments for proof verdicts: ${response.status}`); + } + const pageComments = await response.json(); + comments.push(...pageComments); + if (pageComments.length < 100) { + break; + } + } + + const clawSweeperEvaluation = evaluateClawSweeperExactHeadProof({ + pullRequest, + comments, + }); + if (clawSweeperEvaluation.passed) { + console.log(clawSweeperEvaluation.reason); + process.exit(0); + } +} + const message = `${evaluation.reason} Add after-fix evidence from a real OpenClaw setup in the PR body. Screenshots, recordings, terminal screenshots, console output, redacted runtime logs, linked artifacts, or copied live output count. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. A maintainer can apply proof: override when appropriate.`; console.error(`::error title=Real behavior proof required::${escapeCommandValue(message)}`); process.exit(1); diff --git a/scripts/github/real-behavior-proof-policy.mjs b/scripts/github/real-behavior-proof-policy.mjs index 24980ea4e4d..9042bdfa744 100644 --- a/scripts/github/real-behavior-proof-policy.mjs +++ b/scripts/github/real-behavior-proof-policy.mjs @@ -5,6 +5,8 @@ export const NEEDS_REAL_BEHAVIOR_PROOF_LABEL = "triage: needs-real-behavior-proo export const MOCK_ONLY_PROOF_LABEL = "triage: mock-only-proof"; export const MAINTAINER_TEAM_SLUG = "maintainer"; +export const CLAWSWEEPER_PROOF_VERDICT_STATUS = "clawsweeper_exact_head_pass"; + const privilegedAuthorAssociations = new Set(["OWNER", "MEMBER", "COLLABORATOR"]); const requiredProofFields = [ @@ -230,11 +232,47 @@ function result(status, reason, details = {}) { status, reason, applies: ["passed", "missing", "mock_only", "insufficient", "override"].includes(status), - passed: ["passed", "skipped", "override"].includes(status), + passed: ["passed", "skipped", "override", CLAWSWEEPER_PROOF_VERDICT_STATUS].includes(status), ...details, }; } +function extractMarkerField(marker, name) { + const match = marker.match(new RegExp(`\\b${escapeRegex(name)}=([^\\s>]+)`, "i")); + return match?.[1] ?? ""; +} + +export function hasClawSweeperExactHeadProof({ pullRequest, comments = [] } = {}) { + const pullNumber = String(pullRequest?.number ?? ""); + const headSha = String(pullRequest?.head?.sha ?? pullRequest?.head_sha ?? "").toLowerCase(); + if (!pullNumber || !/^[0-9a-f]{40}$/i.test(headSha)) { + return false; + } + + for (const comment of comments) { + const body = String(comment?.body ?? ""); + const markers = body.match(//gi) ?? []; + for (const marker of markers) { + const item = extractMarkerField(marker, "item"); + const sha = extractMarkerField(marker, "sha").toLowerCase(); + if (item === pullNumber && sha === headSha) { + return true; + } + } + } + return false; +} + +export function evaluateClawSweeperExactHeadProof({ pullRequest, comments = [] } = {}) { + if (hasClawSweeperExactHeadProof({ pullRequest, comments })) { + return result( + CLAWSWEEPER_PROOF_VERDICT_STATUS, + "ClawSweeper accepted real behavior proof for the exact PR head.", + ); + } + return result("insufficient", "No exact-head ClawSweeper proof verdict was found."); +} + export function evaluateRealBehaviorProof({ pullRequest, labels } = {}) { const currentLabels = labels ?? pullRequest?.labels ?? []; if (hasProofOverride(currentLabels)) { diff --git a/test/scripts/barnacle-auto-response.test.ts b/test/scripts/barnacle-auto-response.test.ts index 058555e2875..4a9eaf6e940 100644 --- a/test/scripts/barnacle-auto-response.test.ts +++ b/test/scripts/barnacle-auto-response.test.ts @@ -135,6 +135,7 @@ function barnacleGithub( maintainerLogins?: string[]; removeLabelNotFound?: string[]; repositoryRoles?: Record; + comments?: Array<{ body: string }>; } = {}, ) { const maintainerLogins = new Set( @@ -154,8 +155,10 @@ function barnacleGithub( removeLabel: [] as Array<{ issue_number: number; name: string }>, update: [] as Array<{ issue_number: number; state?: string }>, }; + const listFiles = async () => files; + const listComments = async () => options.comments ?? []; const github = { - paginate: async () => files, + paginate: async (fn: unknown) => (fn === listComments ? (options.comments ?? []) : files), rest: { issues: { addLabels: async (params: { issue_number: number; labels: string[] }) => { @@ -173,6 +176,7 @@ function barnacleGithub( managedLabelSpecs[params.name as keyof typeof managedLabelSpecs]?.description ?? "", }, }), + listComments, lock: async (params: { issue_number: number; lock_reason?: string }) => { calls.lock.push(params); }, @@ -190,7 +194,7 @@ function barnacleGithub( updateLabel: async () => undefined, }, pulls: { - listFiles: async () => files, + listFiles, }, repos: { getCollaboratorPermissionLevel: async ({ username }: { username: string }) => { @@ -784,6 +788,36 @@ describe("barnacle-auto-response", () => { }, ); + it("preserves sufficient proof on synchronize when ClawSweeper passed the exact head", async () => { + const headSha = "06ee95df6608d29a395c52ba8ab53fdd93a9dc4f"; + const { calls, github } = barnacleGithub([file("src/gateway/server.ts")], { + comments: [ + { + body: ``, + }, + ], + }); + + await runBarnacleAutoResponse({ + github, + context: barnacleContext( + { + body: blankTemplateBody, + head: { sha: headSha }, + }, + [PROOF_SUFFICIENT_LABEL], + { action: "synchronize" }, + ), + core: { + info: () => undefined, + }, + }); + + expect(calls.removeLabel).not.toContainEqual( + expect.objectContaining({ name: PROOF_SUFFICIENT_LABEL }), + ); + }); + it("preserves ClawSweeper's sufficient proof label on ordinary label events", async () => { const { calls, github } = barnacleGithub([file("src/gateway/server.ts")]); diff --git a/test/scripts/real-behavior-proof-policy.test.ts b/test/scripts/real-behavior-proof-policy.test.ts index c81a8f55cf5..dbe0ea33b18 100644 --- a/test/scripts/real-behavior-proof-policy.test.ts +++ b/test/scripts/real-behavior-proof-policy.test.ts @@ -4,7 +4,9 @@ import { NEEDS_REAL_BEHAVIOR_PROOF_LABEL, PROOF_OVERRIDE_LABEL, PROOF_SUPPLIED_LABEL, + evaluateClawSweeperExactHeadProof, evaluateRealBehaviorProof, + hasClawSweeperExactHeadProof, isMaintainerTeamMember, labelsForRealBehaviorProof, } from "../../scripts/github/real-behavior-proof-policy.mjs"; @@ -174,6 +176,35 @@ describe("real-behavior-proof-policy", () => { }).status, ).toBe("override"); }); + + it("accepts ClawSweeper pass verdict comments only for the exact PR head", () => { + const pullRequest = { + number: 83581, + head: { + sha: "06ee95df6608d29a395c52ba8ab53fdd93a9dc4f", + }, + }; + const comments = [ + { + body: [ + "Codex review: passed.", + "", + ].join("\n"), + }, + ]; + + expect(hasClawSweeperExactHeadProof({ pullRequest, comments })).toBe(true); + expect(evaluateClawSweeperExactHeadProof({ pullRequest, comments }).passed).toBe(true); + expect( + hasClawSweeperExactHeadProof({ + pullRequest: { + ...pullRequest, + head: { sha: "d0215b2d67a45a783277fc7d2949ac4a30f63ec6" }, + }, + comments, + }), + ).toBe(false); + }); }); describe("isMaintainerTeamMember", () => {