Require real behavior proof for external PRs (#77622)

* ci: require real behavior proof for external PRs * fix: tighten real behavior proof heuristics * fix: reject test-only real behavior proof labels --------- Co-authored-by: Peter Steinberger <steipete@gmail.com>
2026-05-06 06:30:42 +00:00 · 2026-05-04 21:45:30 -07:00
parent d02fbc6116
commit 70f34bf177
10 changed files with 671 additions and 11 deletions
--- a/scripts/github/barnacle-auto-response.mjs
+++ b/scripts/github/barnacle-auto-response.mjs
@@ -1,5 +1,13 @@
 // Barnacle owns deterministic GitHub triage and auto-response behavior.

+import {
+  MOCK_ONLY_PROOF_LABEL,
+  NEEDS_REAL_BEHAVIOR_PROOF_LABEL,
+  PROOF_OVERRIDE_LABEL,
+  evaluateRealBehaviorProof,
+  labelsForRealBehaviorProof,
+} from "./real-behavior-proof-policy.mjs";
+
 const activePrLimit = 20;

 const thirdPartyExtensionMessage =
@@ -134,6 +142,18 @@ export const managedLabelSpecs = {
    color: "C5DEF5",
    description: "Candidate: PR template appears mostly untouched.",
  },
+  [NEEDS_REAL_BEHAVIOR_PROOF_LABEL]: {
+    color: "C5DEF5",
+    description: "Candidate: external PR needs after-fix proof from a real setup.",
+  },
+  [MOCK_ONLY_PROOF_LABEL]: {
+    color: "C5DEF5",
+    description: "Candidate: PR proof only shows tests, mocks, snapshots, lint, typecheck, or CI.",
+  },
+  [PROOF_OVERRIDE_LABEL]: {
+    color: "C2E0C6",
+    description: "Maintainer override for the external PR real behavior proof gate.",
+  },
  "triage: dirty-candidate": {
    color: "C5DEF5",
    description: "Candidate: broad unrelated surfaces; may need splitting or cleanup.",
@@ -154,6 +174,8 @@ export const candidateLabels = {
  docsDiscoverability: "triage: docs-discoverability",
  testOnlyNoBug: "triage: test-only-no-bug",
  refactorOnly: "triage: refactor-only",
+  needsRealBehaviorProof: NEEDS_REAL_BEHAVIOR_PROOF_LABEL,
+  mockOnlyProof: MOCK_ONLY_PROOF_LABEL,
  dirtyCandidate: "triage: dirty-candidate",
  riskyInfra: "triage: risky-infra",
  externalPluginCandidate: "triage: external-plugin-candidate",
@@ -196,10 +218,23 @@ const maintainerAuthorLabel = "maintainer";
 const privilegedAuthorAssociations = new Set(["OWNER", "MEMBER", "COLLABORATOR"]);
 const privilegedRepositoryRoles = new Set(["admin", "maintain", "write"]);
 const candidateLabelValues = Object.values(candidateLabels);
+const proofCandidateLabelValues = [NEEDS_REAL_BEHAVIOR_PROOF_LABEL, MOCK_ONLY_PROOF_LABEL];
 const noisyPrMessage =
  "Closing this PR because it looks dirty (too many unrelated or unexpected changes). This usually happens when a branch picks up unrelated commits or a merge went sideways. Please recreate the PR from a clean branch.";

 const candidateActionRules = [
+  {
+    label: candidateLabels.needsRealBehaviorProof,
+    close: true,
+    message:
+      "Closing this PR because it does not include real behavior proof. Please reopen or resubmit with after-fix evidence from a real OpenClaw setup; terminal screenshots, console output, redacted logs, recordings, linked artifacts, and copied live output count. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only.",
+  },
+  {
+    label: candidateLabels.mockOnlyProof,
+    close: true,
+    message:
+      "Closing this PR because the proof only shows tests, mocks, snapshots, lint, typechecks, or CI. Please reopen or resubmit with after-fix evidence from a real OpenClaw setup; terminal screenshots, console output, redacted logs, recordings, linked artifacts, and copied live output count.",
+  },
  {
    label: candidateLabels.dirtyCandidate,
    close: true,
@@ -438,6 +473,14 @@ export function classifyPullRequestCandidateLabels(pullRequest, files) {
    labelsToAdd.push(candidateLabels.blankTemplate);
  }

+  labelsToAdd.push(
+    ...labelsForRealBehaviorProof(
+      evaluateRealBehaviorProof({
+        pullRequest,
+      }),
+    ),
+  );
+
  const docsOnly = filenames.every(isMarkdownOrDocsFile);
  const docsSignal =
    /\b(add|adds|update|updates|fix|fixes|improve|cleanup|clean up|typo|readme|docs?|documentation|translation|translate)\b/i.test(
@@ -718,14 +761,18 @@ async function addMissingLabels(github, context, core, issueNumber, labels, labe

 async function applyPullRequestCandidateLabels(github, context, core, pullRequest, labelSet) {
  const files = await listPullRequestFiles(github, context, pullRequest);
-  await addMissingLabels(
-    github,
-    context,
-    core,
-    pullRequest.number,
-    classifyPullRequestCandidateLabels(pullRequest, files),
-    labelSet,
+  const classifiedLabels = classifyPullRequestCandidateLabels(
+    {
+      ...pullRequest,
+      labels: [...labelSet].map((name) => ({ name })),
+    },
+    files,
  );
+  const staleProofLabels = proofCandidateLabelValues.filter(
+    (label) => labelSet.has(label) && !classifiedLabels.includes(label),
+  );
+  await removeLabels(github, context, pullRequest.number, staleProofLabels, labelSet);
+  await addMissingLabels(github, context, core, pullRequest.number, classifiedLabels, labelSet);
 }

 function isAutomationUser(user, fallbackLogin = "") {
@@ -931,7 +978,9 @@ export async function runBarnacleAutoResponse({ github, context, core = console
  const isLabelEvent = context.payload.action === "labeled";
  const isPrCandidateEvent =
    pullRequest &&
-    ["opened", "edited", "synchronize", "reopened", "labeled"].includes(context.payload.action);
+    ["opened", "edited", "synchronize", "reopened", "labeled", "unlabeled"].includes(
+      context.payload.action,
+    );
  if (!hasTriggerLabel && !isLabelEvent && !isPrCandidateEvent) {
    return;
  }
--- a/scripts/github/real-behavior-proof-check.mjs
+++ b/scripts/github/real-behavior-proof-check.mjs
@@ -0,0 +1,34 @@
+#!/usr/bin/env node
+import { readFileSync } from "node:fs";
+import { evaluateRealBehaviorProof } from "./real-behavior-proof-policy.mjs";
+
+function escapeCommandValue(value) {
+  return String(value)
+    .replace(/%/g, "%25")
+    .replace(/\r/g, "%0D")
+    .replace(/\n/g, "%0A")
+    .replace(/:/g, "%3A");
+}
+
+const eventPath = process.env.GITHUB_EVENT_PATH;
+if (!eventPath) {
+  console.error("::error title=Real behavior proof failed::GITHUB_EVENT_PATH is not set.");
+  process.exit(1);
+}
+
+const event = JSON.parse(readFileSync(eventPath, "utf8"));
+const pullRequest = event.pull_request;
+if (!pullRequest) {
+  console.log("No pull_request payload found; skipping real behavior proof gate.");
+  process.exit(0);
+}
+
+const evaluation = evaluateRealBehaviorProof({ pullRequest });
+if (evaluation.passed) {
+  console.log(evaluation.reason);
+  process.exit(0);
+}
+
+const message = `${evaluation.reason} Add after-fix evidence from a real OpenClaw setup in the PR body. Screenshots, recordings, terminal screenshots, console output, redacted runtime logs, linked artifacts, or copied live output count. Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental only. A maintainer can apply proof: override when appropriate.`;
+console.error(`::error title=Real behavior proof required::${escapeCommandValue(message)}`);
+process.exit(1);
--- a/scripts/github/real-behavior-proof-policy.mjs
+++ b/scripts/github/real-behavior-proof-policy.mjs
@@ -0,0 +1,284 @@
+export const PROOF_OVERRIDE_LABEL = "proof: override";
+export const NEEDS_REAL_BEHAVIOR_PROOF_LABEL = "triage: needs-real-behavior-proof";
+export const MOCK_ONLY_PROOF_LABEL = "triage: mock-only-proof";
+
+const privilegedAuthorAssociations = new Set(["OWNER", "MEMBER", "COLLABORATOR"]);
+
+const requiredProofFields = [
+  {
+    key: "behavior",
+    names: ["Behavior or issue addressed", "Issue addressed", "Behavior addressed"],
+  },
+  {
+    key: "environment",
+    names: ["Real environment tested", "Environment tested", "Real setup tested"],
+  },
+  {
+    key: "steps",
+    names: [
+      "Exact steps or command run after this patch",
+      "Exact steps or command run after the patch",
+      "Exact steps or command run after fix",
+      "Steps run after the patch",
+      "Command run after the patch",
+    ],
+  },
+  {
+    key: "evidence",
+    names: [
+      "Evidence after fix",
+      "After-fix evidence",
+      "Evidence link or embedded proof",
+      "Evidence",
+    ],
+  },
+  {
+    key: "observedResult",
+    names: ["Observed result after fix", "Observed result after the fix", "Observed result"],
+  },
+  {
+    key: "notTested",
+    names: ["What was not tested", "Not tested"],
+    allowNone: true,
+  },
+];
+
+const allProofFieldNames = requiredProofFields
+  .flatMap((field) => field.names)
+  .concat(["Before evidence", "Before evidence optional"]);
+
+const missingValueRegex =
+  /^(?:n\/?a|not applicable|tbd|todo|unknown|unsure|none provided|no evidence|not tested|untested|-|\[[^\]]*\])$/i;
+
+const standaloneMissingProofRegex =
+  /^\s*(?:[-*]\s*)?(?:n\/?a|not applicable|not tested|untested|no evidence|did not test|didn't test|could not test|couldn't test)\s*\.?\s*$/im;
+
+const mockOnlyEvidenceRegex =
+  /\b(?:pnpm|npm|yarn|bun)\s+(?:run\s+)?(?:test|vitest|lint|typecheck|tsgo|build|check)\b|\b(?:vitest|unit tests?|mock(?:ed|s)?|snapshots?|lint|typechecks?|tsgo|ci(?:\s+passes?)?)\b/i;
+
+const artifactEvidenceRegex =
+  /!\[[^\]]*\]\([^)]+\)|github\.com\/user-attachments\/assets\/|github\.com\/[^/\s]+\/[^/\s]+\/actions\/runs\/\d+\/artifacts\/\d+|https?:\/\/\S+\.(?:png|jpe?g|gif|webp|mp4|mov|webm)\b/i;
+
+const evidenceDescriptorRegex =
+  /\b(?:screenshot|screen\s*recording|recording|terminal\s+(?:capture|screenshot|transcript|output)|console\s+(?:output|log)|runtime\s+logs?|redacted\s+logs?|live\s+output|actual\s+output|observed\s+output|stdout|stderr|stack trace|trace excerpt|log excerpt|linked\s+artifacts?|artifact\s+links?)\b|```[\s\S]*\n[\s\S]*\n```/i;
+
+const liveCommandRegex =
+  /\b(?:openclaw|node|docker|curl|gh|ssh|adb|xcrun|xcodebuild|open|npm\s+run|pnpm\s+openclaw)\b/i;
+
+const mockOnlyEvidenceStripRegex =
+  /\b(?:pnpm|npm|yarn|bun)\s+(?:run\s+)?(?:test|vitest|lint|typecheck|tsgo|build|check)\b|\b(?:vitest|unit tests?|mock(?:ed|s)?|snapshots?|lint|typechecks?|tsgo|ci(?:\s+passes?)?|tests?|passed|passes|green|success|succeeded|with|and|the|branch|only|output|transcript|capture|fenced)\b/gi;
+
+const evidenceDescriptorStripRegex =
+  /\b(?:screenshot|screen\s*recording|recording|terminal\s+(?:capture|screenshot|transcript|output)|console\s+(?:output|log)|runtime\s+logs?|redacted\s+logs?|live\s+output|actual\s+output|observed\s+output|stdout|stderr|stack trace|trace excerpt|log excerpt|linked\s+artifacts?|artifact\s+links?)\b/gi;
+
+function escapeRegex(text) {
+  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+}
+
+function labelNames(labels) {
+  return new Set(
+    (labels ?? [])
+      .map((label) => (typeof label === "string" ? label : label?.name))
+      .filter((label) => typeof label === "string"),
+  );
+}
+
+function isAutomationUser(user = {}, fallbackLogin = "") {
+  const login = user?.login ?? fallbackLogin;
+  return user?.type === "Bot" || /\[bot\]$/i.test(login) || login.startsWith("app/");
+}
+
+export function isExternalPullRequest(pullRequest) {
+  if (!pullRequest) {
+    return false;
+  }
+  if (isAutomationUser(pullRequest.user)) {
+    return false;
+  }
+  const authorAssociation = String(
+    pullRequest.author_association ?? pullRequest.authorAssociation ?? "",
+  ).toUpperCase();
+  return !privilegedAuthorAssociations.has(authorAssociation);
+}
+
+export function hasProofOverride(labels) {
+  return labelNames(labels).has(PROOF_OVERRIDE_LABEL);
+}
+
+export function extractRealBehaviorProofSection(body = "") {
+  const headingRegex = /^#{2,6}\s+real behavior proof\b[^\n]*$/gim;
+  const match = headingRegex.exec(body);
+  if (!match) {
+    return "";
+  }
+  const sectionStart = match.index + match[0].length;
+  const rest = body.slice(sectionStart);
+  const nextHeading = rest.match(/\n#{1,6}\s+\S/);
+  return (nextHeading ? rest.slice(0, nextHeading.index) : rest).trim();
+}
+
+function fieldLineRegex(name) {
+  return new RegExp(
+    `^\\s*(?:[-*]\\s*)?(?:\\*\\*)?${escapeRegex(name)}(?:\\s*\\([^)]*\\))?(?:\\*\\*)?\\s*:\\s*(.*)$`,
+    "i",
+  );
+}
+
+function isAnyProofFieldLine(line) {
+  return allProofFieldNames.some((name) => fieldLineRegex(name).test(line));
+}
+
+function extractFieldValue(section, field) {
+  const lines = section.split("\n");
+  for (let index = 0; index < lines.length; index += 1) {
+    const matchingName = field.names.find((name) => fieldLineRegex(name).test(lines[index]));
+    if (!matchingName) {
+      continue;
+    }
+
+    const match = lines[index].match(fieldLineRegex(matchingName));
+    const valueLines = [match?.[1] ?? ""];
+    for (let next = index + 1; next < lines.length; next += 1) {
+      const line = lines[next];
+      if (/^#{1,6}\s+\S/.test(line) || isAnyProofFieldLine(line)) {
+        break;
+      }
+      valueLines.push(line);
+    }
+    return valueLines.join("\n").trim();
+  }
+  return "";
+}
+
+function stripProofFieldLabels(section) {
+  return section
+    .split("\n")
+    .map((line) => {
+      if (!isAnyProofFieldLine(line)) {
+        return line;
+      }
+      const matchingName = allProofFieldNames.find((name) => fieldLineRegex(name).test(line));
+      const match = matchingName ? line.match(fieldLineRegex(matchingName)) : null;
+      return match?.[1] ?? "";
+    })
+    .join("\n");
+}
+
+function isMissingValue(value, field) {
+  const trimmed = value.trim();
+  if (!trimmed) {
+    return true;
+  }
+  if (
+    field.allowNone &&
+    /^(?:none|nothing else|no known gaps|no additional gaps)$/i.test(trimmed)
+  ) {
+    return false;
+  }
+  return missingValueRegex.test(trimmed);
+}
+
+function hasNonMockEvidencePayload(value) {
+  const payload = value
+    .replace(evidenceDescriptorStripRegex, "")
+    .replace(mockOnlyEvidenceStripRegex, "")
+    .replace(/```(?:\w+)?|```/g, "")
+    .replace(/[`$>:\-_.()[\]\s]+/g, "");
+  return Boolean(payload);
+}
+
+function result(status, reason, details = {}) {
+  return {
+    status,
+    reason,
+    applies: ["passed", "missing", "mock_only", "insufficient", "override"].includes(status),
+    passed: ["passed", "skipped", "override"].includes(status),
+    ...details,
+  };
+}
+
+export function evaluateRealBehaviorProof({ pullRequest, labels } = {}) {
+  const currentLabels = labels ?? pullRequest?.labels ?? [];
+  if (hasProofOverride(currentLabels)) {
+    return result("override", `Maintainer override label ${PROOF_OVERRIDE_LABEL} is present.`);
+  }
+  if (!isExternalPullRequest(pullRequest)) {
+    return result("skipped", "Maintainer, collaborator, or bot PRs do not require this gate.");
+  }
+
+  const section = extractRealBehaviorProofSection(pullRequest?.body ?? "");
+  if (!section) {
+    return result(
+      "missing",
+      "External PRs must include a Real behavior proof section with after-fix evidence from a real setup.",
+    );
+  }
+
+  const fields = Object.fromEntries(
+    requiredProofFields.map((field) => [field.key, extractFieldValue(section, field)]),
+  );
+  const missingFields = requiredProofFields
+    .filter((field) => isMissingValue(fields[field.key] ?? "", field))
+    .map((field) => field.key);
+  if (missingFields.length > 0) {
+    return result(
+      "missing",
+      `Real behavior proof is missing required field content: ${missingFields.join(", ")}.`,
+      { fields, missingFields },
+    );
+  }
+
+  const proofContent = stripProofFieldLabels(section);
+  if (standaloneMissingProofRegex.test(proofContent)) {
+    return result("insufficient", "Real behavior proof says the changed behavior was not tested.", {
+      fields,
+    });
+  }
+
+  const evidenceContent = [fields.evidence, fields.observedResult].join("\n");
+  const proofContentForMockDetection = [fields.evidence, fields.observedResult, fields.steps].join(
+    "\n",
+  );
+  const hasArtifactEvidence = artifactEvidenceRegex.test(evidenceContent);
+  const hasNonMockPayload = hasNonMockEvidencePayload(evidenceContent);
+  const hasMockEvidenceSignal = mockOnlyEvidenceRegex.test(proofContentForMockDetection);
+  if (hasMockEvidenceSignal && !hasArtifactEvidence && !hasNonMockPayload) {
+    return result(
+      "mock_only",
+      "Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental and do not count as real behavior proof.",
+      { fields },
+    );
+  }
+
+  const hasRealEvidence =
+    hasArtifactEvidence ||
+    (evidenceDescriptorRegex.test(evidenceContent) && hasNonMockPayload) ||
+    liveCommandRegex.test(evidenceContent);
+  if (hasMockEvidenceSignal && !hasRealEvidence) {
+    return result(
+      "mock_only",
+      "Unit tests, mocks, snapshots, lint, typechecks, and CI are supplemental and do not count as real behavior proof.",
+      { fields },
+    );
+  }
+
+  if (!hasRealEvidence) {
+    return result(
+      "insufficient",
+      "Real behavior proof must include an after-fix screenshot, recording, terminal capture, console output, redacted runtime log, linked artifact, or copied live output.",
+      { fields },
+    );
+  }
+
+  return result("passed", "External PR includes after-fix real behavior proof.", { fields });
+}
+
+export function labelsForRealBehaviorProof(evaluation) {
+  if (evaluation.status === "mock_only") {
+    return [MOCK_ONLY_PROOF_LABEL];
+  }
+  if (evaluation.status === "missing" || evaluation.status === "insufficient") {
+    return [NEEDS_REAL_BEHAVIOR_PROOF_LABEL];
+  }
+  return [];
+}