From 360cb3dbf1f8e45e7cbbae85629adab4e44ba1f4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 20:40:42 +0100 Subject: [PATCH] ci: add test performance agent --- .../codex/prompts/test-performance-agent.md | 41 +++ .github/workflows/test-performance-agent.yml | 274 ++++++++++++++++++ docs/ci.md | 13 + docs/reference/test.md | 2 + 4 files changed, 330 insertions(+) create mode 100644 .github/codex/prompts/test-performance-agent.md create mode 100644 .github/workflows/test-performance-agent.yml diff --git a/.github/codex/prompts/test-performance-agent.md b/.github/codex/prompts/test-performance-agent.md new file mode 100644 index 00000000000..ef3bf4ed5dd --- /dev/null +++ b/.github/codex/prompts/test-performance-agent.md @@ -0,0 +1,41 @@ +# OpenClaw Test Performance Agent + +You are maintaining OpenClaw test performance after a trusted main-branch CI run. + +Goal: inspect the full-suite test performance report, then make small, coverage-preserving improvements to slow tests when the fix is clear. If the baseline report shows failing tests and the fix is obvious, fix those too. + +Inputs: + +- Baseline grouped report: `.artifacts/test-perf/baseline-before.json` +- Per-config Vitest JSON reports: `.artifacts/test-perf/baseline-before/vitest-json/` +- Per-config logs: `.artifacts/test-perf/baseline-before/logs/` + +Hard limits: + +- Preserve test coverage and behavioral intent. +- Do not delete, skip, weaken, or narrow test cases to make the suite faster. +- Do not add `test.skip`, `it.skip`, `describe.skip`, `test.only`, `it.only`, or `describe.only`. +- Do not update snapshots, generated baselines, inventories, ignore files, lockfiles, package metadata, CI workflows, or release metadata. +- Do not add dependencies. +- Do not create, delete, or rename files. +- Keep changes minimal and focused on the slow or failing tests you can justify from the report. +- Prefer no edit when a performance improvement is speculative. 

Good fixes:

- Replace broad partial module mocks, especially `importOriginal()` mocks, with narrow injected dependencies or local runtime seams.
- Avoid importing heavy barrels in hot tests when a narrow module or helper covers the same behavior.
- Move expensive setup from per-test hooks to shared setup only when state isolation remains correct.
- Reuse existing fixtures/builders instead of recreating expensive work per case.
- Mock expensive runtime boundaries directly: filesystem crawls, package registries, provider SDKs, network/process launch, browser/runtime scanners.
- Keep one integration smoke per boundary and test pure helpers directly, but only when the same behavior remains covered.

Required workflow:

1. Run `pnpm docs:list` if available, then read `docs/reference/test.md` and `docs/help/testing.md` sections about test performance.
2. Inspect `.artifacts/test-perf/baseline-before.json`; focus on the slowest files/configs or any failed configs.
3. Pick at most a few low-risk files. Explain the coverage-preserving reason in comments only if the code would otherwise be unclear.
4. Run targeted tests for changed files where possible. Use `pnpm test <file>` and optionally `pnpm test:perf:imports <file>`.
5. Leave the worktree clean if no safe improvement exists.

When uncertain, make no edit and explain the uncertainty in the final message.
diff --git a/.github/workflows/test-performance-agent.yml b/.github/workflows/test-performance-agent.yml new file mode 100644 index 00000000000..efe04ab29c3 --- /dev/null +++ b/.github/workflows/test-performance-agent.yml @@ -0,0 +1,274 @@ +name: Test Performance Agent + +on: + workflow_run: # zizmor: ignore[dangerous-triggers] main-only test optimization after trusted CI; job gates repository, event, branch, actor, conclusion, current main SHA, and daily cadence before using write token + workflows: + - CI + types: + - completed + workflow_dispatch: + +permissions: + actions: read + contents: write + +concurrency: + group: test-performance-agent-main + cancel-in-progress: false + +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" + TEST_PERF_BEFORE: .artifacts/test-perf/baseline-before.json + TEST_PERF_AFTER: .artifacts/test-perf/after-agent.json + TEST_PERF_COMPARE: .artifacts/test-perf/agent-compare.json + +jobs: + optimize-tests: + if: > + github.repository == 'openclaw/openclaw' && + (github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.event == 'push' && + github.event.workflow_run.head_branch == 'main' && + !endsWith(github.event.workflow_run.actor.login, '[bot]'))) + runs-on: blacksmith-32vcpu-ubuntu-2404 + timeout-minutes: 240 + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: main + fetch-depth: 0 + persist-credentials: false + submodules: false + + - name: Gate trusted main activity and daily cadence + id: gate + env: + EVENT_NAME: ${{ github.event_name }} + GH_TOKEN: ${{ github.token }} + WORKFLOW_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} + run: | + set -euo pipefail + + if [ "$EVENT_NAME" != "workflow_run" ]; then + echo "run_agent=true" >> "$GITHUB_OUTPUT" + echo "base_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + exit 0 + fi + + for attempt in 1 2 3 4 5; do + if git fetch --no-tags origin main; then + break + fi + if [ "$attempt" = "5" ]; then 
+ echo "Failed to fetch main after retries." >&2 + exit 1 + fi + echo "Fetch attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + done + + remote_main="$(git rev-parse origin/main)" + if [ "$remote_main" != "$WORKFLOW_HEAD_SHA" ]; then + echo "CI run is superseded by ${remote_main}; skipping test performance agent for ${WORKFLOW_HEAD_SHA}." + echo "run_agent=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + day_start="$(date -u +%Y-%m-%dT00:00:00Z)" + runs_json="$RUNNER_TEMP/test-performance-agent-runs.json" + gh api "repos/${GITHUB_REPOSITORY}/actions/workflows/test-performance-agent.yml/runs" \ + -f branch=main \ + -f event=workflow_run \ + -f per_page=50 > "$runs_json" + + prior_runs="$( + jq -r \ + --argjson current_run_id "$GITHUB_RUN_ID" \ + --arg day_start "$day_start" \ + '.workflow_runs[] + | select(.database_id != $current_run_id) + | select(.created_at >= $day_start) + | select(.status != "cancelled") + | select((.conclusion // "") != "skipped") + | [.database_id, .status, (.conclusion // ""), .created_at, .head_sha] + | @tsv' "$runs_json" + )" + + if [ -n "$prior_runs" ]; then + echo "Test performance agent already ran or is running today; skipping." + printf '%s\n' "$prior_runs" + echo "run_agent=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + echo "run_agent=true" >> "$GITHUB_OUTPUT" + echo "base_sha=${remote_main}" >> "$GITHUB_OUTPUT" + + - name: Setup Node environment + if: steps.gate.outputs.run_agent == 'true' + uses: ./.github/actions/setup-node-env + with: + install-bun: "false" + + - name: Ensure test performance agent key exists + if: steps.gate.outputs.run_agent == 'true' + env: + OPENAI_API_KEY: ${{ secrets.OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY || secrets.OPENAI_API_KEY }} + run: | + set -euo pipefail + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "Missing OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY or OPENAI_API_KEY secret." 
>&2 + exit 1 + fi + + - name: Build baseline full-suite performance report + if: steps.gate.outputs.run_agent == 'true' + run: pnpm test:perf:groups --full-suite --allow-failures --output "$TEST_PERF_BEFORE" --limit 20 --top-files 40 + + - name: Run Codex test performance agent + if: steps.gate.outputs.run_agent == 'true' + uses: openai/codex-action@v1 + with: + openai-api-key: ${{ secrets.OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY || secrets.OPENAI_API_KEY }} + prompt-file: .github/codex/prompts/test-performance-agent.md + model: gpt-5.4 + effort: high + sandbox: workspace-write + safety-strategy: drop-sudo + codex-args: '["--full-auto"]' + + - name: Enforce focused test performance patch + if: steps.gate.outputs.run_agent == 'true' + id: patch + run: | + set -euo pipefail + + untracked="$(git ls-files --others --exclude-standard)" + if [ -n "$untracked" ]; then + echo "Test performance agent created untracked files; forbidden:" + printf '%s\n' "$untracked" + exit 1 + fi + + added_deleted_or_renamed="$(git diff --name-status --diff-filter=ADR)" + if [ -n "$added_deleted_or_renamed" ]; then + echo "Test performance agent added, deleted, or renamed tracked files; forbidden:" + printf '%s\n' "$added_deleted_or_renamed" + exit 1 + fi + + bad_paths="$( + git diff --name-only | while IFS= read -r path; do + case "$path" in + apps/*|extensions/*|packages/*|scripts/*|src/*|Swabble/*|test/*|ui/*) ;; + *) printf '%s\n' "$path" ;; + esac + done + )" + if [ -n "$bad_paths" ]; then + echo "Test performance agent touched forbidden paths:" + printf '%s\n' "$bad_paths" + exit 1 + fi + + if git diff --quiet; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + fi + + - name: Restore Node 24 path + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: | + set -euo pipefail + export PATH="${NODE_BIN}:${PATH}" + echo "${NODE_BIN}" >> "$GITHUB_PATH" + node -v + corepack enable + pnpm -v + + - 
name: Run full-suite performance report after agent changes + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm test:perf:groups --full-suite --output "$TEST_PERF_AFTER" --limit 20 --top-files 40 + + - name: Compare test performance reports + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm test:perf:groups:compare "$TEST_PERF_BEFORE" "$TEST_PERF_AFTER" --output "$TEST_PERF_COMPARE" --limit 20 --top-files 40 + + - name: Enforce coverage-preserving test count + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: | + set -euo pipefail + node <<'NODE' + const fs = require("node:fs"); + const before = JSON.parse(fs.readFileSync(process.env.TEST_PERF_BEFORE, "utf8")); + const after = JSON.parse(fs.readFileSync(process.env.TEST_PERF_AFTER, "utf8")); + + if (before.failed) { + console.log("Baseline had failing configs; skipping total test-count comparison against partial report."); + process.exit(0); + } + + const beforeTests = before.totals?.testCount ?? 0; + const afterTests = after.totals?.testCount ?? 0; + if (afterTests < beforeTests) { + console.error(`Test count decreased from ${beforeTests} to ${afterTests}; refusing coverage-reducing patch.`); + process.exit(1); + } + console.log(`Test count preserved: ${beforeTests} -> ${afterTests}.`); + NODE + + - name: Check changed lanes + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm check:changed + + - name: Commit test performance updates + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + env: + BASE_SHA: ${{ steps.gate.outputs.base_sha }} + GITHUB_TOKEN: ${{ github.token }} + TARGET_BRANCH: main + run: | + set -euo pipefail + + if git diff --quiet; then + echo "No test performance changes." 
+ exit 0 + fi + + git config user.name "openclaw-test-performance-agent[bot]" + git config user.email "openclaw-test-performance-agent[bot]@users.noreply.github.com" + git add apps extensions packages scripts src Swabble test ui + git commit --no-verify -m "test: optimize slow tests" + + for attempt in 1 2 3 4 5; do + if ! git fetch --no-tags origin "${TARGET_BRANCH}"; then + echo "Fetch attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + continue + fi + if git push "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" HEAD:"${TARGET_BRANCH}"; then + exit 0 + fi + remote_main="$(git rev-parse "origin/${TARGET_BRANCH}")" + if [ "$remote_main" != "$BASE_SHA" ]; then + echo "main advanced from ${BASE_SHA} to ${remote_main}; skipping stale test performance update." + exit 0 + fi + echo "Test performance update attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + done + + echo "Failed to push test performance updates after retries." >&2 + exit 1 + + - name: Upload test performance artifacts + if: steps.gate.outputs.run_agent == 'true' && always() + uses: actions/upload-artifact@v7 + with: + name: test-performance-agent-${{ github.run_id }} + path: .artifacts/test-perf/ + if-no-files-found: ignore + retention-days: 14 diff --git a/docs/ci.md b/docs/ci.md index dffbc2f5c86..49b2ea50e3d 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -25,6 +25,16 @@ listed PRs when `apply=true`. Before mutating GitHub, it verifies that the landed PR is merged and that each duplicate has either a shared referenced issue or overlapping changed hunks. +The `Test Performance Agent` workflow is an event-driven Codex maintenance lane +for slow tests. It has no pure schedule: a successful non-bot push CI run on +`main` can trigger it, but it skips if another workflow-run invocation already +ran or is running that UTC day. Manual dispatch bypasses that daily activity +gate. 
The lane builds a full-suite grouped Vitest performance report, lets Codex +make only small coverage-preserving test performance fixes, then reruns the +full-suite report and rejects changes that reduce the passing baseline test +count. If the baseline has failing tests, Codex may fix only obvious failures +and the after-agent full-suite report must pass before anything is committed. + ```bash gh workflow run duplicate-after-merge.yml \ -f landed_pr=70532 \ @@ -56,6 +66,7 @@ gh workflow run duplicate-after-merge.yml \ | `macos-node` | macOS TypeScript test lane using the shared built artifacts | macOS-relevant changes | | `macos-swift` | Swift lint, build, and tests for the macOS app | macOS-relevant changes | | `android` | Android unit tests for both flavors plus one debug APK build | Android-relevant changes | +| `test-performance-agent` | Daily Codex slow-test optimization after trusted activity | Main CI success or manual dispatch | ## Fail-Fast Order @@ -111,4 +122,6 @@ pnpm check:docs # docs format + lint + broken links pnpm build # build dist when CI artifact/build-smoke lanes matter node scripts/ci-run-timings.mjs # summarize wall time, queue time, and slowest jobs node scripts/ci-run-timings.mjs --recent 10 # compare recent successful main CI runs +pnpm test:perf:groups --full-suite --allow-failures --output .artifacts/test-perf/baseline-before.json +pnpm test:perf:groups:compare .artifacts/test-perf/baseline-before.json .artifacts/test-perf/after-agent.json ``` diff --git a/docs/reference/test.md b/docs/reference/test.md index 5287731c0b3..e12b35ce0fe 100644 --- a/docs/reference/test.md +++ b/docs/reference/test.md @@ -29,6 +29,8 @@ title: "Tests" - `pnpm test:perf:changed:bench -- --worktree` benchmarks the current worktree change set without committing first. - `pnpm test:perf:profile:main`: writes a CPU profile for the Vitest main thread (`.artifacts/vitest-main-profile`). 
- `pnpm test:perf:profile:runner`: writes CPU + heap profiles for the unit runner (`.artifacts/vitest-runner-profile`). +- `pnpm test:perf:groups --full-suite --allow-failures --output .artifacts/test-perf/baseline-before.json`: runs every full-suite Vitest leaf config serially and writes grouped duration data plus per-config JSON/log artifacts. The Test Performance Agent uses this as its baseline before attempting slow-test fixes. +- `pnpm test:perf:groups:compare .artifacts/test-perf/baseline-before.json .artifacts/test-perf/after-agent.json`: compares grouped reports after a performance-focused change. - Gateway integration: opt-in via `OPENCLAW_TEST_INCLUDE_GATEWAY=1 pnpm test` or `pnpm test:gateway`. - `pnpm test:e2e`: Runs gateway end-to-end smoke tests (multi-instance WS/HTTP/node pairing). Defaults to `threads` + `isolate: false` with adaptive workers in `vitest.e2e.config.ts`; tune with `OPENCLAW_E2E_WORKERS=` and set `OPENCLAW_E2E_VERBOSE=1` for verbose logs. - `pnpm test:live`: Runs provider live tests (minimax/zai). Requires API keys and `LIVE=1` (or provider-specific `*_LIVE_TEST=1`) to unskip.