From 360cb3dbf1f8e45e7cbbae85629adab4e44ba1f4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 23 Apr 2026 20:40:42 +0100 Subject: [PATCH] ci: add test performance agent --- .../codex/prompts/test-performance-agent.md | 41 +++ .github/workflows/test-performance-agent.yml | 274 ++++++++++++++++++ docs/ci.md | 13 + docs/reference/test.md | 2 + 4 files changed, 330 insertions(+) create mode 100644 .github/codex/prompts/test-performance-agent.md create mode 100644 .github/workflows/test-performance-agent.yml diff --git a/.github/codex/prompts/test-performance-agent.md b/.github/codex/prompts/test-performance-agent.md new file mode 100644 index 00000000000..ef3bf4ed5dd --- /dev/null +++ b/.github/codex/prompts/test-performance-agent.md @@ -0,0 +1,41 @@ +# OpenClaw Test Performance Agent + +You are maintaining OpenClaw test performance after a trusted main-branch CI run. + +Goal: inspect the full-suite test performance report, then make small, coverage-preserving improvements to slow tests when the fix is clear. If the baseline report shows failing tests and the fix is obvious, fix those too. + +Inputs: + +- Baseline grouped report: `.artifacts/test-perf/baseline-before.json` +- Per-config Vitest JSON reports: `.artifacts/test-perf/baseline-before/vitest-json/` +- Per-config logs: `.artifacts/test-perf/baseline-before/logs/` + +Hard limits: + +- Preserve test coverage and behavioral intent. +- Do not delete, skip, weaken, or narrow test cases to make the suite faster. +- Do not add `test.skip`, `it.skip`, `describe.skip`, `test.only`, `it.only`, or `describe.only`. +- Do not update snapshots, generated baselines, inventories, ignore files, lockfiles, package metadata, CI workflows, or release metadata. +- Do not add dependencies. +- Do not create, delete, or rename files. +- Keep changes minimal and focused on the slow or failing tests you can justify from the report. +- Prefer no edit when a performance improvement is speculative. 

Good fixes:

- Replace broad partial module mocks, especially `importOriginal()` mocks, with narrow injected dependencies or local runtime seams.
- Avoid importing heavy barrels in hot tests when a narrow module or helper covers the same behavior.
- Move expensive setup from per-test hooks to shared setup only when state isolation remains correct.
- Reuse existing fixtures/builders instead of recreating expensive work per case.
- Mock expensive runtime boundaries directly: filesystem crawls, package registries, provider SDKs, network/process launch, browser/runtime scanners.
- Keep one integration smoke per boundary and test pure helpers directly, but only when the same behavior remains covered.

Required workflow:

1. Run `pnpm docs:list` if available, then read `docs/reference/test.md` and `docs/help/testing.md` sections about test performance.
2. Inspect `.artifacts/test-perf/baseline-before.json`; focus on the slowest files/configs or any failed configs.
3. Pick at most a few low-risk files. Explain the coverage-preserving reason in comments only if the code would otherwise be unclear.
4. Run targeted tests for changed files where possible. Use `pnpm test <file>` and optionally `pnpm test:perf:imports <file>`.
5. Leave the worktree clean if no safe improvement exists.

When uncertain, make no edit and explain the uncertainty in the final message.
diff --git a/.github/workflows/test-performance-agent.yml b/.github/workflows/test-performance-agent.yml new file mode 100644 index 00000000000..efe04ab29c3 --- /dev/null +++ b/.github/workflows/test-performance-agent.yml @@ -0,0 +1,274 @@ +name: Test Performance Agent + +on: + workflow_run: # zizmor: ignore[dangerous-triggers] main-only test optimization after trusted CI; job gates repository, event, branch, actor, conclusion, current main SHA, and daily cadence before using write token + workflows: + - CI + types: + - completed + workflow_dispatch: + +permissions: + actions: read + contents: write + +concurrency: + group: test-performance-agent-main + cancel-in-progress: false + +env: + FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true" + TEST_PERF_BEFORE: .artifacts/test-perf/baseline-before.json + TEST_PERF_AFTER: .artifacts/test-perf/after-agent.json + TEST_PERF_COMPARE: .artifacts/test-perf/agent-compare.json + +jobs: + optimize-tests: + if: > + github.repository == 'openclaw/openclaw' && + (github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success' && + github.event.workflow_run.event == 'push' && + github.event.workflow_run.head_branch == 'main' && + !endsWith(github.event.workflow_run.actor.login, '[bot]'))) + runs-on: blacksmith-32vcpu-ubuntu-2404 + timeout-minutes: 240 + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: main + fetch-depth: 0 + persist-credentials: false + submodules: false + + - name: Gate trusted main activity and daily cadence + id: gate + env: + EVENT_NAME: ${{ github.event_name }} + GH_TOKEN: ${{ github.token }} + WORKFLOW_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} + run: | + set -euo pipefail + + if [ "$EVENT_NAME" != "workflow_run" ]; then + echo "run_agent=true" >> "$GITHUB_OUTPUT" + echo "base_sha=$(git rev-parse HEAD)" >> "$GITHUB_OUTPUT" + exit 0 + fi + + for attempt in 1 2 3 4 5; do + if git fetch --no-tags origin main; then + break + fi + if [ "$attempt" = "5" ]; then 
+ echo "Failed to fetch main after retries." >&2 + exit 1 + fi + echo "Fetch attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + done + + remote_main="$(git rev-parse origin/main)" + if [ "$remote_main" != "$WORKFLOW_HEAD_SHA" ]; then + echo "CI run is superseded by ${remote_main}; skipping test performance agent for ${WORKFLOW_HEAD_SHA}." + echo "run_agent=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + day_start="$(date -u +%Y-%m-%dT00:00:00Z)" + runs_json="$RUNNER_TEMP/test-performance-agent-runs.json" + gh api "repos/${GITHUB_REPOSITORY}/actions/workflows/test-performance-agent.yml/runs" \ + -f branch=main \ + -f event=workflow_run \ + -f per_page=50 > "$runs_json" + + prior_runs="$( + jq -r \ + --argjson current_run_id "$GITHUB_RUN_ID" \ + --arg day_start "$day_start" \ + '.workflow_runs[] + | select(.database_id != $current_run_id) + | select(.created_at >= $day_start) + | select(.status != "cancelled") + | select((.conclusion // "") != "skipped") + | [.database_id, .status, (.conclusion // ""), .created_at, .head_sha] + | @tsv' "$runs_json" + )" + + if [ -n "$prior_runs" ]; then + echo "Test performance agent already ran or is running today; skipping." + printf '%s\n' "$prior_runs" + echo "run_agent=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + echo "run_agent=true" >> "$GITHUB_OUTPUT" + echo "base_sha=${remote_main}" >> "$GITHUB_OUTPUT" + + - name: Setup Node environment + if: steps.gate.outputs.run_agent == 'true' + uses: ./.github/actions/setup-node-env + with: + install-bun: "false" + + - name: Ensure test performance agent key exists + if: steps.gate.outputs.run_agent == 'true' + env: + OPENAI_API_KEY: ${{ secrets.OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY || secrets.OPENAI_API_KEY }} + run: | + set -euo pipefail + if [ -z "${OPENAI_API_KEY:-}" ]; then + echo "Missing OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY or OPENAI_API_KEY secret." 
>&2 + exit 1 + fi + + - name: Build baseline full-suite performance report + if: steps.gate.outputs.run_agent == 'true' + run: pnpm test:perf:groups --full-suite --allow-failures --output "$TEST_PERF_BEFORE" --limit 20 --top-files 40 + + - name: Run Codex test performance agent + if: steps.gate.outputs.run_agent == 'true' + uses: openai/codex-action@v1 + with: + openai-api-key: ${{ secrets.OPENCLAW_TEST_PERF_AGENT_OPENAI_API_KEY || secrets.OPENAI_API_KEY }} + prompt-file: .github/codex/prompts/test-performance-agent.md + model: gpt-5.4 + effort: high + sandbox: workspace-write + safety-strategy: drop-sudo + codex-args: '["--full-auto"]' + + - name: Enforce focused test performance patch + if: steps.gate.outputs.run_agent == 'true' + id: patch + run: | + set -euo pipefail + + untracked="$(git ls-files --others --exclude-standard)" + if [ -n "$untracked" ]; then + echo "Test performance agent created untracked files; forbidden:" + printf '%s\n' "$untracked" + exit 1 + fi + + added_deleted_or_renamed="$(git diff --name-status --diff-filter=ADR)" + if [ -n "$added_deleted_or_renamed" ]; then + echo "Test performance agent added, deleted, or renamed tracked files; forbidden:" + printf '%s\n' "$added_deleted_or_renamed" + exit 1 + fi + + bad_paths="$( + git diff --name-only | while IFS= read -r path; do + case "$path" in + apps/*|extensions/*|packages/*|scripts/*|src/*|Swabble/*|test/*|ui/*) ;; + *) printf '%s\n' "$path" ;; + esac + done + )" + if [ -n "$bad_paths" ]; then + echo "Test performance agent touched forbidden paths:" + printf '%s\n' "$bad_paths" + exit 1 + fi + + if git diff --quiet; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + fi + + - name: Restore Node 24 path + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: | + set -euo pipefail + export PATH="${NODE_BIN}:${PATH}" + echo "${NODE_BIN}" >> "$GITHUB_PATH" + node -v + corepack enable + pnpm -v + + - 
name: Run full-suite performance report after agent changes + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm test:perf:groups --full-suite --output "$TEST_PERF_AFTER" --limit 20 --top-files 40 + + - name: Compare test performance reports + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm test:perf:groups:compare "$TEST_PERF_BEFORE" "$TEST_PERF_AFTER" --output "$TEST_PERF_COMPARE" --limit 20 --top-files 40 + + - name: Enforce coverage-preserving test count + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: | + set -euo pipefail + node <<'NODE' + const fs = require("node:fs"); + const before = JSON.parse(fs.readFileSync(process.env.TEST_PERF_BEFORE, "utf8")); + const after = JSON.parse(fs.readFileSync(process.env.TEST_PERF_AFTER, "utf8")); + + if (before.failed) { + console.log("Baseline had failing configs; skipping total test-count comparison against partial report."); + process.exit(0); + } + + const beforeTests = before.totals?.testCount ?? 0; + const afterTests = after.totals?.testCount ?? 0; + if (afterTests < beforeTests) { + console.error(`Test count decreased from ${beforeTests} to ${afterTests}; refusing coverage-reducing patch.`); + process.exit(1); + } + console.log(`Test count preserved: ${beforeTests} -> ${afterTests}.`); + NODE + + - name: Check changed lanes + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + run: pnpm check:changed + + - name: Commit test performance updates + if: steps.gate.outputs.run_agent == 'true' && steps.patch.outputs.has_changes == 'true' + env: + BASE_SHA: ${{ steps.gate.outputs.base_sha }} + GITHUB_TOKEN: ${{ github.token }} + TARGET_BRANCH: main + run: | + set -euo pipefail + + if git diff --quiet; then + echo "No test performance changes." 
+ exit 0 + fi + + git config user.name "openclaw-test-performance-agent[bot]" + git config user.email "openclaw-test-performance-agent[bot]@users.noreply.github.com" + git add apps extensions packages scripts src Swabble test ui + git commit --no-verify -m "test: optimize slow tests" + + for attempt in 1 2 3 4 5; do + if ! git fetch --no-tags origin "${TARGET_BRANCH}"; then + echo "Fetch attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + continue + fi + if git push "https://x-access-token:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git" HEAD:"${TARGET_BRANCH}"; then + exit 0 + fi + remote_main="$(git rev-parse "origin/${TARGET_BRANCH}")" + if [ "$remote_main" != "$BASE_SHA" ]; then + echo "main advanced from ${BASE_SHA} to ${remote_main}; skipping stale test performance update." + exit 0 + fi + echo "Test performance update attempt ${attempt} failed; retrying." + sleep $((attempt * 2)) + done + + echo "Failed to push test performance updates after retries." >&2 + exit 1 + + - name: Upload test performance artifacts + if: steps.gate.outputs.run_agent == 'true' && always() + uses: actions/upload-artifact@v7 + with: + name: test-performance-agent-${{ github.run_id }} + path: .artifacts/test-perf/ + if-no-files-found: ignore + retention-days: 14 diff --git a/docs/ci.md b/docs/ci.md index dffbc2f5c86..49b2ea50e3d 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -25,6 +25,16 @@ listed PRs when `apply=true`. Before mutating GitHub, it verifies that the landed PR is merged and that each duplicate has either a shared referenced issue or overlapping changed hunks. +The `Test Performance Agent` workflow is an event-driven Codex maintenance lane +for slow tests. It has no pure schedule: a successful non-bot push CI run on +`main` can trigger it, but it skips if another workflow-run invocation already +ran or is running that UTC day. Manual dispatch bypasses that daily activity +gate. 
The lane builds a full-suite grouped Vitest performance report, lets Codex +make only small coverage-preserving test performance fixes, then reruns the +full-suite report and rejects changes that reduce the passing baseline test +count. If the baseline has failing tests, Codex may fix only obvious failures +and the after-agent full-suite report must pass before anything is committed. + ```bash gh workflow run duplicate-after-merge.yml \ -f landed_pr=70532 \ @@ -56,6 +66,7 @@ gh workflow run duplicate-after-merge.yml \ | `macos-node` | macOS TypeScript test lane using the shared built artifacts | macOS-relevant changes | | `macos-swift` | Swift lint, build, and tests for the macOS app | macOS-relevant changes | | `android` | Android unit tests for both flavors plus one debug APK build | Android-relevant changes | +| `test-performance-agent` | Daily Codex slow-test optimization after trusted activity | Main CI success or manual dispatch | ## Fail-Fast Order @@ -111,4 +122,6 @@ pnpm check:docs # docs format + lint + broken links pnpm build # build dist when CI artifact/build-smoke lanes matter node scripts/ci-run-timings.mjs # summarize wall time, queue time, and slowest jobs node scripts/ci-run-timings.mjs --recent 10 # compare recent successful main CI runs +pnpm test:perf:groups --full-suite --allow-failures --output .artifacts/test-perf/baseline-before.json +pnpm test:perf:groups:compare .artifacts/test-perf/baseline-before.json .artifacts/test-perf/after-agent.json ``` diff --git a/docs/reference/test.md b/docs/reference/test.md index 5287731c0b3..e12b35ce0fe 100644 --- a/docs/reference/test.md +++ b/docs/reference/test.md @@ -29,6 +29,8 @@ title: "Tests" - `pnpm test:perf:changed:bench -- --worktree` benchmarks the current worktree change set without committing first. - `pnpm test:perf:profile:main`: writes a CPU profile for the Vitest main thread (`.artifacts/vitest-main-profile`). 
- `pnpm test:perf:profile:runner`: writes CPU + heap profiles for the unit runner (`.artifacts/vitest-runner-profile`). +- `pnpm test:perf:groups --full-suite --allow-failures --output .artifacts/test-perf/baseline-before.json`: runs every full-suite Vitest leaf config serially and writes grouped duration data plus per-config JSON/log artifacts. The Test Performance Agent uses this as its baseline before attempting slow-test fixes. +- `pnpm test:perf:groups:compare .artifacts/test-perf/baseline-before.json .artifacts/test-perf/after-agent.json`: compares grouped reports after a performance-focused change. - Gateway integration: opt-in via `OPENCLAW_TEST_INCLUDE_GATEWAY=1 pnpm test` or `pnpm test:gateway`. - `pnpm test:e2e`: Runs gateway end-to-end smoke tests (multi-instance WS/HTTP/node pairing). Defaults to `threads` + `isolate: false` with adaptive workers in `vitest.e2e.config.ts`; tune with `OPENCLAW_E2E_WORKERS=` and set `OPENCLAW_E2E_VERBOSE=1` for verbose logs. - `pnpm test:live`: Runs provider live tests (minimax/zai). Requires API keys and `LIVE=1` (or provider-specific `*_LIVE_TEST=1`) to unskip.