From 9f213354628fabd2507f5873da827e66dc97345b Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Wed, 29 Apr 2026 17:45:50 +0100 Subject: [PATCH] ci: reduce release runner queue pressure --- .agents/skills/openclaw-testing/SKILL.md | 5 ++- .github/workflows/full-release-validation.yml | 22 ++++++++++ .../openclaw-live-and-e2e-checks-reusable.yml | 4 +- .github/workflows/openclaw-release-checks.yml | 8 ++-- .../workflows/qa-live-transports-convex.yml | 10 ++--- .../package-acceptance-workflow.test.ts | 40 +++++++++++++++++++ 6 files changed, 77 insertions(+), 12 deletions(-) diff --git a/.agents/skills/openclaw-testing/SKILL.md b/.agents/skills/openclaw-testing/SKILL.md index 1b2c0a720cb..5f479af673d 100644 --- a/.agents/skills/openclaw-testing/SKILL.md +++ b/.agents/skills/openclaw-testing/SKILL.md @@ -173,7 +173,10 @@ For bounded recovery after a focused fix, pass `-f rerun_group=`. Supported umbrella groups are `all`, `ci`, `plugin-prerelease`, `release-checks`, `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`, `qa-parity`, `qa-live`, and `npm-telegram`. Use the narrowest group that covers -the failed box. +the failed box. After a targeted release-check fix, do not restart the full +umbrella by habit: dispatch the matching `rerun_group`, cancel older duplicate +runs for the same target/group, and rerun only the parent verifier/evidence step +after the child is green unless the release evidence is stale. ### Release Evidence diff --git a/.github/workflows/full-release-validation.yml b/.github/workflows/full-release-validation.yml index 73e7053fc2e..49b76948b5f 100644 --- a/.github/workflows/full-release-validation.yml +++ b/.github/workflows/full-release-validation.yml @@ -699,6 +699,28 @@ jobs: | map("| `" + (.name | gsub("\\|"; "\\|")) + "` | `" + ((.conclusion // "") | tostring) + "` | " + (.durationMin | tostring) + " |") | .[]) ' || echo "_Unable to summarize jobs for run ${run_id}._" + echo + echo "### Longest queues: ${label}" + echo + gh run view "$run_id" --json createdAt,jobs --jq ' + def ts: fromdateiso8601; + .createdAt as $createdAt | + "| Job | Result | Queue minutes | Run minutes |", + "| --- | --- | ---: | ---: |", + ([.jobs[] + | select(.startedAt != "0001-01-01T00:00:00Z") + | . + { + queueMin: ((((.startedAt | ts) - ($createdAt | ts)) / 60) * 10 | round / 10), + durationMin: (if .completedAt == "0001-01-01T00:00:00Z" then null else ((((.completedAt | ts) - (.startedAt | ts)) / 60) * 10 | round / 10) end) + } + | select(.queueMin > 0) + | {name, conclusion, queueMin, durationMin}] + | sort_by(.queueMin) + | reverse + | .[0:10] + | map("| `" + (.name | gsub("\\|"; "\\|")) + "` | `" + ((.conclusion // "") | tostring) + "` | " + (.queueMin | tostring) + " | " + ((.durationMin // "") | tostring) + " |") + | .[]) + ' || echo "_Unable to summarize queue times for run ${run_id}._" } >> "$GITHUB_STEP_SUMMARY" } diff --git a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml index ac51aa49b5e..bf85f14a580 100644 --- a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml +++ b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml @@ -336,7 +336,7 @@ jobs: validate_repo_e2e: needs: validate_selected_ref if: inputs.include_repo_e2e - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 90 env: OPENCLAW_VITEST_MAX_WORKERS: "2" @@ -363,7 +363,7 @@ jobs: validate_special_e2e: needs: validate_selected_ref if: inputs.include_repo_e2e || (inputs.include_live_suites && !inputs.live_models_only) - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: ${{ matrix.timeout_minutes }} strategy: fail-fast: false diff --git a/.github/workflows/openclaw-release-checks.yml b/.github/workflows/openclaw-release-checks.yml index af45fce7bef..1896c0e301e 100644 --- a/.github/workflows/openclaw-release-checks.yml +++ b/.github/workflows/openclaw-release-checks.yml @@ -481,7 +481,7 @@ jobs: name: Run QA Lab parity lane (${{ matrix.lane }}) needs: [resolve_target] if: contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group) - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 30 permissions: contents: read @@ -565,7 +565,7 @@ jobs: name: Run QA Lab parity report needs: [resolve_target, qa_lab_parity_lane_release_checks] if: contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group) - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 20 permissions: contents: read @@ -621,7 +621,7 @@ jobs: name: Run QA Lab live Matrix lane needs: [resolve_target] if: contains(fromJSON('["all","qa","qa-live"]'), needs.resolve_target.outputs.rerun_group) - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 permissions: contents: read @@ -698,7 +698,7 @@ jobs: name: Run QA Lab live Telegram lane needs: [resolve_target] if: contains(fromJSON('["all","qa","qa-live"]'), needs.resolve_target.outputs.rerun_group) - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 permissions: contents: read diff --git a/.github/workflows/qa-live-transports-convex.yml b/.github/workflows/qa-live-transports-convex.yml index 7651055203d..4827954fe5a 100644 --- a/.github/workflows/qa-live-transports-convex.yml +++ b/.github/workflows/qa-live-transports-convex.yml @@ -143,7 +143,7 @@ jobs: run_mock_parity: name: Run QA Lab parity gate needs: [validate_selected_ref] - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 30 env: QA_PARITY_CONCURRENCY: "1" @@ -215,7 +215,7 @@ jobs: name: Run Matrix live QA lane needs: [authorize_actor, validate_selected_ref] if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.matrix_profile == 'all') }} - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 environment: qa-live-shared steps: @@ -290,7 +290,7 @@ jobs: name: Run Matrix live QA lane (${{ matrix.profile }}) needs: [authorize_actor, validate_selected_ref] if: ${{ github.event_name == 'workflow_dispatch' && inputs.matrix_profile == 'all' }} - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 environment: qa-live-shared strategy: @@ -372,7 +372,7 @@ jobs: run_live_telegram: name: Run Telegram live QA lane with Convex leases needs: [authorize_actor, validate_selected_ref] - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 environment: qa-live-shared steps: @@ -465,7 +465,7 @@ jobs: run_live_discord: name: Run Discord live QA lane with Convex leases needs: [authorize_actor, validate_selected_ref] - runs-on: blacksmith-32vcpu-ubuntu-2404 + runs-on: blacksmith-8vcpu-ubuntu-2404 timeout-minutes: 60 environment: qa-live-shared steps: diff --git a/test/scripts/package-acceptance-workflow.test.ts b/test/scripts/package-acceptance-workflow.test.ts index 18129f9e601..99a51b12fa6 100644 --- a/test/scripts/package-acceptance-workflow.test.ts +++ b/test/scripts/package-acceptance-workflow.test.ts @@ -6,6 +6,7 @@ const LIVE_E2E_WORKFLOW = ".github/workflows/openclaw-live-and-e2e-checks-reusab const NPM_TELEGRAM_WORKFLOW = ".github/workflows/npm-telegram-beta-e2e.yml"; const RELEASE_CHECKS_WORKFLOW = ".github/workflows/openclaw-release-checks.yml"; const FULL_RELEASE_VALIDATION_WORKFLOW = ".github/workflows/full-release-validation.yml"; +const QA_LIVE_TRANSPORTS_WORKFLOW = ".github/workflows/qa-live-transports-convex.yml"; describe("package acceptance workflow", () => { it("resolves candidate package sources before reusing Docker E2E lanes", () => { @@ -138,6 +139,8 @@ describe("package artifact reuse", () => { 'OPENCLAW_LIVE_CLI_BACKEND_ARGS=["exec","--json","--color","never","--sandbox","danger-full-access","--skip-git-repo-check"]', ); expect(workflow).toContain("bash .release-harness/scripts/ci-live-command-retry.sh"); + expect(workflow).toMatch(/validate_repo_e2e:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u); + expect(workflow).toMatch(/validate_special_e2e:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u); expect(workflow).toMatch( /validate_live_provider_suites:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u, ); @@ -349,4 +352,41 @@ describe("package artifact reuse", () => { expect(workflow).not.toContain("workflow_ref:"); expect(workflow).not.toContain("inputs.workflow_ref"); }); + + it("keeps release QA and repo E2E lanes off scarce 32-core runners", () => { + const releaseChecksWorkflow = readFileSync(RELEASE_CHECKS_WORKFLOW, "utf8"); + const qaWorkflow = readFileSync(QA_LIVE_TRANSPORTS_WORKFLOW, "utf8"); + + for (const jobName of [ + "qa_lab_parity_lane_release_checks", + "qa_lab_parity_report_release_checks", + "qa_live_matrix_release_checks", + "qa_live_telegram_release_checks", + ]) { + expect(releaseChecksWorkflow).toMatch( + new RegExp(`${jobName}:[\\s\\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404`, "u"), + ); + } + + for (const jobName of [ + "run_mock_parity", + "run_live_matrix", + "run_live_matrix_sharded", + "run_live_telegram", + "run_live_discord", + ]) { + expect(qaWorkflow).toMatch( + new RegExp(`${jobName}:[\\s\\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404`, "u"), + ); + } + }); + + it("summarizes queue time separately from execution time in full validation", () => { + const workflow = readFileSync(FULL_RELEASE_VALIDATION_WORKFLOW, "utf8"); + + expect(workflow).toContain("### Slowest jobs: ${label}"); + expect(workflow).toContain("### Longest queues: ${label}"); + expect(workflow).toContain("| Job | Result | Queue minutes | Run minutes |"); + expect(workflow).toContain('gh run view "$run_id" --json createdAt,jobs'); + }); });