ci: reduce release runner queue pressure

2026-05-06 09:40:43 +00:00 · 2026-04-29 17:45:50 +01:00
parent b53ec93ed9
commit 9f21335462
6 changed files with 77 additions and 12 deletions
--- a/.agents/skills/openclaw-testing/SKILL.md
+++ b/.agents/skills/openclaw-testing/SKILL.md
@@ -173,7 +173,10 @@ For bounded recovery after a focused fix, pass `-f rerun_group=<group>`.
 Supported umbrella groups are `all`, `ci`, `plugin-prerelease`,
 `release-checks`, `install-smoke`, `cross-os`, `live-e2e`, `package`, `qa`,
 `qa-parity`, `qa-live`, and `npm-telegram`. Use the narrowest group that covers
-the failed box.
+the failed box. After a targeted release-check fix, do not restart the full
+umbrella by habit: dispatch the matching `rerun_group`, cancel older duplicate
+runs for the same target/group, and, once the child is green, rerun only the
+parent verifier/evidence step unless the release evidence is stale.

 ### Release Evidence

--- a/.github/workflows/full-release-validation.yml
+++ b/.github/workflows/full-release-validation.yml
@@ -699,6 +699,28 @@ jobs:
                  | map("| `" + (.name | gsub("\\|"; "\\|")) + "` | `" + ((.conclusion // "") | tostring) + "` | " + (.durationMin | tostring) + " |")
                  | .[])
              ' || echo "_Unable to summarize jobs for run ${run_id}._"
+              echo
+              echo "### Longest queues: ${label}"
+              echo
+              gh run view "$run_id" --json createdAt,jobs --jq '
+                def ts: fromdateiso8601;
+                .createdAt as $createdAt |
+                "| Job | Result | Queue minutes | Run minutes |",
+                "| --- | --- | ---: | ---: |",
+                ([.jobs[]
+                  | select(.startedAt != "0001-01-01T00:00:00Z")
+                  | . + {
+                      queueMin: ((((.startedAt | ts) - ($createdAt | ts)) / 60) * 10 | round / 10),
+                      durationMin: (if .completedAt == "0001-01-01T00:00:00Z" then null else ((((.completedAt | ts) - (.startedAt | ts)) / 60) * 10 | round / 10) end)
+                    }
+                  | select(.queueMin > 0)
+                  | {name, conclusion, queueMin, durationMin}]
+                  | sort_by(.queueMin)
+                  | reverse
+                  | .[0:10]
+                  | map("| `" + (.name | gsub("\\|"; "\\|")) + "` | `" + ((.conclusion // "") | tostring) + "` | " + (.queueMin | tostring) + " | " + ((.durationMin // "") | tostring) + " |")
+                  | .[])
+              ' || echo "_Unable to summarize queue times for run ${run_id}._"
            } >> "$GITHUB_STEP_SUMMARY"
          }

--- a/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml
+++ b/.github/workflows/openclaw-live-and-e2e-checks-reusable.yml
@@ -336,7 +336,7 @@ jobs:
  validate_repo_e2e:
    needs: validate_selected_ref
    if: inputs.include_repo_e2e
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 90
    env:
      OPENCLAW_VITEST_MAX_WORKERS: "2"
@@ -363,7 +363,7 @@ jobs:
  validate_special_e2e:
    needs: validate_selected_ref
    if: inputs.include_repo_e2e || (inputs.include_live_suites && !inputs.live_models_only)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: ${{ matrix.timeout_minutes }}
    strategy:
      fail-fast: false
--- a/.github/workflows/openclaw-release-checks.yml
+++ b/.github/workflows/openclaw-release-checks.yml
@@ -481,7 +481,7 @@ jobs:
    name: Run QA Lab parity lane (${{ matrix.lane }})
    needs: [resolve_target]
    if: contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 30
    permissions:
      contents: read
@@ -565,7 +565,7 @@ jobs:
    name: Run QA Lab parity report
    needs: [resolve_target, qa_lab_parity_lane_release_checks]
    if: contains(fromJSON('["all","qa","qa-parity"]'), needs.resolve_target.outputs.rerun_group)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 20
    permissions:
      contents: read
@@ -621,7 +621,7 @@ jobs:
    name: Run QA Lab live Matrix lane
    needs: [resolve_target]
    if: contains(fromJSON('["all","qa","qa-live"]'), needs.resolve_target.outputs.rerun_group)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    permissions:
      contents: read
@@ -698,7 +698,7 @@ jobs:
    name: Run QA Lab live Telegram lane
    needs: [resolve_target]
    if: contains(fromJSON('["all","qa","qa-live"]'), needs.resolve_target.outputs.rerun_group)
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    permissions:
      contents: read
--- a/.github/workflows/qa-live-transports-convex.yml
+++ b/.github/workflows/qa-live-transports-convex.yml
@@ -143,7 +143,7 @@ jobs:
  run_mock_parity:
    name: Run QA Lab parity gate
    needs: [validate_selected_ref]
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 30
    env:
      QA_PARITY_CONCURRENCY: "1"
@@ -215,7 +215,7 @@ jobs:
    name: Run Matrix live QA lane
    needs: [authorize_actor, validate_selected_ref]
    if: ${{ !(github.event_name == 'workflow_dispatch' && inputs.matrix_profile == 'all') }}
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    environment: qa-live-shared
    steps:
@@ -290,7 +290,7 @@ jobs:
    name: Run Matrix live QA lane (${{ matrix.profile }})
    needs: [authorize_actor, validate_selected_ref]
    if: ${{ github.event_name == 'workflow_dispatch' && inputs.matrix_profile == 'all' }}
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    environment: qa-live-shared
    strategy:
@@ -372,7 +372,7 @@ jobs:
  run_live_telegram:
    name: Run Telegram live QA lane with Convex leases
    needs: [authorize_actor, validate_selected_ref]
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    environment: qa-live-shared
    steps:
@@ -465,7 +465,7 @@ jobs:
  run_live_discord:
    name: Run Discord live QA lane with Convex leases
    needs: [authorize_actor, validate_selected_ref]
-    runs-on: blacksmith-32vcpu-ubuntu-2404
+    runs-on: blacksmith-8vcpu-ubuntu-2404
    timeout-minutes: 60
    environment: qa-live-shared
    steps:
--- a/test/scripts/package-acceptance-workflow.test.ts
+++ b/test/scripts/package-acceptance-workflow.test.ts
@@ -6,6 +6,7 @@ const LIVE_E2E_WORKFLOW = ".github/workflows/openclaw-live-and-e2e-checks-reusab
 const NPM_TELEGRAM_WORKFLOW = ".github/workflows/npm-telegram-beta-e2e.yml";
 const RELEASE_CHECKS_WORKFLOW = ".github/workflows/openclaw-release-checks.yml";
 const FULL_RELEASE_VALIDATION_WORKFLOW = ".github/workflows/full-release-validation.yml";
+const QA_LIVE_TRANSPORTS_WORKFLOW = ".github/workflows/qa-live-transports-convex.yml";

 describe("package acceptance workflow", () => {
  it("resolves candidate package sources before reusing Docker E2E lanes", () => {
@@ -138,6 +139,8 @@ describe("package artifact reuse", () => {
      'OPENCLAW_LIVE_CLI_BACKEND_ARGS=["exec","--json","--color","never","--sandbox","danger-full-access","--skip-git-repo-check"]',
    );
    expect(workflow).toContain("bash .release-harness/scripts/ci-live-command-retry.sh");
+    expect(workflow).toMatch(/validate_repo_e2e:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u);
+    expect(workflow).toMatch(/validate_special_e2e:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u);
    expect(workflow).toMatch(
      /validate_live_provider_suites:[\s\S]*?runs-on: blacksmith-8vcpu-ubuntu-2404/u,
    );
@@ -349,4 +352,41 @@ describe("package artifact reuse", () => {
    expect(workflow).not.toContain("workflow_ref:");
    expect(workflow).not.toContain("inputs.workflow_ref");
  });
+
+  it("keeps release QA and repo E2E lanes off scarce 32-core runners", () => {
+    // Build a matcher scoped to one job's own block. It anchors on the job key
+    // at the 2-space `jobs:` child indent, then skips lines until `runs-on`,
+    // refusing to step over the next sibling job key. A plain lazy `[\s\S]*?`
+    // would keep scanning into later jobs and pass as long as any of them uses
+    // the 8-vCPU label, even if the named job regressed to a 32-vCPU runner.
+    // Blank lines and 2-space-indented comments are tolerated between keys.
+    const runsOn8vcpu = (jobName: string) =>
+      new RegExp(
+        `^  ${jobName}:\\n(?:(?!  [^\\s#])[^\\n]*\\n)*? +runs-on: blacksmith-8vcpu-ubuntu-2404`,
+        "mu",
+      );
+    const releaseChecksWorkflow = readFileSync(RELEASE_CHECKS_WORKFLOW, "utf8");
+    const qaWorkflow = readFileSync(QA_LIVE_TRANSPORTS_WORKFLOW, "utf8");
+
+    for (const jobName of [
+      "qa_lab_parity_lane_release_checks",
+      "qa_lab_parity_report_release_checks",
+      "qa_live_matrix_release_checks",
+      "qa_live_telegram_release_checks",
+    ]) {
+      expect(releaseChecksWorkflow).toMatch(runsOn8vcpu(jobName));
+    }
+
+    for (const jobName of [
+      "run_mock_parity",
+      "run_live_matrix",
+      "run_live_matrix_sharded",
+      "run_live_telegram",
+      "run_live_discord",
+    ]) {
+      expect(qaWorkflow).toMatch(runsOn8vcpu(jobName));
+    }
+  });
+
+  it("summarizes queue time separately from execution time in full validation", () => {
+    const workflow = readFileSync(FULL_RELEASE_VALIDATION_WORKFLOW, "utf8");
+
+    // These are literal workflow snippets: `${label}` and `$run_id` are shell
+    // variables inside the workflow script, so they must stay in plain quoted
+    // strings here rather than template literals.
+    const requiredSnippets = [
+      "### Slowest jobs: ${label}",
+      "### Longest queues: ${label}",
+      "| Job | Result | Queue minutes | Run minutes |",
+      'gh run view "$run_id" --json createdAt,jobs',
+    ];
+
+    for (const snippet of requiredSnippets) {
+      expect(workflow).toContain(snippet);
+    }
+  });
 });