ci: retry transient live provider flakes

This commit is contained in:
Peter Steinberger
2026-04-29 11:44:26 +01:00
parent 6b4873d0c1
commit 1dac6ac4c6
3 changed files with 69 additions and 2 deletions

View File

@@ -1923,7 +1923,9 @@ jobs:
- name: Run ${{ matrix.label }}
  if: contains(matrix.profiles, inputs.release_test_profile)
  # The suite command is handed to the retry helper through the environment;
  # the helper reads OPENCLAW_LIVE_COMMAND and runs it. A step may only have
  # one `run` key, so the former direct `run: ${{ matrix.command }}` is gone.
  env:
    OPENCLAW_LIVE_COMMAND: ${{ matrix.command }}
  run: bash .release-harness/scripts/ci-live-command-retry.sh
validate_live_docker_provider_suites:
name: Docker live suites (${{ matrix.label }})
@@ -2082,7 +2084,9 @@ jobs:
- name: Run ${{ matrix.label }}
  if: contains(matrix.profiles, inputs.release_test_profile)
  # The suite command is handed to the retry helper through the environment;
  # the helper reads OPENCLAW_LIVE_COMMAND and runs it. A step may only have
  # one `run` key, so the former direct `run: ${{ matrix.command }}` is gone.
  env:
    OPENCLAW_LIVE_COMMAND: ${{ matrix.command }}
  run: bash .release-harness/scripts/ci-live-command-retry.sh
validate_live_media_provider_suites:
name: Live media suites (${{ matrix.label }})

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env bash
# Run a live-test command and retry it when the failure looks transient.
#
# The command comes from OPENCLAW_LIVE_COMMAND or, when that is unset or empty,
# from the script arguments joined with spaces. It is executed through
# `bash -o pipefail -c`. Its stderr is merged into stdout, streamed through,
# and captured in a temporary log file. A failed attempt is retried only when
# attempts remain and the captured output matches the retry pattern.
#
# Environment:
#   OPENCLAW_LIVE_COMMAND                      command line to run
#   OPENCLAW_LIVE_COMMAND_ATTEMPTS             total attempts, positive integer (default 2)
#   OPENCLAW_LIVE_COMMAND_RETRY_DELAY_SECONDS  pause before a retry, non-negative integer (default 10)
#   OPENCLAW_LIVE_COMMAND_RETRY_PATTERN        extended regex, matched case-insensitively
#
# Exit status:
#   0   the command succeeded on some attempt
#   64  no command was given, or attempts/delay is not a valid integer
#   *   otherwise the exit status of the last attempt of the command
set -euo pipefail

command="${OPENCLAW_LIVE_COMMAND:-}"
if [[ -z "$command" && "$#" -gt 0 ]]; then
  command="$*"
fi
if [[ -z "$command" ]]; then
  echo "Usage: OPENCLAW_LIVE_COMMAND='<command>' $0" >&2
  exit 64
fi

attempts="${OPENCLAW_LIVE_COMMAND_ATTEMPTS:-2}"
delay_seconds="${OPENCLAW_LIVE_COMMAND_RETRY_DELAY_SECONDS:-10}"
retry_pattern="${OPENCLAW_LIVE_COMMAND_RETRY_PATTERN:-ECONNRESET|ETIMEDOUT|ENOTFOUND|EAI_AGAIN|fetch failed|TLS connection|socket hang up|UND_ERR|\\b429\\b|\\b529\\b}"

if ! [[ "$attempts" =~ ^[1-9][0-9]*$ ]]; then
  echo "OPENCLAW_LIVE_COMMAND_ATTEMPTS must be a positive integer, got: $attempts" >&2
  exit 64
fi
if ! [[ "$delay_seconds" =~ ^[0-9]+$ ]]; then
  echo "OPENCLAW_LIVE_COMMAND_RETRY_DELAY_SECONDS must be a non-negative integer, got: $delay_seconds" >&2
  exit 64
fi
# Force base 10: a validated value with a leading zero such as "08" would
# otherwise be read as an invalid octal literal by the numeric test below,
# which errors out and silently skips the delay.
delay_seconds="$((10#$delay_seconds))"

log_file="$(mktemp)"
# Remove the captured-output file on every exit path.
cleanup() {
  rm -f "$log_file"
}
trap cleanup EXIT

for ((attempt = 1; attempt <= attempts; attempt++)); do
  # Start each attempt with an empty log so the retry decision only sees the
  # output of the attempt that just failed.
  : >"$log_file"

  # errexit is suspended so a failing command does not abort the script
  # before its status has been read.
  set +e
  bash -o pipefail -c "$command" 2>&1 | tee "$log_file"
  # PIPESTATUS[0] is the command's own status, not tee's.
  status="${PIPESTATUS[0]}"
  set -e

  if [[ "$status" -eq 0 ]]; then
    exit 0
  fi
  # Out of attempts: report the command's failure unchanged.
  if [[ "$attempt" -ge "$attempts" ]]; then
    exit "$status"
  fi
  # Output that does not match the retry pattern is treated as a real
  # failure and is not retried.
  if ! grep -Eiq "$retry_pattern" "$log_file"; then
    exit "$status"
  fi

  echo "Live command failed with a retryable provider/network error; retrying ($attempt/$attempts)..." >&2
  if [[ "$delay_seconds" -gt 0 ]]; then
    sleep "$delay_seconds"
  fi
done

View File

@@ -119,6 +119,7 @@ describe("package artifact reuse", () => {
it("shards broad native live tests instead of one serial live-all job", () => {
const workflow = readFileSync(LIVE_E2E_WORKFLOW, "utf8");
const retryHelper = readFileSync("scripts/ci-live-command-retry.sh", "utf8");
expect(workflow).not.toContain("suite_id: live-all");
expect(workflow).not.toContain("command: pnpm test:live\n");
@@ -127,6 +128,8 @@ describe("package artifact reuse", () => {
expect(workflow).toContain(
"command: node .release-harness/scripts/test-live-shard.mjs native-live-src-agents",
);
expect(workflow).toContain("OPENCLAW_LIVE_COMMAND: ${{ matrix.command }}");
expect(workflow).toContain("bash .release-harness/scripts/ci-live-command-retry.sh");
expect(workflow).toContain("suite_id: native-live-src-gateway-core");
expect(workflow).toContain("suite_id: native-live-src-gateway-backends");
expect(workflow).toContain("suite_id: native-live-src-gateway-profiles-deepseek");
@@ -150,6 +153,9 @@ describe("package artifact reuse", () => {
expect(workflow).toContain("suite_id: native-live-extensions-media-music-minimax");
expect(workflow).toContain("suite_id: native-live-extensions-media-video");
expect(workflow).not.toContain("needs_ffmpeg: true");
expect(retryHelper).toContain("OPENCLAW_LIVE_COMMAND_ATTEMPTS:-2");
expect(retryHelper).toContain("ECONNRESET");
expect(retryHelper).toContain("fetch failed");
});
it("runs Docker live harnesses from trusted helper scripts", () => {