mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-12 07:20:45 +00:00
fix: reset stale execution state after SIGUSR1 in-process restart (#15195)
Merged via /review-pr -> /prepare-pr -> /merge-pr.
Prepared head SHA: 676f9ec451
Co-authored-by: joeykrug <5925937+joeykrug@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
191
scripts/recover-orphaned-processes.sh
Executable file
191
scripts/recover-orphaned-processes.sh
Executable file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env bash
|
||||
# Scan for orphaned coding agent processes after a gateway restart.
|
||||
#
|
||||
# Background coding agents (Claude Code, Codex CLI) spawned by the gateway
|
||||
# can outlive the session that started them when the gateway restarts.
|
||||
# This script finds them and reports their state.
|
||||
#
|
||||
# Usage:
|
||||
# recover-orphaned-processes.sh
|
||||
#
|
||||
# Output: JSON object with `orphaned` array and `ts` timestamp.
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat <<'USAGE'
|
||||
Usage: recover-orphaned-processes.sh
|
||||
|
||||
Scans for likely orphaned coding agent processes and prints JSON.
|
||||
USAGE
|
||||
}
|
||||
|
||||
if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ]; then
|
||||
usage
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ "$#" -gt 0 ]; then
|
||||
usage >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
if ! command -v node &>/dev/null; then
|
||||
_ts="unknown"
|
||||
command -v date &>/dev/null && _ts="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null)" || true
|
||||
[ -z "$_ts" ] && _ts="unknown"
|
||||
printf '{"error":"node not found on PATH","orphaned":[],"ts":"%s"}\n' "$_ts"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
node <<'NODE'
|
||||
const { execFileSync } = require("node:child_process");
|
||||
const fs = require("node:fs");
|
||||
|
||||
let username = process.env.USER || process.env.LOGNAME || "";
|
||||
|
||||
if (username && !/^[a-zA-Z0-9._-]+$/.test(username)) {
|
||||
username = "";
|
||||
}
|
||||
|
||||
function runFile(file, args) {
|
||||
try {
|
||||
return execFileSync(file, args, {
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "ignore"],
|
||||
});
|
||||
} catch (err) {
|
||||
if (err && typeof err.stdout === "string") {
|
||||
return err.stdout;
|
||||
}
|
||||
if (err && err.stdout && Buffer.isBuffer(err.stdout)) {
|
||||
return err.stdout.toString("utf8");
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
function resolveStarted(pid) {
|
||||
const started = runFile("ps", ["-o", "lstart=", "-p", String(pid)]).trim();
|
||||
return started.length > 0 ? started : "unknown";
|
||||
}
|
||||
|
||||
function resolveCwd(pid) {
|
||||
if (process.platform === "linux") {
|
||||
try {
|
||||
return fs.readlinkSync(`/proc/${pid}/cwd`);
|
||||
} catch {
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
const lsof = runFile("lsof", ["-a", "-d", "cwd", "-p", String(pid), "-Fn"]);
|
||||
const match = lsof.match(/^n(.+)$/m);
|
||||
return match ? match[1] : "unknown";
|
||||
}
|
||||
|
||||
function sanitizeCommand(cmd) {
|
||||
// Avoid leaking obvious secrets when this diagnostic output is shared.
|
||||
return cmd
|
||||
.replace(
|
||||
/(--(?:token|api[-_]?key|password|secret|authorization)\s+)([^\s]+)/gi,
|
||||
"$1<redacted>",
|
||||
)
|
||||
.replace(
|
||||
/((?:token|api[-_]?key|password|secret|authorization)=)([^\s]+)/gi,
|
||||
"$1<redacted>",
|
||||
)
|
||||
.replace(/(Bearer\s+)[A-Za-z0-9._~+/=-]+/g, "$1<redacted>");
|
||||
}
|
||||
|
||||
// Pre-filter candidate PIDs using pgrep to avoid scanning all processes.
|
||||
// Only falls back to a full ps scan when pgrep is genuinely unavailable
|
||||
// (ENOENT), not when it simply finds no matches (exit code 1).
|
||||
let pgrepUnavailable = false;
|
||||
const pgrepResult = (() => {
|
||||
const args =
|
||||
username.length > 0
|
||||
? ["-u", username, "-f", "codex|claude"]
|
||||
: ["-f", "codex|claude"];
|
||||
try {
|
||||
return execFileSync("pgrep", args, {
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "ignore"],
|
||||
});
|
||||
} catch (err) {
|
||||
if (err && err.code === "ENOENT") {
|
||||
pgrepUnavailable = true;
|
||||
return "";
|
||||
}
|
||||
// pgrep exit code 1 = no matches — return stdout (empty)
|
||||
if (err && typeof err.stdout === "string") return err.stdout;
|
||||
return "";
|
||||
}
|
||||
})();
|
||||
|
||||
const candidatePids = pgrepResult
|
||||
.split("\n")
|
||||
.map((s) => s.trim())
|
||||
.filter((s) => s.length > 0 && /^\d+$/.test(s));
|
||||
|
||||
let lines;
|
||||
if (candidatePids.length > 0) {
|
||||
// Fetch command info only for candidate PIDs.
|
||||
lines = runFile("ps", ["-o", "pid=,command=", "-p", candidatePids.join(",")]).split("\n");
|
||||
} else if (pgrepUnavailable && username.length > 0) {
|
||||
// pgrep not installed — fall back to user-scoped ps scan.
|
||||
lines = runFile("ps", ["-U", username, "-o", "pid=,command="]).split("\n");
|
||||
} else if (pgrepUnavailable) {
|
||||
// pgrep not installed and no username — full scan as last resort.
|
||||
lines = runFile("ps", ["-axo", "pid=,command="]).split("\n");
|
||||
} else {
|
||||
// pgrep ran successfully but found no matches — no orphans.
|
||||
lines = [];
|
||||
}
|
||||
|
||||
const includePattern = /codex|claude/i;
|
||||
|
||||
const excludePatterns = [
|
||||
/openclaw-gateway/i,
|
||||
/signal-cli/i,
|
||||
/node_modules\/\.bin\/openclaw/i,
|
||||
/recover-orphaned-processes\.sh/i,
|
||||
];
|
||||
|
||||
const orphaned = [];
|
||||
|
||||
for (const rawLine of lines) {
|
||||
const line = rawLine.trim();
|
||||
if (!line) {
|
||||
continue;
|
||||
}
|
||||
const match = line.match(/^(\d+)\s+(.+)$/);
|
||||
if (!match) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const pid = Number(match[1]);
|
||||
const cmd = match[2];
|
||||
if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) {
|
||||
continue;
|
||||
}
|
||||
if (!includePattern.test(cmd)) {
|
||||
continue;
|
||||
}
|
||||
if (excludePatterns.some((pattern) => pattern.test(cmd))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
orphaned.push({
|
||||
pid,
|
||||
cmd: sanitizeCommand(cmd),
|
||||
cwd: resolveCwd(pid),
|
||||
started: resolveStarted(pid),
|
||||
});
|
||||
}
|
||||
|
||||
process.stdout.write(
|
||||
JSON.stringify({
|
||||
orphaned,
|
||||
ts: new Date().toISOString(),
|
||||
}) + "\n",
|
||||
);
|
||||
NODE
|
||||
Reference in New Issue
Block a user