feat: stream autoreview progress

2026-05-30 13:01:02 +00:00 · 2026-05-27 00:21:57 +01:00
parent 8f1fb675aa
commit 322ceb36ce
2 changed files with 175 additions and 11 deletions
--- a/.agents/skills/autoreview/SKILL.md
+++ b/.agents/skills/autoreview/SKILL.md
@@ -27,10 +27,11 @@ Use when:
 - For security-audit suppression changes, verify accepted findings remain auditable: suppressed findings stay in structured output, active output keeps an unsuppressible suppression notice, and aggregate findings cannot hide unrelated active risk.
 - Never switch or override the requested review engine/model. If the review hits model capacity, retry the same command a few times with the same engine/model.
 - Be patient with large bundles. Structured review can take up to 30 minutes while the model call is active, especially with Codex tools or web search.
- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing.
+- Treat heartbeat lines like `review still running: ... elapsed=... pid=...` as healthy progress, not a hang. Let the helper continue while heartbeats are advancing. Pass `--stream-engine-output` when live engine text is useful; Codex filters tool/file chatter and other engines pass raw output through.
 - Do not kill a review just because it has been quiet for 2-5 minutes, or because it is still running under the 30-minute window. Inspect the process only after missing multiple expected heartbeats, after 30 minutes, or after an obviously failed subprocess; prefer letting the same helper command finish.
 - Tools are useful in review mode. The helper allows read-only inspection tools and web search by default so reviewers can check dependency contracts, upstream docs, and current behavior.
 - Security perspective is always included, but it should not cripple legitimate functionality. Report security findings only when the change creates a concrete, actionable risk or removes an important safety check.
+- For regression provenance, if no blamed PR is traceable, use the blamed commit as the provenance: commit SHA, date, and author username. Do not guess a merger or frame missing PR metadata as a separate finding.
 - Do not invoke built-in `codex review`, nested reviewers, or reviewer panels from inside the review. The helper builds one bundle, calls one selected engine, validates one structured result, and stops.
 - Stop as soon as the helper exits 0 with no accepted/actionable findings. Do not run an extra review just to get a nicer "clean" line, a second opinion, or clearer closeout wording.
 - Treat the helper's successful exit plus absence of actionable findings as the clean review result, even if the underlying Codex CLI output is terse.
@@ -168,11 +169,12 @@ The helper:
 - supports `--engine codex`, `claude`, `droid`, and `copilot`; default is `AUTOREVIEW_ENGINE` or `codex`; Codex should remain the default when nothing is set
 - use `--mode commit --commit <ref>` for already-committed work, especially clean `main` after landing
 - should be left in `--mode auto` or forced to `--mode branch` for PR/branch work; do not force `--mode local` after committing
- writes only to stdout unless `--output` or `--json-output` is set
+- writes only to stdout unless `--output`, `--json-output`, or live streamed engine stderr is set
 - supports `--dry-run`, `--parallel-tests`, `--prompt`, `--prompt-file`, `--dataset`, `--no-tools`, `--no-web-search`, and commit refs
+- supports `--stream-engine-output` or `AUTOREVIEW_STREAM_ENGINE_OUTPUT=1` for live engine text while preserving structured validation; Codex hides tool/file event details, emits compact activity summaries, and reports token usage at turn completion
 - supports opt-in review panels with `--panel` / `--reviewers`, plus per-engine `--model` and `--thinking`
 - allows read-only tools and web search by default where the selected CLI supports them; forbids nested review in the prompt; Codex is run through `codex exec` with read-only sandbox and structured output
- prints `review still running: <engine> elapsed=<seconds>s pid=<pid>` to stderr at long-running intervals while waiting for the selected review engine
+- prints `review still running: <engine> elapsed=<seconds>s pid=<pid>` to stderr at long-running intervals while waiting for the selected review engine, unless streamed output or compact Codex activity has been visible recently
 - prints `autoreview clean: no accepted/actionable findings reported` when the selected review command exits 0
 - exits nonzero when accepted/actionable findings are present

--- a/.agents/skills/autoreview/scripts/autoreview
+++ b/.agents/skills/autoreview/scripts/autoreview
@@ -6,13 +6,15 @@ import concurrent.futures
 import copy
 import json
 import os
+import queue
 import subprocess
 import sys
 import tempfile
 import textwrap
+import threading
 import time
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable


 ENGINES = ("codex", "claude", "droid", "copilot")
@@ -100,7 +102,18 @@ def run_with_heartbeat(
    input_text: str | None = None,
    label: str,
    heartbeat_seconds: int = 60,
+    stream_output: bool = False,
+    stream_display: Callable[[str, str], str | None] | None = None,
 ) -> subprocess.CompletedProcess[str]:
+    if stream_output:
+        return run_with_stream(
+            args,
+            cwd,
+            input_text=input_text,
+            label=label,
+            heartbeat_seconds=heartbeat_seconds,
+            stream_display=stream_display,
+        )
    started = time.monotonic()
    proc = subprocess.Popen(
        args,
@@ -124,6 +137,82 @@ def run_with_heartbeat(
            print(f"review still running: {label} elapsed={elapsed}s pid={proc.pid}", file=sys.stderr, flush=True)


+def run_with_stream(
+    args: list[str],
+    cwd: Path,
+    *,
+    input_text: str | None,
+    label: str,
+    heartbeat_seconds: int,
+    stream_display: Callable[[str, str], str | None] | None,
+) -> subprocess.CompletedProcess[str]:
+    started = time.monotonic()
+    proc = subprocess.Popen(
+        args,
+        cwd=cwd,
+        stdin=subprocess.PIPE if input_text is not None else None,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+        bufsize=1,
+    )
+    events: queue.Queue[tuple[str, str | None]] = queue.Queue()
+    stdout_parts: list[str] = []
+    stderr_parts: list[str] = []
+
+    def read_stream(name: str, stream: Any) -> None:
+        try:
+            for line in iter(stream.readline, ""):
+                events.put((name, line))
+        finally:
+            events.put((name, None))
+
+    def write_stdin() -> None:
+        if proc.stdin is None or input_text is None:
+            return
+        try:
+            proc.stdin.write(input_text)
+            proc.stdin.close()
+        except BrokenPipeError:
+            return
+
+    threads = [
+        threading.Thread(target=read_stream, args=("stdout", proc.stdout), daemon=True),
+        threading.Thread(target=read_stream, args=("stderr", proc.stderr), daemon=True),
+    ]
+    for thread in threads:
+        thread.start()
+    stdin_thread = threading.Thread(target=write_stdin, daemon=True)
+    stdin_thread.start()
+
+    open_streams = 2
+    while open_streams:
+        try:
+            name, line = events.get(timeout=heartbeat_seconds)
+        except queue.Empty:
+            elapsed = int(time.monotonic() - started)
+            print(f"review still running: {label} elapsed={elapsed}s pid={proc.pid}", file=sys.stderr, flush=True)
+            continue
+        if line is None:
+            open_streams -= 1
+            continue
+        if name == "stdout":
+            stdout_parts.append(line)
+        else:
+            stderr_parts.append(line)
+        display = stream_display(name, line) if stream_display else line
+        if display:
+            target = sys.stdout if name == "stdout" else sys.stderr
+            target.write(display)
+            target.flush()
+
+    for thread in threads:
+        thread.join()
+    stdin_thread.join(timeout=1)
+    returncode = proc.wait()
+    return subprocess.CompletedProcess(args, returncode, "".join(stdout_parts), "".join(stderr_parts))
+
+
 def git(repo: Path, *args: str, check: bool = True) -> str:
    return run(["git", *args], repo, check=check).stdout

@@ -336,9 +425,11 @@ def run_codex(args: argparse.Namespace, repo: Path, prompt: str) -> str:
        cmd.extend(["--model", args.model])
    if args.thinking:
        cmd.extend(["-c", f'model_reasoning_effort="{args.thinking}"'])
+    cmd.append("exec")
+    if args.stream_engine_output:
+        cmd.append("--json")
    cmd.extend(
        [
-            "exec",
            "--ephemeral",
            "-C",
            str(repo),
@@ -351,7 +442,14 @@ def run_codex(args: argparse.Namespace, repo: Path, prompt: str) -> str:
            "-",
        ]
    )
-    result = run_with_heartbeat(cmd, repo, input_text=prompt, label="codex")
+    result = run_with_heartbeat(
+        cmd,
+        repo,
+        input_text=prompt,
+        label="codex",
+        stream_output=args.stream_engine_output,
+        stream_display=CodexStreamDisplay() if args.stream_engine_output else None,
+    )
    try:
        output = output_path.read_text()
    finally:
@@ -368,7 +466,7 @@ def run_claude(args: argparse.Namespace, repo: Path, prompt: str) -> str:
        "--print",
        "--no-session-persistence",
        "--output-format",
-        "json",
+        "stream-json" if args.stream_engine_output else "json",
        "--json-schema",
        json.dumps(SCHEMA),
    ]
@@ -376,11 +474,13 @@ def run_claude(args: argparse.Namespace, repo: Path, prompt: str) -> str:
        cmd.extend(["--allowedTools", claude_allowed_tools(args)])
    else:
        cmd.extend(["--tools", ""])
+    if args.stream_engine_output:
+        cmd.append("--verbose")
    if args.model:
        cmd.extend(["--model", args.model])
    if args.thinking:
        cmd.extend(["--effort", args.thinking])
-    result = run_with_heartbeat(cmd, repo, input_text=prompt, label="claude")
+    result = run_with_heartbeat(cmd, repo, input_text=prompt, label="claude", stream_output=args.stream_engine_output)
    if result.returncode != 0:
        raise SystemExit(f"claude engine failed ({result.returncode})\n{result.stderr or result.stdout}")
    return result.stdout
@@ -405,7 +505,7 @@ def run_droid(args: argparse.Namespace, repo: Path, prompt: str) -> str:
        cmd.extend(["--model", args.model])
    if not args.tools:
        cmd.extend(["--disabled-tools", "*"])
-    result = run_with_heartbeat(cmd, repo, label="droid")
+    result = run_with_heartbeat(cmd, repo, label="droid", stream_output=args.stream_engine_output)
    prompt_path.unlink(missing_ok=True)
    if result.returncode != 0:
        raise SystemExit(f"droid engine failed ({result.returncode})\n{result.stderr or result.stdout}")
@@ -430,7 +530,7 @@ def run_copilot(args: argparse.Namespace, repo: Path, prompt: str) -> str:
            "--output-format",
            "json",
            "--stream",
-            "off",
+            "on" if args.stream_engine_output else "off",
            "--no-ask-user",
            "--disable-builtin-mcps",
        ]
@@ -447,12 +547,68 @@ def run_copilot(args: argparse.Namespace, repo: Path, prompt: str) -> str:
        )
        if args.web_search:
            cmd.append("--allow-all-urls")
-        result = run_with_heartbeat(cmd, Path(tempdir), label="copilot")
+        result = run_with_heartbeat(cmd, Path(tempdir), label="copilot", stream_output=args.stream_engine_output)
    if result.returncode != 0:
        raise SystemExit(f"copilot engine failed ({result.returncode})\n{result.stderr or result.stdout}")
    return result.stdout


+class CodexStreamDisplay:
+    def __init__(self, *, activity_seconds: int = 20) -> None:
+        self.activity_seconds = activity_seconds
+        self.hidden_events = 0
+        self.last_visible = time.monotonic()
+
+    def __call__(self, name: str, line: str) -> str | None:
+        if name != "stdout":
+            return line
+        try:
+            event = json.loads(line)
+        except json.JSONDecodeError:
+            return self.visible(line)
+        event_type = event.get("type")
+        if event_type == "thread.started":
+            return self.visible(f"codex thread: {event.get('thread_id', '<unknown>')}\n")
+        if event_type == "turn.started":
+            return self.visible("codex turn started\n")
+        if event_type == "turn.completed":
+            usage = event.get("usage")
+            message = format_codex_usage(usage) + "\n" if isinstance(usage, dict) else "codex turn completed\n"
+            return self.visible(self.flush_hidden() + message)
+        item = event.get("item")
+        if isinstance(item, dict) and item.get("type") == "agent_message" and isinstance(item.get("text"), str):
+            return self.visible(self.flush_hidden() + item["text"].rstrip() + "\n")
+        return self.hidden_activity()
+
+    def hidden_activity(self) -> str | None:
+        self.hidden_events += 1
+        if time.monotonic() - self.last_visible < self.activity_seconds:
+            return None
+        return self.visible(self.flush_hidden())
+
+    def flush_hidden(self) -> str:
+        if not self.hidden_events:
+            return ""
+        count = self.hidden_events
+        self.hidden_events = 0
+        return f"codex activity: {count} hidden tool/status events\n"
+
+    def visible(self, text: str) -> str:
+        self.last_visible = time.monotonic()
+        return text
+
+
+def format_codex_usage(usage: dict[str, Any]) -> str:
+    fields = [
+        "input_tokens",
+        "cached_input_tokens",
+        "output_tokens",
+        "reasoning_output_tokens",
+    ]
+    parts = [f"{field}={usage[field]}" for field in fields if isinstance(usage.get(field), int)]
+    return "codex usage: " + " ".join(parts) if parts else "codex usage: unavailable"
+
+
 def claude_allowed_tools(args: argparse.Namespace) -> str:
    tools = [tool.strip() for tool in args.claude_allowed_tools.split(",") if tool.strip()]
    if not args.web_search:
@@ -673,6 +829,12 @@ def parse_args() -> argparse.Namespace:
    parser.add_argument("--dataset", action="append", help="Extra evidence file to include in the review bundle.")
    parser.add_argument("--output", help="Write human output to a file as well as stdout.")
    parser.add_argument("--json-output", help="Write validated structured review JSON.")
+    parser.add_argument(
+        "--stream-engine-output",
+        action="store_true",
+        default=os.environ.get("AUTOREVIEW_STREAM_ENGINE_OUTPUT") == "1",
+        help="Stream review engine output while preserving buffered output for validation. Codex output is filtered to hide tool/file chatter.",
+    )
    parser.add_argument("--parallel-tests", help="Run a test command concurrently with review; failure fails the helper.")
    parser.add_argument("--require-finding", action="append", default=[], help="Require finding text to contain this substring.")
    parser.add_argument("--expect-findings", action="store_true", help="Treat findings as success; for harness acceptance tests.")