From bfc08897767667f435b08c5ec7c5f6c06dd05d4d Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Fri, 10 Apr 2026 13:52:24 +0100 Subject: [PATCH] docs: document Codex harness plugin workflow --- CHANGELOG.md | 1 + .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- docs/concepts/model-providers.md | 4 + docs/docs.json | 1 + docs/help/testing.md | 46 ++ docs/plugins/sdk-agent-harness.md | 167 ++++++ docs/plugins/sdk-overview.md | 30 +- docs/plugins/sdk-provider-plugins.md | 7 + docs/plugins/sdk-runtime.md | 10 +- package.json | 7 + scripts/lib/plugin-sdk-entrypoints.json | 1 + scripts/test-live-codex-harness-docker.sh | 107 ++++ .../gateway-codex-harness.live.test.ts | 477 ++++++++++++++++++ 13 files changed, 844 insertions(+), 18 deletions(-) create mode 100644 docs/plugins/sdk-agent-harness.md create mode 100644 scripts/test-live-codex-harness-docker.sh create mode 100644 src/gateway/gateway-codex-harness.live.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 014e917a4b5..e110669d4e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ Docs: https://docs.openclaw.ai - Gateway: split startup and runtime seams so gateway lifecycle sequencing, reload state, and shutdown behavior stay easier to maintain without changing observed behavior. (#63975) Thanks @gumadeiras. - Matrix/partial streaming: add MSC4357 live markers to draft preview sends and edits so supporting Matrix clients can render a live/typewriter animation and stop it when the final edit lands. (#63513) Thanks @TigerInYourDream. - QA/Telegram: add a live `openclaw qa telegram` lane for private-group bot-to-bot checks, harden its artifact handling, and preserve native Telegram command reply threading for QA verification. (#64303) Thanks @obviyus. +- Models/Codex: add the bundled Codex provider and plugin-owned app-server harness so `codex/gpt-*` models use Codex-managed auth, native threads, model discovery, and compaction while `openai/gpt-*` stays on the normal OpenAI provider path. 
### Fixes diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index 130d44d63d6..ba7e736b4db 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -8a754603a721c3816772a0a4f44b83a19547163ad52ff1b9f18eaaeeaf0de6d4 plugin-sdk-api-baseline.json -40539cf99459c77dab1d6e72af2e3f05cafb77f2c49af5fa210c78dc231592b4 plugin-sdk-api-baseline.jsonl +cceabd98fbc368e04aba61e1d3712fe0f86749dc3872e4b9ba057784e29a8559 plugin-sdk-api-baseline.json +de31f5f77bda7163bed395b9669861cb64c514cf1666324e406b7882d84dee7c plugin-sdk-api-baseline.jsonl diff --git a/docs/concepts/model-providers.md b/docs/concepts/model-providers.md index ca712a65064..d6425205a67 100644 --- a/docs/concepts/model-providers.md +++ b/docs/concepts/model-providers.md @@ -50,6 +50,10 @@ For model selection rules, see [/concepts/models](/concepts/models). family, transcript/tooling quirks, transport/cache hints). It is not the same as the [public capability model](/plugins/architecture#public-capability-model) which describes what a plugin registers (text inference, speech, etc.). +- The bundled `codex` provider is paired with the bundled Codex agent harness. + Use `codex/gpt-*` when you want Codex-owned login, model discovery, native + thread resume, and app-server execution. Plain `openai/gpt-*` refs continue + to use the OpenAI provider and the normal OpenClaw provider transport. 
## Plugin-owned provider behavior diff --git a/docs/docs.json b/docs/docs.json index 881d729c24f..9d5b587cb09 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -1130,6 +1130,7 @@ "plugins/sdk-overview", "plugins/sdk-entrypoints", "plugins/sdk-runtime", + "plugins/sdk-agent-harness", "plugins/sdk-setup", "plugins/sdk-testing", "plugins/manifest", diff --git a/docs/help/testing.md b/docs/help/testing.md index 072fa368f36..4caaa478e00 100644 --- a/docs/help/testing.md +++ b/docs/help/testing.md @@ -390,6 +390,51 @@ Docker notes: - It sources `~/.profile`, stages the matching CLI auth material into the container, installs `acpx` into a writable npm prefix, then installs the requested live CLI (`@anthropic-ai/claude-code`, `@openai/codex`, or `@google/gemini-cli`) if missing. - Inside Docker, the runner sets `OPENCLAW_LIVE_ACP_BIND_ACPX_COMMAND=$HOME/.npm-global/bin/acpx` so acpx keeps provider env vars from the sourced profile available to the child harness CLI. +## Live: Codex app-server harness smoke + +- Goal: validate the plugin-owned Codex harness through the normal gateway + `agent` method: + - load the bundled `codex` plugin + - select `OPENCLAW_AGENT_RUNTIME=codex` + - send a first gateway agent turn to `codex/gpt-5.4` + - send a second turn to the same OpenClaw session and verify the app-server + thread can resume +- Test: `src/gateway/gateway-codex-harness.live.test.ts` +- Enable: `OPENCLAW_LIVE_CODEX_HARNESS=1` +- Default model: `codex/gpt-5.4` +- Optional image probe: `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1` +- Optional MCP/tool probe: `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1` +- Auth: `OPENAI_API_KEY` from the shell/profile, plus optional copied + `~/.codex/auth.json` and `~/.codex/config.toml` + +Local recipe: + +```bash +source ~/.profile +OPENCLAW_LIVE_CODEX_HARNESS=1 \ + OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=1 \ + OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=1 \ + OPENCLAW_LIVE_CODEX_HARNESS_MODEL=codex/gpt-5.4 \ + pnpm test:live -- 
src/gateway/gateway-codex-harness.live.test.ts +``` + +Docker recipe: + +```bash +source ~/.profile +pnpm test:docker:live-codex-harness +``` + +Docker notes: + +- The Docker runner lives at `scripts/test-live-codex-harness-docker.sh`. +- It sources the mounted `~/.profile`, passes `OPENAI_API_KEY`, copies Codex CLI + auth files when present, installs `@openai/codex` into a writable mounted npm + prefix, stages the source tree, then runs only the Codex-harness live test. +- Docker enables the image and MCP/tool probes by default. Set + `OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE=0` or + `OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE=0` when you need a narrower debug run. + ### Recommended live recipes Narrow, explicit allowlists are fastest and least flaky: @@ -618,6 +663,7 @@ The live-model Docker runners also bind-mount only the needed CLI auth homes (or - Direct models: `pnpm test:docker:live-models` (script: `scripts/test-live-models-docker.sh`) - ACP bind smoke: `pnpm test:docker:live-acp-bind` (script: `scripts/test-live-acp-bind-docker.sh`) - CLI backend smoke: `pnpm test:docker:live-cli-backend` (script: `scripts/test-live-cli-backend-docker.sh`) +- Codex app-server harness smoke: `pnpm test:docker:live-codex-harness` (script: `scripts/test-live-codex-harness-docker.sh`) - Gateway + dev agent: `pnpm test:docker:live-gateway` (script: `scripts/test-live-gateway-models-docker.sh`) - Open WebUI live smoke: `pnpm test:docker:openwebui` (script: `scripts/e2e/openwebui-docker.sh`) - Onboarding wizard (TTY, full scaffolding): `pnpm test:docker:onboard` (script: `scripts/e2e/onboard-docker.sh`) diff --git a/docs/plugins/sdk-agent-harness.md b/docs/plugins/sdk-agent-harness.md new file mode 100644 index 00000000000..f914a3bc2a1 --- /dev/null +++ b/docs/plugins/sdk-agent-harness.md @@ -0,0 +1,167 @@ +--- +title: "Agent Harness Plugins" +sidebarTitle: "Agent Harness" +summary: "Experimental SDK surface for plugins that replace the low level embedded agent executor" +read_when: + 
- You are changing the embedded agent runtime or harness registry + - You are registering an agent harness from a bundled or trusted plugin + - You need to understand how the Codex plugin relates to model providers +--- + +# Agent Harness Plugins + +An **agent harness** is the low level executor for one prepared OpenClaw agent +turn. It is not a model provider, not a channel, and not a tool registry. + +Use this surface only for bundled or trusted native plugins. The contract is +still experimental because the parameter types intentionally mirror the current +embedded runner. + +## When to use a harness + +Register an agent harness when a model family has its own native session +runtime and the normal OpenClaw provider transport is the wrong abstraction. + +Examples: + +- a native coding-agent server that owns threads and compaction +- a local CLI or daemon that must stream native plan/reasoning/tool events +- a model runtime that needs its own resume id in addition to the OpenClaw + session transcript + +Do **not** register a harness just to add a new LLM API. For normal HTTP or +WebSocket model APIs, build a [provider plugin](/plugins/sdk-provider-plugins). + +## What core still owns + +Before a harness is selected, OpenClaw has already resolved: + +- provider and model +- runtime auth state +- thinking level and context budget +- the OpenClaw transcript/session file +- workspace, sandbox, and tool policy +- channel reply callbacks and streaming callbacks +- model fallback and live model switching policy + +That split is intentional. A harness runs a prepared attempt; it does not pick +providers, replace channel delivery, or silently switch models. 
+ +## Register a harness + +**Import:** `openclaw/plugin-sdk/agent-harness` + +```typescript +import type { AgentHarness } from "openclaw/plugin-sdk/agent-harness"; +import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry"; + +const myHarness: AgentHarness = { + id: "my-harness", + label: "My native agent harness", + + supports(ctx) { + return ctx.provider === "my-provider" + ? { supported: true, priority: 100 } + : { supported: false }; + }, + + async runAttempt(params) { + // Start or resume your native thread. + // Use params.prompt, params.tools, params.images, params.onPartialReply, + // params.onAgentEvent, and the other prepared attempt fields. + return await runMyNativeTurn(params); + }, +}; + +export default definePluginEntry({ + id: "my-native-agent", + name: "My Native Agent", + description: "Runs selected models through a native agent daemon.", + register(api) { + api.registerAgentHarness(myHarness); + }, +}); +``` + +## Selection policy + +OpenClaw chooses a harness after provider/model resolution: + +1. `OPENCLAW_AGENT_RUNTIME=<id>` forces a registered harness with that id. +2. `OPENCLAW_AGENT_RUNTIME=pi` forces the built-in PI harness. +3. `OPENCLAW_AGENT_RUNTIME=auto` asks registered harnesses if they support the + resolved provider/model. +4. If no registered harness matches, OpenClaw uses PI. + +Forced plugin harness failures surface as run failures. In `auto` mode, +OpenClaw may fall back to PI when the selected plugin harness fails before a +turn has produced side effects. + +The bundled Codex plugin registers `codex` as its harness id. For compatibility, +`codex-app-server` and `app-server` also resolve to that same harness when you +set `OPENCLAW_AGENT_RUNTIME` manually. + +## Provider plus harness pairing + +Most harnesses should also register a provider. The provider makes model refs, +auth status, model metadata, and `/model` selection visible to the rest of +OpenClaw. The harness then claims that provider in `supports(...)`. 
+ +The bundled Codex plugin follows this pattern: + +- provider id: `codex` +- user model refs: `codex/gpt-5.4`, `codex/gpt-5.2`, or another model returned + by the Codex app server +- harness id: `codex` +- auth: synthetic provider availability, because the Codex harness owns the + native Codex login/session +- app-server request: OpenClaw sends the bare model id to Codex and lets the + harness talk to the native app-server protocol + +The Codex plugin is additive. Plain `openai/gpt-*` refs remain OpenAI provider +refs and continue to use the normal OpenClaw provider path. Select `codex/gpt-*` +when you want Codex-managed auth, Codex model discovery, native threads, and +Codex app-server execution. `/model` can switch among the Codex models returned +by the Codex app server without requiring OpenAI provider credentials. + +## Native sessions and transcript mirror + +A harness may keep a native session id, thread id, or daemon-side resume token. +Keep that binding explicitly associated with the OpenClaw session, and keep +mirroring user-visible assistant/tool output into the OpenClaw transcript. + +The OpenClaw transcript remains the compatibility layer for: + +- channel-visible session history +- transcript search and indexing +- switching back to the built-in PI harness on a later turn +- generic `/new`, `/reset`, and session deletion behavior + +If your harness stores a sidecar binding, implement `reset(...)` so OpenClaw can +clear it when the owning OpenClaw session is reset. + +## Tool and media results + +Core constructs the OpenClaw tool list and passes it into the prepared attempt. +When a harness executes a dynamic tool call, return the tool result back through +the harness result shape instead of sending channel media yourself. + +This keeps text, image, video, music, TTS, approval, and messaging-tool outputs +on the same delivery path as PI-backed runs. 
+ +## Current limitations + +- The public import path is generic, but some attempt/result type aliases still + carry `Pi` names for compatibility. +- Third-party harness installation is experimental. Prefer provider plugins + until you need a native session runtime. +- Harness switching is supported across turns. Do not switch harnesses in the + middle of a turn after native tools, approvals, assistant text, or message + sends have started. + +## Related + +- [SDK Overview](/plugins/sdk-overview) +- [Runtime Helpers](/plugins/sdk-runtime) +- [Provider Plugins](/plugins/sdk-provider-plugins) +- [Model Providers](/concepts/model-providers) diff --git a/docs/plugins/sdk-overview.md b/docs/plugins/sdk-overview.md index ace11b48d16..512d1cbc5e4 100644 --- a/docs/plugins/sdk-overview.md +++ b/docs/plugins/sdk-overview.md @@ -219,6 +219,7 @@ explicitly promotes one as public. | `plugin-sdk/models-provider-runtime` | `/models` command/provider reply helpers | | `plugin-sdk/skill-commands-runtime` | Skill command listing helpers | | `plugin-sdk/native-command-registry` | Native command registry/build/serialize helpers | + | `plugin-sdk/agent-harness` | Experimental trusted-plugin surface for low-level agent harnesses: harness types, active-run steer/abort helpers, OpenClaw tool bridge helpers, and attempt result utilities | | `plugin-sdk/provider-zai-endpoint` | Z.AI endpoint detection helpers | | `plugin-sdk/infra-runtime` | System event/heartbeat helpers | | `plugin-sdk/collection-runtime` | Small bounded cache helpers | @@ -302,20 +303,21 @@ methods: ### Capability registration -| Method | What it registers | -| ------------------------------------------------ | -------------------------------- | -| `api.registerProvider(...)` | Text inference (LLM) | -| `api.registerCliBackend(...)` | Local CLI inference backend | -| `api.registerChannel(...)` | Messaging channel | -| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis | -| 
`api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription | -| `api.registerRealtimeVoiceProvider(...)` | Duplex realtime voice sessions | -| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis | -| `api.registerImageGenerationProvider(...)` | Image generation | -| `api.registerMusicGenerationProvider(...)` | Music generation | -| `api.registerVideoGenerationProvider(...)` | Video generation | -| `api.registerWebFetchProvider(...)` | Web fetch / scrape provider | -| `api.registerWebSearchProvider(...)` | Web search | +| Method | What it registers | +| ------------------------------------------------ | ------------------------------------- | +| `api.registerProvider(...)` | Text inference (LLM) | +| `api.registerAgentHarness(...)` | Experimental low-level agent executor | +| `api.registerCliBackend(...)` | Local CLI inference backend | +| `api.registerChannel(...)` | Messaging channel | +| `api.registerSpeechProvider(...)` | Text-to-speech / STT synthesis | +| `api.registerRealtimeTranscriptionProvider(...)` | Streaming realtime transcription | +| `api.registerRealtimeVoiceProvider(...)` | Duplex realtime voice sessions | +| `api.registerMediaUnderstandingProvider(...)` | Image/audio/video analysis | +| `api.registerImageGenerationProvider(...)` | Image generation | +| `api.registerMusicGenerationProvider(...)` | Music generation | +| `api.registerVideoGenerationProvider(...)` | Video generation | +| `api.registerWebFetchProvider(...)` | Web fetch / scrape provider | +| `api.registerWebSearchProvider(...)` | Web search | ### Tools and commands diff --git a/docs/plugins/sdk-provider-plugins.md b/docs/plugins/sdk-provider-plugins.md index d76c741b50d..3f8dc76c72f 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -20,6 +20,13 @@ API key auth, and dynamic model resolution. structure and manifest setup. + + Provider plugins add models to OpenClaw's normal inference loop. 
If the model + must run through a native agent daemon that owns threads, compaction, or tool + events, pair the provider with an [agent harness](/plugins/sdk-agent-harness) + instead of putting daemon protocol details in core. + + ## Walkthrough diff --git a/docs/plugins/sdk-runtime.md b/docs/plugins/sdk-runtime.md index e0a731976e4..94105634ad8 100644 --- a/docs/plugins/sdk-runtime.md +++ b/docs/plugins/sdk-runtime.md @@ -50,9 +50,9 @@ const timeoutMs = api.runtime.agent.resolveAgentTimeoutMs(cfg); // Ensure workspace exists await api.runtime.agent.ensureAgentWorkspace(cfg); -// Run an embedded Pi agent +// Run an embedded agent turn const agentDir = api.runtime.agent.resolveAgentDir(cfg); -const result = await api.runtime.agent.runEmbeddedPiAgent({ +const result = await api.runtime.agent.runEmbeddedAgent({ sessionId: "my-plugin:task-1", runId: crypto.randomUUID(), sessionFile: path.join(agentDir, "sessions", "my-plugin-task-1.jsonl"), @@ -62,6 +62,12 @@ const result = await api.runtime.agent.runEmbeddedPiAgent({ }); ``` +`runEmbeddedAgent(...)` is the neutral helper for starting a normal OpenClaw +agent turn from plugin code. It uses the same provider/model resolution and +agent-harness selection as channel-triggered replies. + +`runEmbeddedPiAgent(...)` remains as a compatibility alias. 
+ **Session store helpers** are under `api.runtime.agent.session`: ```typescript diff --git a/package.json b/package.json index a9986f98987..7994cd5b44e 100644 --- a/package.json +++ b/package.json @@ -317,6 +317,10 @@ "types": "./dist/plugin-sdk/cli-backend.d.ts", "default": "./dist/plugin-sdk/cli-backend.js" }, + "./plugin-sdk/agent-harness": { + "types": "./dist/plugin-sdk/agent-harness.d.ts", + "default": "./dist/plugin-sdk/agent-harness.js" + }, "./plugin-sdk/hook-runtime": { "types": "./dist/plugin-sdk/hook-runtime.d.ts", "default": "./dist/plugin-sdk/hook-runtime.js" @@ -1099,6 +1103,7 @@ "check:import-cycles": "node --import tsx scripts/check-import-cycles.ts", "check:loc": "node --import tsx scripts/check-ts-max-loc.ts --max 500", "check:no-conflict-markers": "node scripts/check-no-conflict-markers.mjs", + "codex-app-server:protocol:check": "node --import tsx scripts/check-codex-app-server-protocol.ts", "config:channels:check": "node --import tsx scripts/generate-bundled-channel-config-metadata.ts --check", "config:channels:gen": "node --import tsx scripts/generate-bundled-channel-config-metadata.ts --write", "config:docs:check": "node --import tsx scripts/generate-config-doc-baseline.ts --check", @@ -1239,6 +1244,7 @@ "test:docker:live-cli-backend:claude-subscription": "OPENCLAW_LIVE_CLI_BACKEND_AUTH=subscription OPENCLAW_LIVE_CLI_BACKEND_MODEL=claude-cli/claude-sonnet-4-6 OPENCLAW_LIVE_CLI_BACKEND_DISABLE_MCP_CONFIG=1 OPENCLAW_LIVE_CLI_BACKEND_MODEL_SWITCH_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_RESUME_PROBE=1 OPENCLAW_LIVE_CLI_BACKEND_IMAGE_PROBE=0 OPENCLAW_LIVE_CLI_BACKEND_MCP_PROBE=0 bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-cli-backend:codex": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=codex-cli/gpt-5.4 bash scripts/test-live-cli-backend-docker.sh", "test:docker:live-cli-backend:gemini": "OPENCLAW_LIVE_CLI_BACKEND_MODEL=google-gemini-cli/gemini-3-flash-preview bash scripts/test-live-cli-backend-docker.sh", + "test:docker:live-codex-harness": 
"bash scripts/test-live-codex-harness-docker.sh", "test:docker:live-gateway": "bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-gateway:claude": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=claude-cli OPENCLAW_LIVE_GATEWAY_MODELS=claude-cli/claude-sonnet-4-6 bash scripts/test-live-gateway-models-docker.sh", "test:docker:live-gateway:codex": "OPENCLAW_LIVE_GATEWAY_PROVIDERS=codex-cli OPENCLAW_LIVE_GATEWAY_MODELS=codex-cli/gpt-5.4 bash scripts/test-live-gateway-models-docker.sh", @@ -1271,6 +1277,7 @@ "test:install:smoke": "bash scripts/test-install-sh-docker.sh", "test:live": "node scripts/test-live.mjs", "test:live:cache": "bun scripts/check-live-cache.ts", + "test:live:codex-harness": "OPENCLAW_LIVE_CODEX_HARNESS=1 node scripts/test-live.mjs -- src/gateway/gateway-codex-harness.live.test.ts", "test:live:gateway-profiles": "node scripts/test-live.mjs -- src/gateway/gateway-models.profiles.live.test.ts", "test:live:media": "node --import tsx scripts/test-live-media.ts", "test:live:media:image": "node --import tsx scripts/test-live-media.ts image", diff --git a/scripts/lib/plugin-sdk-entrypoints.json b/scripts/lib/plugin-sdk-entrypoints.json index 55f7cd3dfd9..7047c3cac44 100644 --- a/scripts/lib/plugin-sdk-entrypoints.json +++ b/scripts/lib/plugin-sdk-entrypoints.json @@ -68,6 +68,7 @@ "github-copilot-token", "cli-runtime", "cli-backend", + "agent-harness", "hook-runtime", "host-runtime", "process-runtime", diff --git a/scripts/test-live-codex-harness-docker.sh b/scripts/test-live-codex-harness-docker.sh new file mode 100644 index 00000000000..e951c958149 --- /dev/null +++ b/scripts/test-live-codex-harness-docker.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +source "$ROOT_DIR/scripts/lib/live-docker-auth.sh" +IMAGE_NAME="${OPENCLAW_IMAGE:-openclaw:local}" +LIVE_IMAGE_NAME="${OPENCLAW_LIVE_IMAGE:-${IMAGE_NAME}-live}" +CONFIG_DIR="${OPENCLAW_CONFIG_DIR:-$HOME/.openclaw}" +WORKSPACE_DIR="${OPENCLAW_WORKSPACE_DIR:-$HOME/.openclaw/workspace}" +PROFILE_FILE="${OPENCLAW_PROFILE_FILE:-$HOME/.profile}" +CLI_TOOLS_DIR="${OPENCLAW_DOCKER_CLI_TOOLS_DIR:-$HOME/.cache/openclaw/docker-cli-tools}" + +mkdir -p "$CLI_TOOLS_DIR" + +PROFILE_MOUNT=() +if [[ -f "$PROFILE_FILE" ]]; then + PROFILE_MOUNT=(-v "$PROFILE_FILE":/home/node/.profile:ro) +fi + +AUTH_FILES=() +while IFS= read -r auth_file; do + [[ -n "$auth_file" ]] || continue + AUTH_FILES+=("$auth_file") +done < <(openclaw_live_collect_auth_files_from_csv "openai-codex") + +AUTH_FILES_CSV="" +if ((${#AUTH_FILES[@]} > 0)); then + AUTH_FILES_CSV="$(openclaw_live_join_csv "${AUTH_FILES[@]}")" +fi + +EXTERNAL_AUTH_MOUNTS=() +if ((${#AUTH_FILES[@]} > 0)); then + for auth_file in "${AUTH_FILES[@]}"; do + host_path="$HOME/$auth_file" + if [[ -f "$host_path" ]]; then + EXTERNAL_AUTH_MOUNTS+=(-v "$host_path":/host-auth-files/"$auth_file":ro) + fi + done +fi + +read -r -d '' LIVE_TEST_CMD <<'EOF' || true +set -euo pipefail +[ -f "$HOME/.profile" ] && source "$HOME/.profile" || true +export PATH="$HOME/.npm-global/bin:$PATH" +IFS=',' read -r -a auth_files <<<"${OPENCLAW_DOCKER_AUTH_FILES_RESOLVED:-}" +if ((${#auth_files[@]} > 0)); then + for auth_file in "${auth_files[@]}"; do + [ -n "$auth_file" ] || continue + if [ -f "/host-auth-files/$auth_file" ]; then + mkdir -p "$(dirname "$HOME/$auth_file")" + cp "/host-auth-files/$auth_file" "$HOME/$auth_file" + chmod u+rw "$HOME/$auth_file" || true + fi + done +fi +if [ ! 
-x "$HOME/.npm-global/bin/codex" ]; then + npm_config_prefix="$HOME/.npm-global" npm install -g @openai/codex +fi +tmp_dir="$(mktemp -d)" +cleanup() { + rm -rf "$tmp_dir" +} +trap cleanup EXIT +source /src/scripts/lib/live-docker-stage.sh +openclaw_live_stage_source_tree "$tmp_dir" +mkdir -p "$tmp_dir/node_modules" +cp -aRs /app/node_modules/. "$tmp_dir/node_modules" +rm -rf "$tmp_dir/node_modules/.vite-temp" +mkdir -p "$tmp_dir/node_modules/.vite-temp" +openclaw_live_link_runtime_tree "$tmp_dir" +openclaw_live_stage_state_dir "$tmp_dir/.openclaw-state" +openclaw_live_prepare_staged_config +cd "$tmp_dir" +pnpm test:live src/gateway/gateway-codex-harness.live.test.ts +EOF + +"$ROOT_DIR/scripts/test-live-build-docker.sh" + +echo "==> Run Codex harness live test in Docker" +echo "==> Model: ${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.4}" +echo "==> Image probe: ${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" +echo "==> MCP probe: ${OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE:-1}" +echo "==> Auth files: ${AUTH_FILES_CSV:-none}" +docker run --rm -t \ + -u node \ + --entrypoint bash \ + -e COREPACK_ENABLE_DOWNLOAD_PROMPT=0 \ + -e HOME=/home/node \ + -e NODE_OPTIONS=--disable-warning=ExperimentalWarning \ + -e OPENAI_API_KEY \ + -e OPENCLAW_CODEX_APP_SERVER_BIN="${OPENCLAW_CODEX_APP_SERVER_BIN:-codex}" \ + -e OPENCLAW_DOCKER_AUTH_FILES_RESOLVED="$AUTH_FILES_CSV" \ + -e OPENCLAW_LIVE_CODEX_HARNESS=1 \ + -e OPENCLAW_LIVE_CODEX_HARNESS_DEBUG="${OPENCLAW_LIVE_CODEX_HARNESS_DEBUG:-}" \ + -e OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" \ + -e OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE:-1}" \ + -e OPENCLAW_LIVE_CODEX_HARNESS_MODEL="${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.4}" \ + -e OPENCLAW_LIVE_TEST=1 \ + -e OPENCLAW_VITEST_FS_MODULE_CACHE=0 \ + -v "$ROOT_DIR":/src:ro \ + -v "$CONFIG_DIR":/home/node/.openclaw \ + -v "$WORKSPACE_DIR":/home/node/.openclaw/workspace \ + -v 
"$CLI_TOOLS_DIR":/home/node/.npm-global \ + "${EXTERNAL_AUTH_MOUNTS[@]}" \ + "${PROFILE_MOUNT[@]}" \ + "$LIVE_IMAGE_NAME" \ + -lc "$LIVE_TEST_CMD" diff --git a/src/gateway/gateway-codex-harness.live.test.ts b/src/gateway/gateway-codex-harness.live.test.ts new file mode 100644 index 00000000000..5be06451ada --- /dev/null +++ b/src/gateway/gateway-codex-harness.live.test.ts @@ -0,0 +1,477 @@ +import { randomBytes, randomUUID } from "node:crypto"; +import fs from "node:fs/promises"; +import { createServer } from "node:net"; +import os from "node:os"; +import path from "node:path"; +import { describe, expect, it } from "vitest"; +import { isLiveTestEnabled } from "../agents/live-test-helpers.js"; +import type { OpenClawConfig } from "../config/config.js"; +import type { DeviceIdentity } from "../infra/device-identity.js"; +import { isTruthyEnvValue } from "../infra/env.js"; +import type { GatewayClient } from "./client.js"; +import { + assertCronJobMatches, + assertCronJobVisibleViaCli, + assertLiveImageProbeReply, + buildLiveCronProbeMessage, + createLiveCronProbeSpec, + runOpenClawCliJson, + type CronListJob, +} from "./live-agent-probes.js"; +import { renderCatFacePngBase64 } from "./live-image-probe.js"; + +const LIVE = isLiveTestEnabled(); +const CODEX_HARNESS_LIVE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_CODEX_HARNESS); +const CODEX_HARNESS_DEBUG = isTruthyEnvValue(process.env.OPENCLAW_LIVE_CODEX_HARNESS_DEBUG); +const CODEX_HARNESS_IMAGE_PROBE = isTruthyEnvValue( + process.env.OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE, +); +const CODEX_HARNESS_MCP_PROBE = isTruthyEnvValue(process.env.OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE); +const describeLive = LIVE && CODEX_HARNESS_LIVE ? describe : describe.skip; +const describeDisabled = LIVE && !CODEX_HARNESS_LIVE ? 
describe : describe.skip; +const CODEX_HARNESS_TIMEOUT_MS = 420_000; +const DEFAULT_CODEX_MODEL = "codex/gpt-5.4"; +const GATEWAY_CONNECT_TIMEOUT_MS = 60_000; + +type EnvSnapshot = { + agentRuntime?: string; + configPath?: string; + gatewayToken?: string; + openaiApiKey?: string; + skipBrowserControl?: string; + skipCanvas?: string; + skipChannels?: string; + skipCron?: string; + skipGmail?: string; + stateDir?: string; +}; + +function logCodexLiveStep(step: string, details?: Record): void { + if (!CODEX_HARNESS_DEBUG) { + return; + } + const suffix = details && Object.keys(details).length > 0 ? ` ${JSON.stringify(details)}` : ""; + console.error(`[gateway-codex-live] ${step}${suffix}`); +} + +function snapshotEnv(): EnvSnapshot { + return { + agentRuntime: process.env.OPENCLAW_AGENT_RUNTIME, + configPath: process.env.OPENCLAW_CONFIG_PATH, + gatewayToken: process.env.OPENCLAW_GATEWAY_TOKEN, + openaiApiKey: process.env.OPENAI_API_KEY, + skipBrowserControl: process.env.OPENCLAW_SKIP_BROWSER_CONTROL_SERVER, + skipCanvas: process.env.OPENCLAW_SKIP_CANVAS_HOST, + skipChannels: process.env.OPENCLAW_SKIP_CHANNELS, + skipCron: process.env.OPENCLAW_SKIP_CRON, + skipGmail: process.env.OPENCLAW_SKIP_GMAIL_WATCHER, + stateDir: process.env.OPENCLAW_STATE_DIR, + }; +} + +function restoreEnv(snapshot: EnvSnapshot): void { + restoreEnvVar("OPENCLAW_AGENT_RUNTIME", snapshot.agentRuntime); + restoreEnvVar("OPENCLAW_CONFIG_PATH", snapshot.configPath); + restoreEnvVar("OPENCLAW_GATEWAY_TOKEN", snapshot.gatewayToken); + restoreEnvVar("OPENAI_API_KEY", snapshot.openaiApiKey); + restoreEnvVar("OPENCLAW_SKIP_BROWSER_CONTROL_SERVER", snapshot.skipBrowserControl); + restoreEnvVar("OPENCLAW_SKIP_CANVAS_HOST", snapshot.skipCanvas); + restoreEnvVar("OPENCLAW_SKIP_CHANNELS", snapshot.skipChannels); + restoreEnvVar("OPENCLAW_SKIP_CRON", snapshot.skipCron); + restoreEnvVar("OPENCLAW_SKIP_GMAIL_WATCHER", snapshot.skipGmail); + restoreEnvVar("OPENCLAW_STATE_DIR", snapshot.stateDir); +} + +function 
restoreEnvVar(name: string, value: string | undefined): void {
  if (value === undefined) {
    delete process.env[name];
    return;
  }
  process.env[name] = value;
}

/**
 * Finds a TCP port that was free at the time of the call.
 *
 * Binds a throwaway server to port 0 on 127.0.0.1, reads back the port that
 * was assigned, and closes the server again before returning.
 *
 * @returns The port number that was assigned to the throwaway server.
 * @throws If listening or closing fails, or if no positive port was reported.
 */
async function getFreeGatewayPort(): Promise<number> {
  const server = createServer();
  await new Promise<void>((resolve, reject) => {
    server.once("error", reject);
    server.listen(0, "127.0.0.1", resolve);
  });
  const address = server.address();
  // `address()` may be a string or null; only the object form carries a port.
  const port = typeof address === "object" && address ? address.port : 0;
  await new Promise<void>((resolve, reject) => {
    server.close((error) => (error ? reject(error) : resolve()));
  });
  if (port <= 0) {
    throw new Error("failed to allocate gateway port");
  }
  return port;
}

/**
 * Makes sure the local device identity is paired with the `operator.admin`
 * scope before the test client connects.
 *
 * If the stored pairing already matches the identity's public key and holds
 * every required scope, the identity is returned as-is. Otherwise a silent
 * pairing request is created and approved immediately.
 *
 * @returns The loaded (or newly created) device identity.
 * @throws If the pairing approval does not come back with status "approved".
 */
async function ensurePairedTestGatewayClientIdentity(): Promise<DeviceIdentity> {
  const { loadOrCreateDeviceIdentity, publicKeyRawBase64UrlFromPem } =
    await import("../infra/device-identity.js");
  const { approveDevicePairing, getPairedDevice, requestDevicePairing } =
    await import("../infra/device-pairing.js");
  const { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } =
    await import("../utils/message-channel.js");
  const identity = loadOrCreateDeviceIdentity();
  const publicKey = publicKeyRawBase64UrlFromPem(identity.publicKeyPem);
  const requiredScopes = ["operator.admin"];
  const paired = await getPairedDevice(identity.deviceId);
  // Prefer `approvedScopes`, fall back to `scopes`, and finally to no scopes.
  const pairedScopes = Array.isArray(paired?.approvedScopes)
    ? paired.approvedScopes
    : Array.isArray(paired?.scopes)
      ? paired.scopes
      : [];
  if (
    paired?.publicKey === publicKey &&
    requiredScopes.every((scope) => pairedScopes.includes(scope))
  ) {
    return identity;
  }
  const pairing = await requestDevicePairing({
    deviceId: identity.deviceId,
    publicKey,
    displayName: "vitest-codex-harness-live",
    platform: process.platform,
    clientId: GATEWAY_CLIENT_NAMES.TEST,
    clientMode: GATEWAY_CLIENT_MODES.TEST,
    role: "operator",
    scopes: requiredScopes,
    silent: true,
  });
  const approved = await approveDevicePairing(pairing.request.requestId, {
    callerScopes: requiredScopes,
  });
  if (approved?.status !== "approved") {
    throw new Error(`failed to pre-pair live test device: ${approved?.status ?? "missing"}`);
  }
  return identity;
}

/**
 * Starts a `GatewayClient` in test mode and resolves once it reports
 * `onHelloOk`.
 *
 * The promise rejects on a connect error, on a close before hello, or when
 * `GATEWAY_CONNECT_TIMEOUT_MS` elapses first. Only the first outcome wins.
 *
 * @param params.deviceIdentity Identity presented by the client.
 * @param params.token Gateway auth token.
 * @param params.url Gateway WebSocket URL.
 * @returns The connected client.
 */
async function connectTestGatewayClient(params: {
  deviceIdentity: DeviceIdentity;
  token: string;
  url: string;
}): Promise<GatewayClient> {
  const { GatewayClient } = await import("./client.js");
  const { GATEWAY_CLIENT_MODES, GATEWAY_CLIENT_NAMES } =
    await import("../utils/message-channel.js");
  return await new Promise<GatewayClient>((resolve, reject) => {
    let done = false;
    let client: GatewayClient | undefined;
    const connectTimeout = setTimeout(() => {
      finish({ error: new Error("gateway connect timeout") });
    }, GATEWAY_CONNECT_TIMEOUT_MS);
    // Do not let the pending timer keep the process alive on its own.
    connectTimeout.unref();

    // Settles the outer promise exactly once; later callbacks are ignored.
    function finish(result: { client?: GatewayClient; error?: Error }): void {
      if (done) {
        return;
      }
      done = true;
      clearTimeout(connectTimeout);
      if (result.error) {
        if (client) {
          // Best-effort shutdown: the connect error is what the caller needs
          // to see, so a failure while stopping is deliberately ignored.
          void client.stopAndWait({ timeoutMs: 1_000 }).catch(() => {});
        }
        reject(result.error);
        return;
      }
      resolve(result.client as GatewayClient);
    }

    client = new GatewayClient({
      url: params.url,
      token: params.token,
      clientName: GATEWAY_CLIENT_NAMES.TEST,
      clientDisplayName: "vitest-codex-harness-live",
      clientVersion: "dev",
      mode: GATEWAY_CLIENT_MODES.TEST,
      connectChallengeTimeoutMs: GATEWAY_CONNECT_TIMEOUT_MS,
      deviceIdentity: params.deviceIdentity,
      onHelloOk: () => finish({ client }),
      onConnectError: (error) => finish({ error }),
      onClose: (code, reason) => {
        finish({ error: new Error(`gateway closed during connect (${code}): ${reason}`) });
      },
    });
    client.start();
  });
}

/**
 * Creates `<tempDir>/workspace` and writes a small `AGENTS.md` into it that
 * tells the agent to follow exact reply instructions.
 *
 * @param tempDir Parent directory for the workspace.
 * @returns The absolute path of the created workspace directory.
 */
async function createLiveWorkspace(tempDir: string): Promise<string> {
  const workspace = path.join(tempDir, "workspace");
  await fs.mkdir(workspace, { recursive: true });
  await fs.writeFile(
    path.join(workspace, "AGENTS.md"),
    [
      "# AGENTS.md",
      "",
      "Follow exact reply instructions from the user.",
      "Do not add commentary when asked for an exact response.",
    ].join("\n"),
  );
  return workspace;
}

/**
 * Writes the gateway config used by the live test as pretty-printed JSON.
 *
 * The config selects a local token-authenticated gateway on `params.port`,
 * allows only the `codex` plugin, and makes `params.modelKey` the primary
 * model with sandboxing off and bootstrap skipped.
 *
 * @param params.configPath Destination file.
 * @param params.modelKey Model reference used as primary model.
 * @param params.port Gateway port.
 * @param params.token Gateway auth token.
 * @param params.workspace Agent workspace directory.
 */
async function writeLiveGatewayConfig(params: {
  configPath: string;
  modelKey: string;
  port: number;
  token: string;
  workspace: string;
}): Promise<void> {
  const cfg: OpenClawConfig = {
    gateway: {
      mode: "local",
      port: params.port,
      auth: { mode: "token", token: params.token },
    },
    plugins: { allow: ["codex"] },
    agents: {
      defaults: {
        workspace: params.workspace,
        skipBootstrap: true,
        model: { primary: params.modelKey },
        models: { [params.modelKey]: {} },
        sandbox: { mode: "off" },
      },
    },
  };
  await fs.writeFile(params.configPath, `${JSON.stringify(cfg, null, 2)}\n`);
}

/**
 * Sends one `agent` request and returns the extracted reply text.
 *
 * @param params.client Connected gateway client.
 * @param params.expectedToken Substring the reply must contain.
 * @param params.message Prompt sent to the agent.
 * @param params.sessionKey Session the turn runs in.
 * @returns The reply text.
 * @throws If the final payload status is not "ok"; the `expect` assertion
 *   fails if the reply does not contain `expectedToken`.
 */
async function requestAgentText(params: {
  client: GatewayClient;
  expectedToken: string;
  message: string;
  sessionKey: string;
}): Promise<string> {
  const { extractPayloadText } = await import("./test-helpers.agent-results.js");
  const payload = await params.client.request(
    "agent",
    {
      sessionKey: params.sessionKey,
      idempotencyKey: `idem-${randomUUID()}`,
      message: params.message,
      deliver: false,
      thinking: "low",
    },
    { expectFinal: true },
  );
  if (payload?.status !== "ok") {
    throw new Error(`agent status=${String(payload?.status)} payload=${JSON.stringify(payload)}`);
  }
  const text = extractPayloadText(payload.result);
  expect(text).toContain(params.expectedToken);
  return text;
}

/**
 * Sends a PNG attachment with a one-word classification prompt and hands the
 * reply to `assertLiveImageProbeReply`.
 *
 * @param params.client Connected gateway client.
 * @param params.sessionKey Session the turn runs in.
 * @throws If the final payload status is not "ok".
 */
async function verifyCodexImageProbe(params: {
  client: GatewayClient;
  sessionKey: string;
}): Promise<void> {
  const runId = randomUUID();
  const payload = await params.client.request(
    "agent",
    {
      sessionKey: params.sessionKey,
      idempotencyKey: `idem-${runId}-image`,
      message:
        "Best match for the image: lobster, mouse, cat, horse. " +
        "Reply with one lowercase word only.",
      attachments: [
        {
          mimeType: "image/png",
          fileName: `codex-probe-${runId}.png`,
          content: renderCatFacePngBase64(),
        },
      ],
      deliver: false,
      thinking: "low",
    },
    { expectFinal: true },
  );
  if (payload?.status !== "ok") {
    throw new Error(`image probe failed: status=${String(payload?.status)}`);
  }
  const { extractPayloadText } = await import("./test-helpers.agent-results.js");
  assertLiveImageProbeReply(extractPayloadText(payload.result));
}

/**
 * Asks the agent to create a cron job, then checks through the CLI that the
 * job exists and matches the probe spec, and finally removes it.
 *
 * The agent request is tried at most twice; the second attempt only runs if
 * the CLI lookup after the first one returned no job.
 *
 * @param params.client Connected gateway client.
 * @param params.env Environment passed to the CLI invocations.
 * @param params.port Gateway port used for the CLI `--url`.
 * @param params.sessionKey Session the turn runs in.
 * @param params.token Gateway auth token used for the CLI `--token`.
 * @throws If an agent turn does not end with status "ok", or if no matching
 *   job is found after the last attempt.
 */
async function verifyCodexCronMcpProbe(params: {
  client: GatewayClient;
  env: NodeJS.ProcessEnv;
  port: number;
  sessionKey: string;
  token: string;
}): Promise<void> {
  // Loop-invariant, so resolve the helper once instead of per attempt.
  const { extractPayloadText } = await import("./test-helpers.agent-results.js");
  const cronProbe = createLiveCronProbeSpec();
  let createdJob: CronListJob | undefined;
  let lastReply = "";

  for (let attempt = 0; attempt < 2 && !createdJob; attempt += 1) {
    const runId = randomUUID();
    const payload = await params.client.request(
      "agent",
      {
        sessionKey: params.sessionKey,
        idempotencyKey: `idem-${runId}-mcp-${attempt}`,
        message: buildLiveCronProbeMessage({
          agent: "codex",
          argsJson: cronProbe.argsJson,
          attempt,
          exactReply: cronProbe.name,
        }),
        deliver: false,
        thinking: "low",
      },
      { expectFinal: true },
    );
    if (payload?.status !== "ok") {
      throw new Error(`cron mcp probe failed: status=${String(payload?.status)}`);
    }
    // Kept only for the failure message below.
    lastReply = extractPayloadText(payload.result).trim();
    createdJob = await assertCronJobVisibleViaCli({
      port: params.port,
      token: params.token,
      env: params.env,
      expectedName: cronProbe.name,
      expectedMessage: cronProbe.message,
    });
  }

  if (!createdJob) {
    throw new Error(
      `cron cli verify could not find job ${cronProbe.name}: reply=${JSON.stringify(lastReply)}`,
    );
  }
  assertCronJobMatches({
    job: createdJob,
    expectedName: cronProbe.name,
    expectedMessage: cronProbe.message,
    expectedSessionKey: params.sessionKey,
  });
  if (createdJob.id) {
    await runOpenClawCliJson(
      [
        "cron",
        "rm",
        createdJob.id,
        "--json",
        "--url",
        `ws://127.0.0.1:${params.port}`,
        "--token",
        params.token,
      ],
      params.env,
    );
  }
}

describeLive("gateway live (Codex harness)", () => {
  it(
    "runs gateway agent turns through the plugin-owned Codex app-server harness",
    async () => {
      const modelKey = process.env.OPENCLAW_LIVE_CODEX_HARNESS_MODEL ?? DEFAULT_CODEX_MODEL;
      const openaiKey = process.env.OPENAI_API_KEY?.trim();
      if (!openaiKey) {
        throw new Error("OPENAI_API_KEY is required for the Codex harness live test.");
      }
      const { clearRuntimeConfigSnapshot } = await import("../config/config.js");
      const { startGatewayServer } = await import("./server.js");

      const previousEnv = snapshotEnv();
      const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-live-codex-harness-"));
      // Declared outside the try so the finally block can release whatever
      // was actually created, even when setup fails part-way through.
      let server: Awaited<ReturnType<typeof startGatewayServer>> | undefined;
      let client: GatewayClient | undefined;

      try {
        const stateDir = path.join(tempDir, "state");
        const workspace = await createLiveWorkspace(tempDir);
        const configPath = path.join(tempDir, "openclaw.json");
        const token = `test-${randomUUID()}`;
        const port = await getFreeGatewayPort();

        clearRuntimeConfigSnapshot();
        process.env.OPENCLAW_AGENT_RUNTIME = "codex";
        process.env.OPENCLAW_CONFIG_PATH = configPath;
        process.env.OPENCLAW_GATEWAY_TOKEN = token;
        process.env.OPENCLAW_SKIP_BROWSER_CONTROL_SERVER = "1";
        process.env.OPENCLAW_SKIP_CANVAS_HOST = "1";
        process.env.OPENCLAW_SKIP_CHANNELS = "1";
        process.env.OPENCLAW_SKIP_CRON = "1";
        process.env.OPENCLAW_SKIP_GMAIL_WATCHER = "1";
        process.env.OPENCLAW_STATE_DIR = stateDir;

        await fs.mkdir(stateDir, { recursive: true });
        await writeLiveGatewayConfig({ configPath, modelKey, port, token, workspace });
        const deviceIdentity = await ensurePairedTestGatewayClientIdentity();
        logCodexLiveStep("config-written", { configPath, modelKey, port });

        server = await startGatewayServer(port, {
          bind: "loopback",
          auth: { mode: "token", token },
          controlUiEnabled: false,
        });
        client = await connectTestGatewayClient({
          url: `ws://127.0.0.1:${port}`,
          token,
          deviceIdentity,
        });
        logCodexLiveStep("client-connected");

        const sessionKey = "agent:dev:live-codex-harness";
        const firstNonce = randomBytes(3).toString("hex").toUpperCase();
        const firstToken = `CODEX-HARNESS-${firstNonce}`;
        const firstText = await requestAgentText({
          client,
          sessionKey,
          expectedToken: firstToken,
          message: `Reply with exactly ${firstToken} and nothing else.`,
        });
        logCodexLiveStep("first-turn", { firstText });

        // Second turn reuses the same session key as the first one.
        const secondNonce = randomBytes(3).toString("hex").toUpperCase();
        const secondToken = `CODEX-HARNESS-RESUME-${secondNonce}`;
        const secondText = await requestAgentText({
          client,
          sessionKey,
          expectedToken: secondToken,
          message: `Reply with exactly ${secondToken} and nothing else. Do not repeat ${firstToken}.`,
        });
        logCodexLiveStep("second-turn", { secondText });

        if (CODEX_HARNESS_IMAGE_PROBE) {
          logCodexLiveStep("image-probe:start", { sessionKey });
          await verifyCodexImageProbe({ client, sessionKey });
          logCodexLiveStep("image-probe:done");
        }

        if (CODEX_HARNESS_MCP_PROBE) {
          logCodexLiveStep("cron-mcp-probe:start", { sessionKey });
          await verifyCodexCronMcpProbe({
            client,
            sessionKey,
            port,
            token,
            env: process.env,
          });
          logCodexLiveStep("cron-mcp-probe:done");
        }
      } finally {
        // Nested finally blocks keep the original teardown order while making
        // sure a failing step cannot skip the ones after it.
        clearRuntimeConfigSnapshot();
        try {
          await client?.stopAndWait();
        } finally {
          try {
            await server?.close();
          } finally {
            restoreEnv(previousEnv);
            await fs.rm(tempDir, { recursive: true, force: true });
          }
        }
      }
    },
    CODEX_HARNESS_TIMEOUT_MS,
  );
});

// Placeholder suite so the file still reports a test when the live lane is off.
describeDisabled("gateway live (Codex harness disabled)", () => {
  it("is opt-in", () => {
    expect(CODEX_HARNESS_LIVE).toBe(false);
  });
});