From c81e9866ff3b7f5e7716d242255a6eeaff1cb9ee Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Thu, 26 Feb 2026 16:38:09 +0100 Subject: [PATCH] fix(pi): stop history image reinjection token blowup --- CHANGELOG.md | 1 + docs/pi.md | 4 + .../pi-embedded-runner/run/attempt.test.ts | 34 ++-- src/agents/pi-embedded-runner/run/attempt.ts | 93 +++++------ .../pi-embedded-runner/run/images.test.ts | 51 +----- src/agents/pi-embedded-runner/run/images.ts | 145 +----------------- 6 files changed, 69 insertions(+), 259 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 60977641ef9..21abc031de1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai ### Fixes +- Pi image-token usage: stop re-injecting history image blocks each turn, process image references from the current prompt only, and prune already-answered user-image blocks in stored history to prevent runaway token growth. (#27602) - Security/Gateway node pairing: pin paired-device `platform`/`deviceFamily` metadata across reconnects and bind those fields into device-auth signatures, so reconnect metadata spoofing cannot expand node command allowlists without explicit repair pairing. This ships in the next npm release (`2026.2.26`). Thanks @76embiid21 for reporting. - Security/Sandbox path alias guard: reject broken symlink targets by resolving through existing ancestors and failing closed on out-of-root targets, preventing workspace-only `apply_patch` writes from escaping sandbox/workspace boundaries via dangling symlinks. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting. - Security/Workspace FS boundary aliases: harden canonical boundary resolution for non-existent-leaf symlink aliases while preserving valid in-root aliases, preventing first-write workspace escapes via out-of-root symlink targets. This ships in the next npm release (`2026.2.26`). Thanks @tdjackey for reporting. diff --git a/docs/pi.md b/docs/pi.md index 944224da19c..2689b480963 100644 --- a/docs/pi.md +++ b/docs/pi.md @@ -232,6 +232,10 @@ await session.prompt(effectivePrompt, { images: imageResult.images }); The SDK handles the full agent loop: sending to LLM, executing tool calls, streaming responses. +Image injection is prompt-local: OpenClaw loads image refs from the current prompt and +passes them via `images` for that turn only. It does not re-scan older history turns +to re-inject image payloads. + ## Tool Architecture ### Tool Pipeline diff --git a/src/agents/pi-embedded-runner/run/attempt.test.ts b/src/agents/pi-embedded-runner/run/attempt.test.ts index 97a881cf849..022c7f989d8 100644 --- a/src/agents/pi-embedded-runner/run/attempt.test.ts +++ b/src/agents/pi-embedded-runner/run/attempt.test.ts @@ -3,24 +3,29 @@ import type { ImageContent } from "@mariozechner/pi-ai"; import { describe, expect, it, vi } from "vitest"; import type { OpenClawConfig } from "../../../config/config.js"; import { - injectHistoryImagesIntoMessages, + PRUNED_HISTORY_IMAGE_MARKER, + pruneProcessedHistoryImages, resolveAttemptFsWorkspaceOnly, resolvePromptBuildHookResult, resolvePromptModeForSession, } from "./attempt.js"; -describe("injectHistoryImagesIntoMessages", () => { +describe("pruneProcessedHistoryImages", () => { const image: ImageContent = { type: "image", data: "abc", mimeType: "image/png" }; - it("injects history images and converts string content", () => { + it("prunes image blocks from user messages that already have assistant replies", () => { const messages: AgentMessage[] = [ { role: "user", - content: "See /tmp/photo.png", + content: [{ type: "text", text: "See /tmp/photo.png" }, { ...image }], } as AgentMessage, + { + role: "assistant", + content: "got it", + } as unknown as AgentMessage, ]; - const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]])); + const didMutate = pruneProcessedHistoryImages(messages); expect(didMutate).toBe(true); const firstUser = messages[0] as Extract | undefined; @@ -28,10 +33,10 @@ describe("injectHistoryImagesIntoMessages", () => { const content = firstUser?.content as Array<{ type: string; text?: string; data?: string }>; expect(content).toHaveLength(2); expect(content[0]?.type).toBe("text"); - expect(content[1]).toMatchObject({ type: "image", data: "abc" }); + expect(content[1]).toMatchObject({ type: "text", text: PRUNED_HISTORY_IMAGE_MARKER }); }); - it("avoids duplicating existing image content", () => { + it("does not prune latest user message when no assistant response exists yet", () => { const messages: AgentMessage[] = [ { role: "user", @@ -39,7 +44,7 @@ describe("injectHistoryImagesIntoMessages", () => { } as AgentMessage, ]; - const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[0, [image]]])); + const didMutate = pruneProcessedHistoryImages(messages); expect(didMutate).toBe(false); const first = messages[0] as Extract | undefined; @@ -47,21 +52,22 @@ describe("injectHistoryImagesIntoMessages", () => { throw new Error("expected array content"); } expect(first.content).toHaveLength(2); + expect(first.content[1]).toMatchObject({ type: "image", data: "abc" }); }); - it("ignores non-user messages and out-of-range indices", () => { + it("does not change messages when no assistant turn exists", () => { const messages: AgentMessage[] = [ { - role: "assistant", + role: "user", content: "noop", - } as unknown as AgentMessage, + } as AgentMessage, ]; - const didMutate = injectHistoryImagesIntoMessages(messages, new Map([[1, [image]]])); + const didMutate = pruneProcessedHistoryImages(messages); expect(didMutate).toBe(false); - const firstAssistant = messages[0] as Extract | undefined; - expect(firstAssistant?.content).toBe("noop"); + const firstUser = messages[0] as Extract | undefined; + expect(firstUser?.content).toBe("noop"); }); }); diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index 64c9e23170c..b3590a530c7 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -1,7 +1,6 @@ import fs from "node:fs/promises"; import os from "node:os"; import type { AgentMessage } from "@mariozechner/pi-agent-core"; -import type { ImageContent } from "@mariozechner/pi-ai"; import { streamSimple } from "@mariozechner/pi-ai"; import { createAgentSession, @@ -128,51 +127,45 @@ type PromptBuildHookRunner = { ) => Promise; }; -export function injectHistoryImagesIntoMessages( - messages: AgentMessage[], - historyImagesByIndex: Map, -): boolean { - if (historyImagesByIndex.size === 0) { +export const PRUNED_HISTORY_IMAGE_MARKER = "[image data removed - already processed by model]"; + +/** + * Prunes image blocks from user messages that already have an assistant response after them. + * This is a one-way cleanup for previously persisted history image data. + */ +export function pruneProcessedHistoryImages(messages: AgentMessage[]): boolean { + let lastAssistantIndex = -1; + for (let i = messages.length - 1; i >= 0; i--) { + if (messages[i]?.role === "assistant") { + lastAssistantIndex = i; + break; + } + } + if (lastAssistantIndex < 0) { return false; } - let didMutate = false; - for (const [msgIndex, images] of historyImagesByIndex) { - // Bounds check: ensure index is valid before accessing - if (msgIndex < 0 || msgIndex >= messages.length) { + let didMutate = false; + for (let i = 0; i < lastAssistantIndex; i++) { + const message = messages[i]; + if (!message || message.role !== "user" || !Array.isArray(message.content)) { continue; } - const msg = messages[msgIndex]; - if (msg && msg.role === "user") { - // Convert string content to array format if needed - if (typeof msg.content === "string") { - msg.content = [{ type: "text", text: msg.content }]; - didMutate = true; + for (let j = 0; j < message.content.length; j++) { + const block = message.content[j]; + if (!block || typeof block !== "object") { + continue; } - if (Array.isArray(msg.content)) { - // Check for existing image content to avoid duplicates across turns - const existingImageData = new Set( - msg.content - .filter( - (c): c is ImageContent => - c != null && - typeof c === "object" && - c.type === "image" && - typeof c.data === "string", - ) - .map((c) => c.data), - ); - for (const img of images) { - // Only add if this image isn't already in the message - if (!existingImageData.has(img.data)) { - msg.content.push(img); - didMutate = true; - } - } + if ((block as { type?: string }).type !== "image") { + continue; } + message.content[j] = { + type: "text", + text: PRUNED_HISTORY_IMAGE_MARKER, + } as (typeof message.content)[number]; + didMutate = true; } } - return didMutate; } @@ -1092,16 +1085,20 @@ export async function runEmbeddedAttempt( } try { + // One-time migration: prune image blocks from already-processed user turns. + // This prevents old persisted base64 payloads from bloating subsequent prompts. + const didPruneImages = pruneProcessedHistoryImages(activeSession.messages); + if (didPruneImages) { + activeSession.agent.replaceMessages(activeSession.messages); + } + // Detect and load images referenced in the prompt for vision-capable models. - // This eliminates the need for an explicit "view" tool call by injecting - // images directly into the prompt when the model supports it. - // Also scans conversation history to enable follow-up questions about earlier images. + // Images are prompt-local only (pi-like behavior). const imageResult = await detectAndLoadPromptImages({ prompt: effectivePrompt, workspaceDir: effectiveWorkspace, model: params.model, existingImages: params.images, - historyMessages: activeSession.messages, maxBytes: MAX_IMAGE_BYTES, maxDimensionPx: resolveImageSanitizationLimits(params.config).maxDimensionPx, workspaceOnly: effectiveFsWorkspaceOnly, @@ -1112,21 +1109,10 @@ export async function runEmbeddedAttempt( : undefined, }); - // Inject history images into their original message positions. - // This ensures the model sees images in context (e.g., "compare to the first image"). - const didMutate = injectHistoryImagesIntoMessages( - activeSession.messages, - imageResult.historyImagesByIndex, - ); - if (didMutate) { - // Persist message mutations (e.g., injected history images) so we don't re-scan/reload. - activeSession.agent.replaceMessages(activeSession.messages); - } - cacheTrace?.recordStage("prompt:images", { prompt: effectivePrompt, messages: activeSession.messages, - note: `images: prompt=${imageResult.images.length} history=${imageResult.historyImagesByIndex.size}`, + note: `images: prompt=${imageResult.images.length}`, }); // Diagnostic: log context sizes before prompt to help debug early overflow errors. @@ -1143,7 +1129,6 @@ export async function runEmbeddedAttempt( `historyImageBlocks=${sessionSummary.totalImageBlocks} ` + `systemPromptChars=${systemLen} promptChars=${promptLen} ` + `promptImages=${imageResult.images.length} ` + - `historyImageMessages=${imageResult.historyImagesByIndex.size} ` + `provider=${params.provider}/${params.modelId} sessionFile=${params.sessionFile}`, ); } diff --git a/src/agents/pi-embedded-runner/run/images.test.ts b/src/agents/pi-embedded-runner/run/images.test.ts index f9cb846da40..410c9de8ff6 100644 --- a/src/agents/pi-embedded-runner/run/images.test.ts +++ b/src/agents/pi-embedded-runner/run/images.test.ts @@ -256,25 +256,15 @@ describe("detectAndLoadPromptImages", () => { expect(result.detectedRefs).toHaveLength(0); }); - it("skips history messages that already include image content", async () => { + it("returns no detected refs when prompt has no image references", async () => { const result = await detectAndLoadPromptImages({ prompt: "no images here", workspaceDir: "/tmp", model: { input: ["text", "image"] }, - historyMessages: [ - { - role: "user", - content: [ - { type: "text", text: "See /tmp/should-not-load.png" }, - { type: "image", data: "abc", mimeType: "image/png" }, - ], - }, - ], }); expect(result.detectedRefs).toHaveLength(0); expect(result.images).toHaveLength(0); - expect(result.historyImagesByIndex.size).toBe(0); }); it("blocks prompt image refs outside workspace when sandbox workspaceOnly is enabled", async () => { @@ -309,43 +299,4 @@ describe("detectAndLoadPromptImages", () => { await fs.rm(stateDir, { recursive: true, force: true }); } }); - - it("blocks history image refs outside workspace when sandbox workspaceOnly is enabled", async () => { - const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-native-image-sandbox-")); - const sandboxRoot = path.join(stateDir, "sandbox"); - const agentRoot = path.join(stateDir, "agent"); - await fs.mkdir(sandboxRoot, { recursive: true }); - await fs.mkdir(agentRoot, { recursive: true }); - const pngB64 = - "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII="; - await fs.writeFile(path.join(agentRoot, "secret.png"), Buffer.from(pngB64, "base64")); - const sandbox = createUnsafeMountedSandbox({ sandboxRoot, agentRoot }); - const bridge = sandbox.fsBridge; - if (!bridge) { - throw new Error("sandbox fs bridge missing"); - } - - try { - const result = await detectAndLoadPromptImages({ - prompt: "No inline image in this turn.", - workspaceDir: sandboxRoot, - model: { input: ["text", "image"] }, - workspaceOnly: true, - historyMessages: [ - { - role: "user", - content: [{ type: "text", text: "Previous image /agent/secret.png" }], - }, - ], - sandbox: { root: sandbox.workspaceDir, bridge }, - }); - - expect(result.detectedRefs).toHaveLength(1); - expect(result.loadedCount).toBe(0); - expect(result.skippedCount).toBe(1); - expect(result.historyImagesByIndex.size).toBe(0); - } finally { - await fs.rm(stateDir, { recursive: true, force: true }); - } - }); }); diff --git a/src/agents/pi-embedded-runner/run/images.ts b/src/agents/pi-embedded-runner/run/images.ts index 897e8ca16e2..71b7aeb2273 100644 --- a/src/agents/pi-embedded-runner/run/images.ts +++ b/src/agents/pi-embedded-runner/run/images.ts @@ -36,8 +36,6 @@ export interface DetectedImageRef { type: "path" | "url"; /** The resolved/normalized path or URL */ resolved: string; - /** Index of the message this ref was found in (for history images) */ - messageIndex?: number; } /** @@ -268,93 +266,6 @@ export function modelSupportsImages(model: { input?: string[] }): boolean { return model.input?.includes("image") ?? false; } -function extractTextFromMessage(message: unknown): string { - if (!message || typeof message !== "object") { - return ""; - } - const content = (message as { content?: unknown }).content; - if (typeof content === "string") { - return content; - } - if (!Array.isArray(content)) { - return ""; - } - const textParts: string[] = []; - for (const part of content) { - if (!part || typeof part !== "object") { - continue; - } - const record = part as Record; - if (record.type === "text" && typeof record.text === "string") { - textParts.push(record.text); - } - } - return textParts.join("\n").trim(); -} - -/** - * Extracts image references from conversation history messages. - * Scans user messages for image paths/URLs that can be loaded. - * Each ref includes the messageIndex so images can be injected at their original location. - * - * Note: Global deduplication is intentional - if the same image appears in multiple - * messages, we only inject it at the FIRST occurrence. This is sufficient because: - * 1. The model sees all message content including the image - * 2. Later references to "the image" or "that picture" will work since it's in context - * 3. Injecting duplicates would waste tokens and potentially hit size limits - */ -function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] { - const allRefs: DetectedImageRef[] = []; - const seen = new Set(); - - const messageHasImageContent = (msg: unknown): boolean => { - if (!msg || typeof msg !== "object") { - return false; - } - const content = (msg as { content?: unknown }).content; - if (!Array.isArray(content)) { - return false; - } - return content.some( - (part) => - part != null && typeof part === "object" && (part as { type?: string }).type === "image", - ); - }; - - for (let i = 0; i < messages.length; i++) { - const msg = messages[i]; - if (!msg || typeof msg !== "object") { - continue; - } - const message = msg as { role?: string }; - // Only scan user messages for image references - if (message.role !== "user") { - continue; - } - // Skip if message already has image content (prevents reloading each turn) - if (messageHasImageContent(msg)) { - continue; - } - - const text = extractTextFromMessage(msg); - if (!text) { - continue; - } - - const refs = detectImageReferences(text); - for (const ref of refs) { - const key = ref.resolved.toLowerCase(); - if (seen.has(key)) { - continue; - } - seen.add(key); - allRefs.push({ ...ref, messageIndex: i }); - } - } - - return allRefs; -} - /** * Detects and loads images referenced in a prompt for models with vision capability. * @@ -362,18 +273,14 @@ function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] { * loads them, and returns them as ImageContent array ready to be passed to * the model's prompt method. * - * Also scans conversation history for images from previous turns and returns - * them mapped by message index so they can be injected at their original location. - * * @param params Configuration for image detection and loading - * @returns Object with loaded images for current prompt and history images by message index + * @returns Object with loaded images for current prompt only */ export async function detectAndLoadPromptImages(params: { prompt: string; workspaceDir: string; model: { input?: string[] }; existingImages?: ImageContent[]; - historyMessages?: unknown[]; maxBytes?: number; maxDimensionPx?: number; workspaceOnly?: boolean; @@ -381,8 +288,6 @@ export async function detectAndLoadPromptImages(params: { }): Promise<{ /** Images for the current prompt (existingImages + detected in current prompt) */ images: ImageContent[]; - /** Images from history messages, keyed by message index */ - historyImagesByIndex: Map; detectedRefs: DetectedImageRef[]; loadedCount: number; skippedCount: number; @@ -391,7 +296,6 @@ export async function detectAndLoadPromptImages(params: { if (!modelSupportsImages(params.model)) { return { images: [], - historyImagesByIndex: new Map(), detectedRefs: [], loadedCount: 0, skippedCount: 0, @@ -399,38 +303,20 @@ export async function detectAndLoadPromptImages(params: { } // Detect images from current prompt - const promptRefs = detectImageReferences(params.prompt); - - // Detect images from conversation history (with message indices) - const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : []; - - // Deduplicate: if an image is in the current prompt, don't also load it from history. - // Current prompt images are passed via the `images` parameter to prompt(), while history - // images are injected into their original message positions. We don't want the same - // image loaded and sent twice (wasting tokens and potentially causing confusion). - const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase())); - const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase())); - - const allRefs = [...promptRefs, ...uniqueHistoryRefs]; + const allRefs = detectImageReferences(params.prompt); if (allRefs.length === 0) { return { images: params.existingImages ?? [], - historyImagesByIndex: new Map(), detectedRefs: [], loadedCount: 0, skippedCount: 0, }; } - log.debug( - `Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`, - ); + log.debug(`Native image: detected ${allRefs.length} image refs in prompt`); - // Load images for current prompt const promptImages: ImageContent[] = [...(params.existingImages ?? [])]; - // Load images for history, grouped by message index - const historyImagesByIndex = new Map(); let loadedCount = 0; let skippedCount = 0; @@ -442,18 +328,7 @@ export async function detectAndLoadPromptImages(params: { sandbox: params.sandbox, }); if (image) { - if (ref.messageIndex !== undefined) { - // History image - add to the appropriate message index - const existing = historyImagesByIndex.get(ref.messageIndex); - if (existing) { - existing.push(image); - } else { - historyImagesByIndex.set(ref.messageIndex, [image]); - } - } else { - // Current prompt image - promptImages.push(image); - } + promptImages.push(image); loadedCount++; log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`); } else { @@ -469,21 +344,9 @@ export async function detectAndLoadPromptImages(params: { "prompt:images", imageSanitization, ); - const sanitizedHistoryImagesByIndex = new Map(); - for (const [index, images] of historyImagesByIndex) { - const sanitized = await sanitizeImagesWithLog( - images, - `history:images:${index}`, - imageSanitization, - ); - if (sanitized.length > 0) { - sanitizedHistoryImagesByIndex.set(index, sanitized); - } - } return { images: sanitizedPromptImages, - historyImagesByIndex: sanitizedHistoryImagesByIndex, detectedRefs: allRefs, loadedCount, skippedCount,