fix(agents): accept inbound media refs across tools

This commit is contained in:
Peter Steinberger
2026-04-25 00:48:53 +01:00
parent 4e9c83d4d8
commit 14e0a8c2bc
10 changed files with 256 additions and 91 deletions

View File

@@ -371,4 +371,34 @@ describe("detectAndLoadPromptImages", () => {
await fs.rm(stateDir, { recursive: true, force: true });
}
});
// Regression test: an absolute image path under <stateDir>/media/inbound is
// detected and loaded as a prompt image even though workspaceOnly is enabled
// and the file lives outside workspaceDir (<stateDir>/workspace-agent).
it("loads managed inbound absolute paths when workspaceOnly is enabled", async () => {
  const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-native-image-managed-"));
  // Everything after mkdtemp runs inside the try so that a failing fixture step
  // (mkdir/writeFile) still removes the temp directory and restores env stubs.
  try {
    const workspaceDir = path.join(stateDir, "workspace-agent");
    const inboundDir = path.join(stateDir, "media", "inbound");
    await fs.mkdir(workspaceDir, { recursive: true });
    await fs.mkdir(inboundDir, { recursive: true });
    const imagePath = path.join(inboundDir, "signal-replay.png");
    // Tiny base64-encoded PNG used as the on-disk image fixture.
    const pngB64 =
      "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/woAAn8B9FD5fHAAAAAASUVORK5CYII=";
    await fs.writeFile(imagePath, Buffer.from(pngB64, "base64"));
    // Point the state dir at the temp tree; presumably this is how the managed
    // media/inbound location is resolved — confirm against the media store.
    vi.stubEnv("OPENCLAW_STATE_DIR", stateDir);
    const result = await detectAndLoadPromptImages({
      prompt: `Inspect ${imagePath}`,
      workspaceDir,
      model: { input: ["text", "image"] },
      workspaceOnly: true,
    });
    // Exactly one reference is found, loaded, and nothing is skipped.
    expect(result.detectedRefs).toHaveLength(1);
    expect(result.loadedCount).toBe(1);
    expect(result.skippedCount).toBe(0);
    expect(result.images).toHaveLength(1);
  } finally {
    vi.unstubAllEnvs();
    await fs.rm(stateDir, { recursive: true, force: true });
  }
});
});

View File

@@ -3,7 +3,6 @@ import type { ImageContent } from "@mariozechner/pi-ai";
import { formatErrorMessage } from "../../../infra/errors.js";
import { assertNoWindowsNetworkPath, safeFileURLToPath } from "../../../infra/local-file-access.js";
import type { PromptImageOrderEntry } from "../../../media/prompt-image-order.js";
import { resolveMediaBufferPath, getMediaDir } from "../../../media/store.js";
import { loadWebMedia } from "../../../media/web-media.js";
import { normalizeLowercaseStringOrEmpty } from "../../../shared/string-coerce.js";
import { resolveUserPath } from "../../../utils.js";
@@ -12,7 +11,6 @@ import {
createSandboxBridgeReadFile,
resolveSandboxedBridgeMediaPath,
} from "../../sandbox-media-paths.js";
import { assertSandboxPath } from "../../sandbox-paths.js";
import type { SandboxFsBridge } from "../../sandbox/fs-bridge.js";
import { sanitizeImageBlocks } from "../../tool-images.js";
import { log } from "../logger.js";
@@ -353,44 +351,6 @@ export async function loadImageFromRef(
sandbox?: { root: string; bridge: SandboxFsBridge };
},
): Promise<ImageContent | null> {
// Handle Gateway claim-check URIs (media://inbound/<id>).
// These are written by the Gateway's offload path and point to files that
// the Gateway has already validated and persisted. They are intentionally
// exempt from workspaceOnly checks because they live in the media store
// managed by the Gateway, not in the agent workspace.
if (ref.type === "media-uri") {
const uriMatch = ref.resolved.match(MEDIA_URI_REGEX);
if (!uriMatch) {
log.debug(`Native image: malformed media URI, skipping: ${ref.resolved}`);
return null;
}
const mediaId = uriMatch[1];
try {
// resolveMediaBufferPath accepts the media ID (with optional extension
// and original-filename prefix) and returns the absolute path of the
// persisted file. It applies its own guards against path traversal,
// symlinks, and null bytes.
const physicalPath = await resolveMediaBufferPath(mediaId, "inbound");
const media = await loadWebMedia(physicalPath, {
maxBytes: options?.maxBytes,
localRoots: [getMediaDir()],
});
if (media.kind !== "image") {
log.debug(`Native image: media store entry is not an image: ${mediaId}`);
return null;
}
const mimeType = media.contentType ?? "image/jpeg";
const data = media.buffer.toString("base64");
log.debug(`Native image: loaded media-uri ${ref.resolved} -> ${physicalPath}`);
return { type: "image", data, mimeType };
} catch (err) {
log.debug(
`Native image: failed to load media-uri ${ref.resolved}: ${formatErrorMessage(err)}`,
);
return null;
}
}
try {
let targetPath = ref.resolved;
@@ -415,14 +375,6 @@ export async function loadImageFromRef(
} else if (!path.isAbsolute(targetPath)) {
targetPath = path.resolve(workspaceDir, targetPath);
}
if (options?.workspaceOnly && !options?.sandbox) {
const root = options?.sandbox?.root ?? workspaceDir;
await assertSandboxPath({
filePath: targetPath,
cwd: root,
root,
});
}
// loadWebMedia handles local file paths (including file:// URLs)
const media = options?.sandbox
@@ -431,7 +383,12 @@ export async function loadImageFromRef(
sandboxValidated: true,
readFile: createSandboxBridgeReadFile({ sandbox: options.sandbox }),
})
: await loadWebMedia(targetPath, options?.maxBytes);
: await loadWebMedia(
targetPath,
options?.workspaceOnly
? { maxBytes: options.maxBytes, localRoots: [workspaceDir] }
: options?.maxBytes,
);
if (media.kind !== "image") {
log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`);

View File

@@ -685,6 +685,23 @@ describe("createImageGenerateTool", () => {
);
});
// Edit mode: a managed inbound reference given as the `image` argument is
// expected to reach loadWebMedia as the same media://inbound/... string.
it("accepts managed inbound reference images for edit mode", async () => {
  const managedRef = "media://inbound/reference.png";
  stubEditedImageFlow({ width: 1024, height: 1024 });
  const editTool = createToolWithPrimaryImageModel("google/gemini-3-pro-image-preview", {
    workspaceDir: process.cwd(),
  });

  const editArgs = { prompt: "Use this reference.", image: managedRef };
  await editTool.execute("call-edit-managed", editArgs);

  // Only the first argument is pinned; the options object may take any shape.
  expect(webMedia.loadWebMedia).toHaveBeenCalledWith(managedRef, expect.any(Object));
});
it("ignores non-finite mediaMaxMb when loading reference images", async () => {
stubImageGenerationProviders();
stubEditedImageFlow({ width: 3200, height: 1800 });

View File

@@ -20,6 +20,10 @@ import type {
} from "../../image-generation/types.js";
import { resolveConfiguredMediaMaxBytes } from "../../media/configured-max-bytes.js";
import { getImageMetadata } from "../../media/image-ops.js";
import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
@@ -426,16 +430,15 @@ async function loadReferenceImages(params: {
for (const imageRawInput of params.imageInputs) {
const trimmed = imageRawInput.trim();
const imageRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
const imageRaw = normalizeMediaReferenceSource(
trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed,
);
if (!imageRaw) {
throw new ToolInputError("image required (empty string in array)");
}
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
const isFileUrl = /^file:/i.test(imageRaw);
const isHttpUrl = /^https?:\/\//i.test(imageRaw);
const isDataUrl = /^data:/i.test(imageRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
const refInfo = classifyMediaReferenceSource(imageRaw);
const { isDataUrl, isHttpUrl } = refInfo;
if (refInfo.hasUnsupportedScheme) {
throw new ToolInputError(
`Unsupported image reference: ${imageRawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
);

View File

@@ -1383,6 +1383,68 @@ describe("image tool MiniMax VLM routing", () => {
});
});
// Image tool coverage for managed inbound media: files stored under
// <stateDir>/media/inbound must be usable both as media://inbound/<id> refs and
// as absolute paths, even when fsPolicy.workspaceOnly is enabled.
describe("image tool managed inbound media", () => {
  const priorFetch = global.fetch;

  // Undo env stubs, the fetch stub, and provider test overrides after each case.
  afterEach(() => {
    vi.unstubAllEnvs();
    global.fetch = priorFetch;
    imageProviderHarness.reset();
    __testing.setProviderDepsForTest();
  });

  /**
   * Creates a temp state dir containing one PNG under media/inbound, stubs
   * OPENCLAW_STATE_DIR to point at it, and invokes `run` with the fixture
   * locations. The temp dir is always removed afterwards; the env stub is
   * undone by this suite's afterEach.
   */
  async function withManagedInboundPng(
    run: (params: { stateDir: string; mediaId: string; mediaPath: string }) => Promise<void>,
  ) {
    const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-image-managed-inbound-"));
    // Fixture setup runs inside the try so a failed mkdir/writeFile does not
    // leave the temp directory behind.
    try {
      const inboundDir = path.join(stateDir, "media", "inbound");
      const mediaId = "claim-check-test.png";
      const mediaPath = path.join(inboundDir, mediaId);
      await fs.mkdir(inboundDir, { recursive: true });
      await fs.writeFile(mediaPath, Buffer.from(ONE_PIXEL_PNG_B64, "base64"));
      vi.stubEnv("OPENCLAW_STATE_DIR", stateDir);
      await run({ stateDir, mediaId, mediaPath });
    } finally {
      await fs.rm(stateDir, { recursive: true, force: true });
    }
  }

  it("resolves media://inbound refs", async () => {
    await withManagedInboundPng(async ({ mediaId }) => {
      installImageUnderstandingProviderStubs();
      const fetch = stubMinimaxOkFetch();
      await withTempAgentDir(async (agentDir) => {
        const tool = createRequiredImageTool({
          config: createMinimaxImageConfig(),
          agentDir,
          fsPolicy: { workspaceOnly: true },
        });
        await expectImageToolExecOk(tool, `media://inbound/${mediaId}`);
        // One provider request means the image was loaded and forwarded.
        expect(fetch).toHaveBeenCalledTimes(1);
      });
    });
  });

  it("allows managed inbound absolute paths when workspaceOnly is enabled", async () => {
    await withManagedInboundPng(async ({ mediaPath }) => {
      installImageUnderstandingProviderStubs();
      const fetch = stubMinimaxOkFetch();
      await withTempAgentDir(async (agentDir) => {
        const tool = createRequiredImageTool({
          config: createMinimaxImageConfig(),
          agentDir,
          fsPolicy: { workspaceOnly: true },
        });
        // Same fixture, referenced by its absolute on-disk path instead of a URI.
        await expectImageToolExecOk(tool, mediaPath);
        expect(fetch).toHaveBeenCalledTimes(1);
      });
    });
  });
});
describe("image tool response validation", () => {
function createAssistantMessage(
overrides: Partial<{

View File

@@ -7,6 +7,10 @@ import {
} from "../../media-understanding/defaults.js";
import { getMediaUnderstandingProvider } from "../../media-understanding/provider-registry.js";
import { buildProviderRegistry } from "../../media-understanding/runner.js";
import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { loadWebMedia } from "../../media/web-media.js";
import {
describeImageWithModel,
@@ -404,17 +408,16 @@ export function createImageTool(options?: {
throw new Error("image required (empty string in array)");
}
const normalizedRef = normalizeMediaReferenceSource(imageRaw);
// The tool accepts file paths, file/data URLs, or http(s) URLs. In some
// agent/model contexts, images can be referenced as pseudo-URIs like
// `image:0` (e.g. "first image in the prompt"). We don't have access to a
// shared image registry here, so fail gracefully instead of attempting to
// `fs.readFile("image:0")` and producing a noisy ENOENT.
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(imageRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(imageRaw);
const isFileUrl = /^file:/i.test(imageRaw);
const isHttpUrl = /^https?:\/\//i.test(imageRaw);
const isDataUrl = /^data:/i.test(imageRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
const refInfo = classifyMediaReferenceSource(normalizedRef);
const { isDataUrl, isFileUrl, isHttpUrl } = refInfo;
if (refInfo.hasUnsupportedScheme) {
return {
content: [
{
@@ -435,10 +438,10 @@ export function createImageTool(options?: {
const resolvedImage = (() => {
if (sandboxConfig) {
return imageRaw;
return normalizedRef;
}
if (imageRaw.startsWith("~")) {
return resolveUserPath(imageRaw);
if (normalizedRef.startsWith("~")) {
return resolveUserPath(normalizedRef);
}
// Resolve relative paths against workspaceDir so agents can reference
// workspace-relative paths (e.g. "inbox/photo.png") without needing to
@@ -447,13 +450,13 @@ export function createImageTool(options?: {
!isDataUrl &&
!isFileUrl &&
!isHttpUrl &&
!looksLikeWindowsDrivePath &&
!isAbsolute(imageRaw) &&
!refInfo.looksLikeWindowsDrivePath &&
!isAbsolute(normalizedRef) &&
options?.workspaceDir
) {
return resolve(options.workspaceDir, imageRaw);
return resolve(options.workspaceDir, normalizedRef);
}
return imageRaw;
return normalizedRef;
})();
const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
? { resolved: "" }

View File

@@ -4,6 +4,10 @@ import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { resolveConfiguredMediaMaxBytes } from "../../media/configured-max-bytes.js";
import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { resolveMusicGenerationModeCapabilities } from "../../music-generation/capabilities.js";
@@ -247,16 +251,15 @@ async function loadReferenceImages(params: {
for (const rawInput of params.inputs) {
const trimmed = rawInput.trim();
const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
const inputRaw = normalizeMediaReferenceSource(
trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed,
);
if (!inputRaw) {
throw new ToolInputError("image required (empty string in array)");
}
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
const isFileUrl = /^file:/i.test(inputRaw);
const isHttpUrl = /^https?:\/\//i.test(inputRaw);
const isDataUrl = /^data:/i.test(inputRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
const refInfo = classifyMediaReferenceSource(inputRaw);
const { isDataUrl, isHttpUrl } = refInfo;
if (refInfo.hasUnsupportedScheme) {
throw new ToolInputError(
`Unsupported image reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
);

View File

@@ -74,12 +74,16 @@ function withPdfModel(primary: string): OpenClawConfig {
async function stubPdfToolInfra(
agentDir: string,
params?: {
mockLoad?: boolean;
provider?: string;
input?: string[];
modelFound?: boolean;
},
) {
const loadSpy = vi.spyOn(webMedia, "loadWebMediaRaw").mockResolvedValue(FAKE_PDF_MEDIA as never);
const loadSpy = vi.spyOn(webMedia, "loadWebMediaRaw");
if (params?.mockLoad !== false) {
loadSpy.mockResolvedValue(FAKE_PDF_MEDIA as never);
}
vi.spyOn(modelDiscovery, "discoverAuthStorage").mockReturnValue({
setRuntimeApiKey: vi.fn(),
@@ -106,6 +110,23 @@ async function stubPdfToolInfra(
return { loadSpy };
}
/**
 * Runs `run` against a temporary state directory that contains a single PDF
 * (FAKE_PDF_MEDIA.buffer) at media/inbound/claim-check-test.pdf, with
 * OPENCLAW_STATE_DIR stubbed to that directory.
 *
 * The temp directory is removed once `run` settles, and also when fixture
 * setup itself fails. The env stub is not undone here.
 * NOTE(review): confirm the calling suite restores env stubs (e.g. via
 * vi.unstubAllEnvs() in an afterEach).
 *
 * @param run Callback receiving the state dir, the media id (file name), and
 *   the absolute path of the PDF fixture.
 */
async function withManagedInboundPdf(
  run: (params: { stateDir: string; mediaId: string; mediaPath: string }) => Promise<void>,
) {
  const stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "openclaw-pdf-managed-inbound-"));
  // Setup lives inside the try so a failed mkdir/writeFile cannot leak the
  // freshly created temp directory.
  try {
    const inboundDir = path.join(stateDir, "media", "inbound");
    const mediaId = "claim-check-test.pdf";
    const mediaPath = path.join(inboundDir, mediaId);
    await fs.mkdir(inboundDir, { recursive: true });
    await fs.writeFile(mediaPath, FAKE_PDF_MEDIA.buffer);
    vi.stubEnv("OPENCLAW_STATE_DIR", stateDir);
    await run({ stateDir, mediaId, mediaPath });
  } finally {
    await fs.rm(stateDir, { recursive: true, force: true });
  }
}
describe("createPdfTool", () => {
const priorFetch = global.fetch;
@@ -194,6 +215,71 @@ describe("createPdfTool", () => {
});
});
// With workspaceOnly enabled, a media://inbound/<id> PDF reference must be
// handed to loadWebMediaRaw unchanged (with an empty localRoots list) and the
// native Anthropic analysis result must come back as the tool output.
it("resolves media://inbound PDF refs", async () => {
  await withManagedInboundPdf(async ({ mediaId }) => {
    const managedRef = `media://inbound/${mediaId}`;
    await withTempPdfAgentDir(async (agentDir) => {
      // mockLoad: false leaves the loadWebMediaRaw spy without a canned result.
      const infra = await stubPdfToolInfra(agentDir, {
        mockLoad: false,
        provider: "anthropic",
        input: ["text", "document"],
      });
      vi.spyOn(pdfNativeProviders, "anthropicAnalyzePdf").mockResolvedValue("native summary");

      const config = withPdfModel(ANTHROPIC_PDF_MODEL);
      const createPdfTool = await loadCreatePdfTool();
      const pdfTool = requirePdfTool(
        createPdfTool({ config, agentDir, fsPolicy: { workspaceOnly: true } }),
      );

      const outcome = await pdfTool.execute("t1", { prompt: "summarize", pdf: managedRef });

      expect(infra.loadSpy).toHaveBeenCalledWith(
        managedRef,
        expect.objectContaining({ localRoots: [] }),
      );
      expect(outcome).toMatchObject({
        content: [{ type: "text", text: "native summary" }],
        details: { native: true, model: ANTHROPIC_PDF_MODEL },
      });
    });
  });
});
// Same fixture as the media:// case, but referenced by its absolute on-disk
// path: execution must succeed under workspaceOnly and the loader must be
// called with that path.
it("allows managed inbound absolute PDF paths when workspaceOnly is enabled", async () => {
  await withManagedInboundPdf(async ({ mediaPath }) => {
    await withTempPdfAgentDir(async (agentDir) => {
      // mockLoad: false leaves the loadWebMediaRaw spy without a canned result.
      const infra = await stubPdfToolInfra(agentDir, {
        mockLoad: false,
        provider: "anthropic",
        input: ["text", "document"],
      });
      vi.spyOn(pdfNativeProviders, "anthropicAnalyzePdf").mockResolvedValue("native summary");

      const config = withPdfModel(ANTHROPIC_PDF_MODEL);
      const createPdfTool = await loadCreatePdfTool();
      const pdfTool = requirePdfTool(
        createPdfTool({ config, agentDir, fsPolicy: { workspaceOnly: true } }),
      );

      await pdfTool.execute("t1", { prompt: "summarize", pdf: mediaPath });

      // Only the path argument is pinned; loader options may take any shape.
      expect(infra.loadSpy).toHaveBeenCalledWith(mediaPath, expect.any(Object));
    });
  });
});
it("uses native PDF path without eager extraction", async () => {
await withTempPdfAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, { provider: "anthropic", input: ["text", "document"] });

View File

@@ -1,6 +1,10 @@
import { type Context, complete } from "@mariozechner/pi-ai";
import { Type } from "typebox";
import type { OpenClawConfig } from "../../config/types.openclaw.js";
import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { extractPdfContent, type PdfExtractedContent } from "../../media/pdf-extract.js";
import { loadWebMediaRaw } from "../../media/web-media.js";
import {
@@ -331,14 +335,11 @@ export function createPdfTool(options?: {
}> = [];
for (const pdfRaw of pdfInputs) {
const trimmed = pdfRaw.trim();
const isHttpUrl = /^https?:\/\//i.test(trimmed);
const isFileUrl = /^file:/i.test(trimmed);
const isDataUrl = /^data:/i.test(trimmed);
const looksLikeWindowsDrive = /^[a-zA-Z]:[\\/]/.test(trimmed);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(trimmed);
const trimmed = normalizeMediaReferenceSource(pdfRaw);
const refInfo = classifyMediaReferenceSource(trimmed);
const { isHttpUrl } = refInfo;
if (hasScheme && !looksLikeWindowsDrive && !isFileUrl && !isHttpUrl && !isDataUrl) {
if (refInfo.hasUnsupportedScheme) {
return {
content: [
{

View File

@@ -4,6 +4,10 @@ import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { resolveConfiguredMediaMaxBytes } from "../../media/configured-max-bytes.js";
import {
classifyMediaReferenceSource,
normalizeMediaReferenceSource,
} from "../../media/media-reference.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { readSnakeCaseParamRaw } from "../../param-key.js";
@@ -441,16 +445,15 @@ async function loadReferenceAssets(params: {
for (const rawInput of params.inputs) {
const trimmed = rawInput.trim();
const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
const inputRaw = normalizeMediaReferenceSource(
trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed,
);
if (!inputRaw) {
throw new ToolInputError(`${params.expectedKind} required (empty string in array)`);
}
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
const isFileUrl = /^file:/i.test(inputRaw);
const isHttpUrl = /^https?:\/\//i.test(inputRaw);
const isDataUrl = /^data:/i.test(inputRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
const refInfo = classifyMediaReferenceSource(inputRaw);
const { isDataUrl, isHttpUrl } = refInfo;
if (refInfo.hasUnsupportedScheme) {
throw new ToolInputError(
`Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
);