mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-04 10:04:05 +00:00
* feat(browser): add optional vision understanding to screenshot tool
* fix(browser): wrap vision output as external content, enforce maxBytes, forward auth profiles
* fix(browser): remove no-op scope/attachments config, drop profile pass-through lacking runtime support
* feat(media-understanding): add profile/preferredProfile to DescribeImageFileWithModelParams and forward to describeImage
* style(browser): add curly braces to satisfy eslint curly rule
* fix(browser): correct tools.browser.enabled help text to match actual behavior
* fix(browser): thread agentDir/workspaceDir from plugin tool context into browser vision
* refactor(browser): move vision config from tools.browser to browser.models
The browser plugin's vision configuration now lives on the top-level
`browser` config namespace (browser.models, browser.visionEnabled,
browser.visionPrompt, etc.) instead of `tools.browser`. This aligns
with the plugin's existing config location and avoids confusion between
tool-level and plugin-level settings.
- Remove tools.browser from ToolsSchema and ToolsConfig
- Add models/vision* fields to BrowserConfig and its zod schema
- Update getBrowserVisionConfig to read from cfg.browser
- Update schema help, labels, and quality test
- Update vision.test.ts to use new config shape
* docs(browser): add screenshot vision configuration section
Document the new browser.models config for automatic screenshot
description via vision models, enabling text-only main models to
reason about web page content.
* fix(browser): remove deliverable media markers from vision result, drop unused import
P1: Vision-success path no longer exposes the raw screenshot as
deliverable media (removes MEDIA: line and details.media.mediaUrl).
This prevents channel delivery from auto-sending sensitive page content
when the intended output is a text description.
P2: Remove unused ToolsMediaUnderstandingSchema import that would fail
noUnusedLocals typecheck.
* fix(browser): add command/args fields to browser models schema
The browser vision model schema uses .strict(), so CLI-type entries
with command/args were rejected by TypeScript. Add these fields to
align with MediaUnderstandingModelSchema.
* chore(browser): remove debug console.log statements
* fix(browser): harden screenshot vision result against MEDIA: directive injection and restore image sanitization on failure fallback
ClawSweeper #84247 review round 2:
P1 (security, high): neutralize line-start MEDIA: directives in vision descriptions
before wrapping with wrapExternalContent. The agent media extractor scans every
browser tool-result text block via splitMediaFromOutput which treats line-start
MEDIA: as a trusted local-media delivery directive, and browser is on the
trusted-media allowlist. Without neutralization, page or vision-provider output
containing 'MEDIA:/tmp/secret.png' could synthesize a channel-deliverable media
artifact from untrusted content. wrapExternalContent itself does not strip
line-start directives. Introduce neutralizeMediaDirectives in vision.ts that
prepends '[neutralized] ' to any line whose trimStart() begins with MEDIA:
(case-insensitive), defanging the parser anchor while keeping the original
text human-readable.
P2 (compatibility): pass resolveRuntimeImageSanitization() to imageResultFromFile
in the vision-failure catch fallback. The non-vision screenshot path already
forwards this option (d5cc0d53b7) so configured agents.defaults.imageMaxDimensionPx
takes effect. Without this fix, any provider timeout/error silently bypasses the
sanitization guard and returns a raw full-resolution screenshot.
Regression coverage:
- vision.test.ts: 6 unit cases for neutralizeMediaDirectives (no-op fast path,
mid-line MEDIA: untouched, line-start defanged, leading-whitespace defanged,
case-insensitive, multiple directives per blob).
- browser-tool.test.ts: 2 integration cases that drive the full screenshot
tool execute path:
- 'neutralizes MEDIA: directives in vision text and does not attach media'
asserts no line matches /^\s*MEDIA:/i in returned text, secret path text
is preserved verbatim, details.media is absent, and imageResultFromFile
is not called on the success path.
- 'preserves screenshot image sanitization on vision failure fallback'
mocks describeImageFileWithModel to reject and asserts the fallback
imageResultFromFile call receives imageSanitization: {maxDimensionPx:1600}
plus the 'browser screenshot vision failed' extraText.
* fix(browser): apply clawsweeper fallback media fix from PR #84247
* refactor: reuse media image understanding for browser screenshots
* refactor: use structured media delivery
* test: update music completion media instruction expectation
* fix: trim buffered reply directive padding
* test: refresh codex prompt snapshots for message media aliases
---------
Co-authored-by: scotthuang <scotthuang@tencent.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
355 lines
11 KiB
TypeScript
355 lines
11 KiB
TypeScript
import * as fs from "node:fs/promises";
|
|
import { Command } from "commander";
|
|
import { afterAll, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
|
|
import { IOS_NODE, createIosNodeListResponse } from "./program.nodes-test-helpers.js";
|
|
import { callGateway, installBaseProgramMocks, runtime } from "./program.test-mocks.js";
|
|
|
|
installBaseProgramMocks();
|
|
let registerNodesCli: typeof import("./nodes-cli.js").registerNodesCli;
|
|
|
|
/**
 * Returns the first argument of the first recorded `runtime.log` call.
 *
 * @returns The logged value, guaranteed to be a string.
 * @throws Error when that argument is not a string, which includes the case
 *   where `runtime.log` has not been called at all (the value is `undefined`).
 */
function getFirstRuntimeLogLine(): string {
  const loggedValue = runtime.log.mock.calls[0]?.[0];
  if (typeof loggedValue === "string") {
    return loggedValue;
  }
  throw new Error(`Expected runtime.log first arg to be string, got ${typeof loggedValue}`);
}
|
|
|
|
/**
 * Treats the first `runtime.log` line as a file path, asserts on the file's
 * UTF-8 content, and deletes the file afterwards whether or not the content
 * assertion passed.
 *
 * @param params.expectedContent - Content the file must contain; defaults to `"hi"`.
 * @param params.expectedPathPattern - When given, the trimmed path must match it.
 * @returns The trimmed path taken from the log line.
 */
async function expectLoggedSingleMediaFile(params?: {
  expectedContent?: string;
  expectedPathPattern?: RegExp;
}): Promise<string> {
  const mediaPath = getFirstRuntimeLogLine().trim();

  const pathPattern = params?.expectedPathPattern;
  if (pathPattern) {
    expect(mediaPath).toMatch(pathPattern);
  }

  const wantedContent = params?.expectedContent ?? "hi";
  try {
    const pendingRead = fs.readFile(mediaPath, "utf8");
    await expect(pendingRead).resolves.toBe(wantedContent);
  } finally {
    // Best-effort cleanup: unlink errors are deliberately ignored.
    await fs.unlink(mediaPath).catch(() => {});
  }

  return mediaPath;
}
|
|
|
|
/**
 * Installs a `callGateway` mock implementation keyed on the request's `method`:
 *
 * - `"node.list"` resolves to `createIosNodeListResponse()`.
 * - `"node.invoke"` resolves to a successful invoke result that echoes
 *   `command` and `payload`, but only when `command` is truthy.
 * - Anything else (including `"node.invoke"` without a `command`) resolves to
 *   `{ ok: true }`.
 *
 * @param command - Command name to echo back from `node.invoke`.
 * @param payload - Payload to return alongside the command.
 */
function mockNodeGateway(command?: string, payload?: Record<string, unknown>) {
  callGateway.mockImplementation(async (...args: unknown[]) => {
    const { method } = (args[0] ?? {}) as { method?: string };

    switch (method) {
      case "node.list":
        return createIosNodeListResponse();
      case "node.invoke":
        if (command) {
          return { ok: true, nodeId: IOS_NODE.nodeId, command, payload };
        }
        break;
      default:
        break;
    }

    return { ok: true };
  });
}
|
|
|
|
/**
 * Collects every recorded `callGateway` call whose first argument has
 * `method === "node.invoke"`, in call order.
 *
 * @returns One entry per matching call with:
 *   - `method`: the request method (always `"node.invoke"` here),
 *   - `params`: the request's `params` object, or `{}` when absent,
 *   - `commandParams`: the nested `params.params` object, or `{}` when absent.
 */
function nodeInvokeCalls(): Array<{
  method?: unknown;
  params: Record<string, unknown>;
  commandParams: Record<string, unknown>;
}> {
  const invokeCalls: ReturnType<typeof nodeInvokeCalls> = [];

  for (const recorded of callGateway.mock.calls) {
    const request = recorded[0] as { method?: unknown; params?: Record<string, unknown> };
    if (request.method !== "node.invoke") {
      continue;
    }
    const params = request.params ?? {};
    const commandParams = (params.params ?? {}) as Record<string, unknown>;
    invokeCalls.push({ method: request.method, params, commandParams });
  }

  return invokeCalls;
}
|
|
|
|
/**
 * Returns the most recent `node.invoke` gateway call as reported by
 * `nodeInvokeCalls()`.
 *
 * @throws Error when no `node.invoke` call has been recorded.
 */
function latestNodeInvokeCall() {
  const invokeCalls = nodeInvokeCalls();
  const latest = invokeCalls[invokeCalls.length - 1];
  if (latest) {
    return latest;
  }
  throw new Error("expected node.invoke gateway call");
}
|
|
|
|
/**
 * Asserts that `value` is a string in 8-4-4-4-12 hexadecimal UUID form
 * (case-insensitive). The pattern restricts the version digit to 1-5 and the
 * variant digit to 8, 9, a or b.
 *
 * @param value - The value under test; non-strings fail the assertion.
 */
function expectUuidString(value: unknown) {
  const uuidPattern = /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i;
  expect(value).toEqual(expect.stringMatching(uuidPattern));
}
|
|
|
|
describe("cli program (nodes media)", () => {
|
|
let program: Command;
|
|
|
|
// One-time setup: load the nodes CLI module dynamically (presumably so it is
// evaluated after installBaseProgramMocks() has run — confirm), then build the
// shared Commander program used by runNodesCommand.
beforeAll(async () => {
  const nodesCliModule = await import("./nodes-cli.js");
  registerNodesCli = nodesCliModule.registerNodesCli;

  program = new Command();
  // exitOverride() is applied so Commander exits are observable as thrown errors in tests.
  program.exitOverride();
  await registerNodesCli(program);
});
|
|
|
|
/**
 * Runs the shared CLI program with user-style arguments.
 *
 * `runtime.log` call history is cleared first, so helpers that read the first
 * logged line only see output produced by this invocation.
 *
 * @param argv - Arguments as a user would type them (`from: "user"`).
 */
async function runNodesCommand(argv: string[]) {
  runtime.log.mockClear();
  const parseOptions = { from: "user" } as const;
  await program.parseAsync(argv, parseOptions);
}
|
|
|
|
/**
 * Runs `args` against a freshly built program and asserts that the run fails:
 * parsing must reject with an error matching /exit/i, and `runtime.error` must
 * have been called with a string matching `expectedError`.
 *
 * @param args - User-style CLI arguments expected to be rejected.
 * @param expectedError - Pattern the reported error message must match.
 */
async function expectCameraSnapParseFailure(args: string[], expectedError: RegExp) {
  // No command is configured, so node.invoke calls fall through to { ok: true }.
  mockNodeGateway();

  // A dedicated program instance is used here instead of the shared one.
  const isolatedProgram = new Command();
  isolatedProgram.exitOverride();
  await registerNodesCli(isolatedProgram);
  runtime.error.mockClear();

  const pendingParse = isolatedProgram.parseAsync(args, { from: "user" });
  await expect(pendingParse).rejects.toThrow(/exit/i);

  const expectedMessage = expect.stringMatching(expectedError);
  expect(runtime.error).toHaveBeenCalledWith(expectedMessage);
}
|
|
|
|
/**
 * Mocks a `node.invoke` response for `command`, runs the CLI, and asserts that
 * the logged media file matches `expectedPathPattern` and contains
 * `"url-content"`.
 *
 * @param params.command - Camera command the mocked gateway should answer.
 * @param params.payload - Payload returned by the mocked `node.invoke` call.
 * @param params.argv - User-style CLI arguments to run.
 * @param params.expectedPathPattern - Pattern the logged file path must match.
 */
async function runAndExpectUrlPayloadMediaFile(params: {
  command: "camera.snap" | "camera.clip";
  payload: Record<string, unknown>;
  argv: string[];
  expectedPathPattern: RegExp;
}) {
  const { command, payload, argv, expectedPathPattern } = params;

  mockNodeGateway(command, payload);
  await runNodesCommand(argv);

  await expectLoggedSingleMediaFile({
    expectedContent: "url-content",
    expectedPathPattern,
  });
}
|
|
|
|
// Reset recorded mock calls before each test so call-history assertions
// (nodeInvokeCalls, runtime.error checks) only see the current test's activity.
beforeEach(() => {
  vi.clearAllMocks();
});
|
|
|
|
// With no --facing argument the snap command is expected to invoke the node
// for both cameras and log two file paths in a single log call, one per line.
it("runs nodes camera snap and prints two MEDIA paths", async () => {
  mockNodeGateway("camera.snap", { format: "jpg", base64: "aGk=", width: 1, height: 1 });

  await runNodesCommand(["nodes", "camera", "snap", "--node", "ios-node"]);

  const facings = nodeInvokeCalls()
    .map((call) => call.commandParams.facing)
    // Actually verify the value is a string: a truthiness check would let a
    // truthy non-string through the `facing is string` predicate and crash in
    // localeCompare instead of failing the assertion below.
    .filter((facing): facing is string => typeof facing === "string" && facing.length > 0)
    .toSorted((a, b) => a.localeCompare(b));
  expect(facings).toEqual(["back", "front"]);

  const mediaPaths = getFirstRuntimeLogLine()
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  // All assertions on the written files live inside the try block so the
  // files are removed even when one of the assertions fails.
  try {
    expect(mediaPaths).toHaveLength(2);
    for (const mediaPath of mediaPaths) {
      expect(mediaPath).toContain("openclaw-camera-snap-");
      // Content bytes are covered by single-output camera/file tests; here we
      // only verify dual snapshot behavior and that both paths were written.
      expect((await fs.stat(mediaPath)).isFile()).toBe(true);
    }
  } finally {
    // Best-effort cleanup: unlink errors are deliberately ignored.
    await Promise.all(mediaPaths.map((p) => fs.unlink(p).catch(() => {})));
  }
});
|
|
|
|
// Default clip flow: a base64 payload comes back from the node and a single
// .mp4 path is logged. "aGk=" is base64 for "hi", the default expected content.
it("runs nodes camera clip and prints one MEDIA path", async () => {
  const clipPayload = { format: "mp4", base64: "aGk=", durationMs: 3000, hasAudio: true };
  mockNodeGateway("camera.clip", clipPayload);

  await runNodesCommand(["nodes", "camera", "clip", "--node", "ios-node", "--duration", "3000"]);

  const { method, params, commandParams } = latestNodeInvokeCall();

  // Gateway request envelope.
  expect(method).toBe("node.invoke");
  expect(params.nodeId).toBe("ios-node");
  expect(params.command).toBe("camera.clip");
  expect(params.timeoutMs).toBe(90000);
  expectUuidString(params.idempotencyKey);

  // Command parameters; only the duration was given on the command line.
  expect(commandParams.facing).toBe("front");
  expect(commandParams.durationMs).toBe(3000);
  expect(commandParams.includeAudio).toBe(true);
  expect(commandParams.format).toBe("mp4");

  const expectedPathPattern = /openclaw-camera-clip-front-.*\.mp4$/;
  await expectLoggedSingleMediaFile({ expectedPathPattern });
});
|
|
|
|
// Every snap option given on the command line must reach the node.invoke
// command parameters, with numeric options converted to numbers.
it("runs nodes camera snap with facing front and passes params", async () => {
  mockNodeGateway("camera.snap", { format: "jpg", base64: "aGk=", width: 1, height: 1 });

  const argv = [
    "nodes",
    "camera",
    "snap",
    "--node",
    "ios-node",
    "--facing",
    "front",
    "--max-width",
    "640",
    "--quality",
    "0.8",
    "--delay-ms",
    "2000",
    "--device-id",
    "cam-123",
  ];
  await runNodesCommand(argv);

  const { method, params, commandParams } = latestNodeInvokeCall();

  // Gateway request envelope.
  expect(method).toBe("node.invoke");
  expect(params.nodeId).toBe("ios-node");
  expect(params.command).toBe("camera.snap");
  expect(params.timeoutMs).toBe(20000);
  expectUuidString(params.idempotencyKey);

  // Forwarded command-line options.
  expect(commandParams.facing).toBe("front");
  expect(commandParams.maxWidth).toBe(640);
  expect(commandParams.quality).toBe(0.8);
  expect(commandParams.delayMs).toBe(2000);
  expect(commandParams.deviceId).toBe("cam-123");

  await expectLoggedSingleMediaFile();
});
|
|
|
|
// --no-audio must turn into includeAudio: false, and --device-id must be
// forwarded unchanged.
it("runs nodes camera clip with --no-audio", async () => {
  const clipPayload = { format: "mp4", base64: "aGk=", durationMs: 3000, hasAudio: false };
  mockNodeGateway("camera.clip", clipPayload);

  const argv = [
    "nodes",
    "camera",
    "clip",
    "--node",
    "ios-node",
    "--duration",
    "3000",
    "--no-audio",
    "--device-id",
    "cam-123",
  ];
  await runNodesCommand(argv);

  const { method, params, commandParams } = latestNodeInvokeCall();

  // Gateway request envelope.
  expect(method).toBe("node.invoke");
  expect(params.nodeId).toBe("ios-node");
  expect(params.command).toBe("camera.clip");
  expect(params.timeoutMs).toBe(90000);
  expectUuidString(params.idempotencyKey);

  // Forwarded command-line options.
  expect(commandParams.includeAudio).toBe(false);
  expect(commandParams.deviceId).toBe("cam-123");

  await expectLoggedSingleMediaFile();
});
|
|
|
|
// A human-readable duration ("10s") must be converted to milliseconds before
// it is sent to the node.
it("runs nodes camera clip with human duration (10s)", async () => {
  mockNodeGateway("camera.clip", {
    format: "mp4",
    base64: "aGk=",
    durationMs: 10_000,
    hasAudio: true,
  });

  await runNodesCommand(["nodes", "camera", "clip", "--node", "ios-node", "--duration", "10s"]);

  const invoke = latestNodeInvokeCall();
  expect(invoke.method).toBe("node.invoke");
  expect(invoke.params.nodeId).toBe("ios-node");
  expect(invoke.params.command).toBe("camera.clip");
  expect(invoke.commandParams.durationMs).toBe(10_000);

  // This run uses the same base64 payload as the sibling clip tests, which
  // show the command logging a written file path. Consume that path here too
  // so the file is verified and deleted instead of being left on disk.
  await expectLoggedSingleMediaFile();
});
|
|
|
|
// An unrecognised --facing value must be rejected with an "invalid facing" error.
it("fails nodes camera snap on invalid facing", async () => {
  const argv = ["nodes", "camera", "snap", "--node", "ios-node", "--facing", "nope"];
  await expectCameraSnapParseFailure(argv, /invalid facing/i);
});
|
|
|
|
// --facing both cannot be combined with an explicit --device-id.
it("fails nodes camera snap when --facing both and --device-id are combined", async () => {
  const argv = [
    "nodes",
    "camera",
    "snap",
    "--node",
    "ios-node",
    "--facing",
    "both",
    "--device-id",
    "cam-123",
  ];
  const expectedError = /facing=both is not allowed when --device-id is set/i;

  await expectCameraSnapParseFailure(argv, expectedError);
});
|
|
|
|
// Payloads that carry a `url` instead of inline base64: global fetch is
// stubbed for the whole suite and restored afterwards.
describe("URL-based payloads", () => {
  let originalFetch: typeof globalThis.fetch;

  beforeAll(() => {
    originalFetch = globalThis.fetch;

    // Every call gets a fresh 200 response; "url-content" is 11 characters,
    // which matches the advertised content-length.
    const stubbedFetch = vi.fn(async () => {
      return new Response("url-content", {
        status: 200,
        headers: { "content-length": "11" },
      });
    });
    globalThis.fetch = stubbedFetch as unknown as typeof globalThis.fetch;
  });

  afterAll(() => {
    globalThis.fetch = originalFetch;
  });

  const urlPayloadCases = [
    {
      label: "runs nodes camera snap with url payload",
      command: "camera.snap" as const,
      payload: {
        format: "jpg",
        url: `https://${IOS_NODE.remoteIp}/photo.jpg`,
        width: 640,
        height: 480,
      },
      argv: ["nodes", "camera", "snap", "--node", "ios-node", "--facing", "front"],
      expectedPathPattern: /openclaw-camera-snap-front-.*\.jpg$/,
    },
    {
      label: "runs nodes camera clip with url payload",
      command: "camera.clip" as const,
      payload: {
        format: "mp4",
        url: `https://${IOS_NODE.remoteIp}/clip.mp4`,
        durationMs: 5000,
        hasAudio: true,
      },
      argv: ["nodes", "camera", "clip", "--node", "ios-node", "--duration", "5000"],
      expectedPathPattern: /openclaw-camera-clip-front-.*\.mp4$/,
    },
  ];

  it.each(urlPayloadCases)("$label", async (testCase) => {
    const { command, payload, argv, expectedPathPattern } = testCase;
    await runAndExpectUrlPayloadMediaFile({ command, payload, argv, expectedPathPattern });
  });
});
|
|
});
|