feat(agents): track video generation tasks

This commit is contained in:
Peter Steinberger
2026-04-06 00:09:48 +01:00
parent 6d34a1c814
commit 40c499d489
5 changed files with 328 additions and 103 deletions

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
- Agents/cache: stabilize cache-relevant system prompt fingerprints by normalizing equivalent structured prompt whitespace, line endings, hook-added system context, and runtime capability ordering so semantically unchanged prompts reuse KV/cache more reliably. Thanks @vincentkoc.
- Agents/tool prompts: remove the duplicate in-band tool inventory from agent system prompts so tool-calling models rely on the structured tool definitions as the single source of truth, improving prompt stability and reducing stale tool guidance.
- Tools/video generation: add bundled xAI (`grok-imagine-video`) and Alibaba Model Studio Wan video providers, plus live-test/default model wiring for both.
- Agents/video generation: register `video_generate` runs in the task ledger with task/run ids and lifecycle updates so long-running generations can be tracked more reliably.
- Providers/CLI: remove bundled CLI text-provider backends and the `agents.defaults.cliBackends` surface, while keeping ACP harness sessions and Gemini media understanding on the native bundled providers.
- Matrix/exec approvals: clarify unavailable-approval replies so Matrix no longer claims chat approvals are unsupported when native exec approvals are merely unconfigured. (#61424) Thanks @gumadeiras.
- Docs/IRC: replace public IRC hostname examples with `irc.example.com` and recommend private servers for bot coordination while listing common public networks for intentional use.
@@ -108,6 +109,7 @@ Docs: https://docs.openclaw.ai
- Exec approvals: remove heuristic command-obfuscation gating from host exec so gateway and node runs rely on explicit policy, allowlist, and strict inline-eval rules only.
- Agents/tool results: cap live tool-result persistence and overflow-recovery truncation at 40k characters so oversized tool output stays bounded without discarding recent context entirely.
- Discord/video replies: split text-plus-video deliveries into a text reply followed by a media-only send, and let live provider auth checks honor manifest-declared API key env vars like `MODELSTUDIO_API_KEY`.
- Providers/fal video: switch long-running fal video generation to the queue-backed submit/status/result flow, and accept `FAL_API_KEY` as a compatibility alias for the canonical `FAL_KEY`.
- Config/All Settings: keep the raw config view intact when sensitive fields are blank instead of corrupting or dropping the rendered snapshot. (#28214) Thanks @solodmd.
- Plugin SDK/facades: back-fill bundled plugin facade sentinels before plugin-id tracking re-enters config loading, so CLI/provider startup no longer crashes with `shouldNormalizeGoogleProviderConfig is not a function` or other empty-facade reads during bundled plugin re-entry. Thanks @adam91holt.
- Plugins/facades: back-fill facade sentinels before tracked-plugin resolution re-enters config loading, so facade exports stay defined during circular provider normalization. (#61180) Thanks @adam91holt.

View File

@@ -15,6 +15,10 @@ The `video_generate` tool lets the agent create videos using your configured pro
The tool only appears when at least one video-generation provider is available. If you don't see `video_generate` in your agent's tools, configure `agents.defaults.videoGenerationModel` or set up a provider API key.
</Note>
<Note>
OpenClaw now records `video_generate` runs in the task ledger when the agent has a session key, so long-running generations can be tracked with task/run ids even though the tool still waits for completion in the current turn.
</Note>
## Quick start
1. Set an API key for at least one provider (for example `OPENAI_API_KEY`, `GEMINI_API_KEY`, `MODELSTUDIO_API_KEY`, or `QWEN_API_KEY`).
@@ -115,6 +119,7 @@ If a provider fails, the next candidate is tried automatically. If all fail, the
- OpenAI uses the native video endpoint and currently defaults to `sora-2`.
- Qwen supports image/video references, but the upstream DashScope video endpoint currently requires remote `http(s)` URLs for those references.
- xAI uses the native xAI video API and supports text-to-video, image-to-video, and remote video edit/extend flows.
- fal uses the queue-backed fal video flow for long-running jobs instead of a single blocking inference request.
## Qwen reference inputs

View File

@@ -164,6 +164,8 @@ export function createOpenClawTools(
const videoGenerateTool = createVideoGenerateTool({
config: options?.config,
agentDir: options?.agentDir,
agentSessionKey: options?.agentSessionKey,
requesterOrigin: deliveryContext ?? undefined,
workspaceDir,
sandbox,
fsPolicy: options?.fsPolicy,

View File

@@ -4,6 +4,14 @@ import * as mediaStore from "../../media/store.js";
import * as videoGenerationRuntime from "../../video-generation/runtime.js";
import { createVideoGenerateTool } from "./video-generate-tool.js";
// Mock fns for the task-ledger API. `vi.hoisted` is required so these exist
// before the `vi.mock` factory below runs (mock factories are hoisted above
// imports and would otherwise capture an uninitialized binding).
const taskExecutorMocks = vi.hoisted(() => ({
createRunningTaskRun: vi.fn(),
completeTaskRunByRunId: vi.fn(),
failTaskRunByRunId: vi.fn(),
}));
// Swap in the mocks so tests can observe ledger lifecycle calls made by the tool.
vi.mock("../../tasks/task-executor.js", () => taskExecutorMocks);
/** Casts an arbitrary test fixture literal to OpenClawConfig (runtime identity). */
function asConfig(value: unknown): OpenClawConfig {
  const cfg = value as OpenClawConfig;
  return cfg;
}
@@ -12,6 +20,9 @@ describe("createVideoGenerateTool", () => {
beforeEach(() => {
  // Start every test from pristine spies and ledger mocks.
  vi.restoreAllMocks();
  vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);
  const ledgerMocks = [
    taskExecutorMocks.createRunningTaskRun,
    taskExecutorMocks.completeTaskRunByRunId,
    taskExecutorMocks.failTaskRunByRunId,
  ];
  for (const ledgerMock of ledgerMocks) {
    ledgerMock.mockReset();
  }
});
afterEach(() => {
@@ -39,6 +50,19 @@ describe("createVideoGenerateTool", () => {
});
it("generates videos, saves them, and emits MEDIA paths", async () => {
taskExecutorMocks.createRunningTaskRun.mockReturnValue({
taskId: "task-123",
runtime: "cli",
requesterSessionKey: "agent:main:discord:direct:123",
ownerKey: "agent:main:discord:direct:123",
scopeKind: "session",
task: "friendly lobster surfing",
status: "running",
deliveryStatus: "not_applicable",
notifyPolicy: "silent",
createdAt: Date.now(),
});
taskExecutorMocks.completeTaskRunByRunId.mockReturnValue(undefined);
vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
provider: "qwen",
model: "wan2.6-t2v",
@@ -67,6 +91,11 @@ describe("createVideoGenerateTool", () => {
},
},
}),
agentSessionKey: "agent:main:discord:direct:123",
requesterOrigin: {
channel: "discord",
to: "channel:1",
},
});
expect(tool).not.toBeNull();
if (!tool) {
@@ -82,12 +111,71 @@ describe("createVideoGenerateTool", () => {
provider: "qwen",
model: "wan2.6-t2v",
count: 1,
task: {
taskId: "task-123",
},
media: {
mediaUrls: ["/tmp/generated-lobster.mp4"],
},
paths: ["/tmp/generated-lobster.mp4"],
metadata: { taskId: "task-1" },
});
expect(taskExecutorMocks.createRunningTaskRun).toHaveBeenCalledWith(
expect.objectContaining({
runtime: "cli",
requesterSessionKey: "agent:main:discord:direct:123",
ownerKey: "agent:main:discord:direct:123",
label: "Video generation",
task: "friendly lobster surfing",
}),
);
expect(taskExecutorMocks.completeTaskRunByRunId).toHaveBeenCalledWith(
expect.objectContaining({
runId: expect.stringMatching(/^tool:video_generate:/),
}),
);
});
it("marks the task failed when provider generation throws", async () => {
  // A provider rejection must surface to the caller AND be mirrored into the ledger.
  vi.spyOn(videoGenerationRuntime, "generateVideo").mockRejectedValue(new Error("queue boom"));
  const ledgerRun = {
    taskId: "task-fail",
    runtime: "cli",
    requesterSessionKey: "agent:main:discord:direct:123",
    ownerKey: "agent:main:discord:direct:123",
    scopeKind: "session",
    task: "broken lobster",
    status: "running",
    deliveryStatus: "not_applicable",
    notifyPolicy: "silent",
    createdAt: Date.now(),
  };
  taskExecutorMocks.createRunningTaskRun.mockReturnValue(ledgerRun);
  taskExecutorMocks.failTaskRunByRunId.mockReturnValue(undefined);
  const videoTool = createVideoGenerateTool({
    config: asConfig({
      agents: {
        defaults: {
          videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
        },
      },
    }),
    agentSessionKey: "agent:main:discord:direct:123",
  });
  expect(videoTool).not.toBeNull();
  if (!videoTool) {
    throw new Error("expected video_generate tool");
  }
  await expect(videoTool.execute("call-2", { prompt: "broken lobster" })).rejects.toThrow(
    "queue boom",
  );
  expect(taskExecutorMocks.failTaskRunByRunId).toHaveBeenCalledWith(
    expect.objectContaining({
      runId: expect.stringMatching(/^tool:video_generate:/),
      error: "queue boom",
    }),
  );
});
it("shows duration normalization details from runtime metadata", async () => {

View File

@@ -1,11 +1,19 @@
import crypto from "node:crypto";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { loadConfig } from "../../config/config.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { readSnakeCaseParamRaw } from "../../param-key.js";
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
import {
completeTaskRunByRunId,
createRunningTaskRun,
failTaskRunByRunId,
} from "../../tasks/task-executor.js";
import { resolveUserPath } from "../../utils.js";
import type { DeliveryContext } from "../../utils/delivery-context.js";
import { resolveVideoGenerationSupportedDurations } from "../../video-generation/duration-support.js";
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
import {
@@ -45,6 +53,7 @@ import {
type ToolFsPolicy,
} from "./tool-runtime.helpers.js";
const log = createSubsystemLogger("agents/tools/video-generate");
const MAX_INPUT_IMAGES = 5;
const MAX_INPUT_VIDEOS = 4;
const SUPPORTED_ASPECT_RATIOS = new Set([
@@ -398,6 +407,94 @@ type VideoGenerateSandboxConfig = {
bridge: SandboxFsBridge;
};
// Identifiers linking an in-flight video generation to its task-ledger record.
type VideoGenerationTaskHandle = {
taskId: string; // ledger task id returned by createRunningTaskRun
runId: string; // locally generated `tool:video_generate:<uuid>` run id
};
/**
 * Registers a running `video_generate` invocation in the task ledger.
 *
 * Returns the ledger handle (task id plus generated run id), or null when the
 * agent has no session key or when ledger registration throws — tracking is
 * best-effort and must never block the actual generation.
 */
function createVideoGenerationTaskRun(params: {
  sessionKey?: string;
  requesterOrigin?: DeliveryContext;
  prompt: string;
  providerId?: string;
}): VideoGenerationTaskHandle | null {
  const trimmedKey = params.sessionKey?.trim();
  if (!trimmedKey) {
    // No session key means no ledger scope to attach the run to.
    return null;
  }
  const runId = `tool:video_generate:${crypto.randomUUID()}`;
  const sourceId = params.providerId
    ? `video_generate:${params.providerId}`
    : "video_generate";
  try {
    const ledgerTask = createRunningTaskRun({
      runtime: "cli",
      sourceId,
      requesterSessionKey: trimmedKey,
      ownerKey: trimmedKey,
      scopeKind: "session",
      requesterOrigin: params.requesterOrigin,
      childSessionKey: trimmedKey,
      runId,
      label: "Video generation",
      task: params.prompt,
      deliveryStatus: "not_applicable",
      notifyPolicy: "silent",
      startedAt: Date.now(),
      lastEventAt: Date.now(),
      progressSummary: "Generating video",
    });
    return { taskId: ledgerTask.taskId, runId };
  } catch (error) {
    // Best-effort tracking: log the failure and let generation proceed untracked.
    log.warn("Failed to create video generation task ledger record", {
      sessionKey: trimmedKey,
      providerId: params.providerId,
      error,
    });
    return null;
  }
}
/**
 * Marks a tracked video generation run as completed in the task ledger.
 * No-op when the run was never registered (null handle).
 */
function completeVideoGenerationTaskRun(params: {
  handle: VideoGenerationTaskHandle | null;
  provider: string;
  model: string;
  count: number;
  paths: string[];
}) {
  const { handle, provider, model, count, paths } = params;
  if (!handle) {
    return;
  }
  const finishedAt = Date.now();
  const singular = count === 1;
  // For a single video point at its path; otherwise summarize as "<n> files".
  const target = singular ? paths[0] : `${count} files`;
  const targetSuffix = target ? ` -> ${target}` : "";
  completeTaskRunByRunId({
    runId: handle.runId,
    runtime: "cli",
    endedAt: finishedAt,
    lastEventAt: finishedAt,
    terminalSummary: `Generated ${count} video${singular ? "" : "s"} with ${provider}/${model}${targetSuffix}.`,
  });
}
/**
 * Marks a tracked video generation run as failed in the task ledger.
 * No-op when the run was never registered (null handle).
 */
function failVideoGenerationTaskRun(params: {
  handle: VideoGenerationTaskHandle | null;
  error: unknown;
}) {
  const { handle, error } = params;
  if (!handle) {
    return;
  }
  // Normalize non-Error throwables into a readable message.
  const message = error instanceof Error ? error.message : String(error);
  const finishedAt = Date.now();
  failTaskRunByRunId({
    runId: handle.runId,
    runtime: "cli",
    endedAt: finishedAt,
    lastEventAt: finishedAt,
    error: message,
    terminalSummary: message,
  });
}
async function loadReferenceAssets(params: {
inputs: string[];
expectedKind: "image" | "video";
@@ -516,6 +613,8 @@ async function loadReferenceAssets(params: {
export function createVideoGenerateTool(options?: {
config?: OpenClawConfig;
agentDir?: string;
agentSessionKey?: string;
requesterOrigin?: DeliveryContext;
workspaceDir?: string;
sandbox?: VideoGenerateSandboxConfig;
fsPolicy?: ToolFsPolicy;
@@ -668,118 +767,147 @@ export function createVideoGenerateTool(options?: {
audio,
watermark,
});
const result = await generateVideo({
cfg: effectiveCfg,
const taskHandle = createVideoGenerationTaskRun({
sessionKey: options?.agentSessionKey,
requesterOrigin: options?.requesterOrigin,
prompt,
agentDir: options?.agentDir,
modelOverride: model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
providerId: selectedProvider?.id,
});
const savedVideos = await Promise.all(
result.videos.map((video) =>
saveMediaBuffer(
video.buffer,
video.mimeType,
"tool-video-generation",
undefined,
filename || video.fileName,
),
),
);
const requestedDurationSeconds =
typeof result.metadata?.requestedDurationSeconds === "number" &&
Number.isFinite(result.metadata.requestedDurationSeconds)
? result.metadata.requestedDurationSeconds
: durationSeconds;
const normalizedDurationSeconds =
typeof result.metadata?.normalizedDurationSeconds === "number" &&
Number.isFinite(result.metadata.normalizedDurationSeconds)
? result.metadata.normalizedDurationSeconds
: requestedDurationSeconds;
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
? result.metadata.supportedDurationSeconds.filter(
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
)
: undefined;
const lines = [
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
: null,
...savedVideos.map((video) => `MEDIA:${video.path}`),
].filter((entry): entry is string => Boolean(entry));
return {
content: [{ type: "text", text: lines.join("\n") }],
details: {
try {
const result = await generateVideo({
cfg: effectiveCfg,
prompt,
agentDir: options?.agentDir,
modelOverride: model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
});
const savedVideos = await Promise.all(
result.videos.map((video) =>
saveMediaBuffer(
video.buffer,
video.mimeType,
"tool-video-generation",
undefined,
filename || video.fileName,
),
),
);
completeVideoGenerationTaskRun({
handle: taskHandle,
provider: result.provider,
model: result.model,
count: savedVideos.length,
media: {
mediaUrls: savedVideos.map((video) => video.path),
},
paths: savedVideos.map((video) => video.path),
...(loadedReferenceImages.length === 1
? {
image: loadedReferenceImages[0]?.resolvedInput,
...(loadedReferenceImages[0]?.rewrittenFrom
? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
: {}),
}
: loadedReferenceImages.length > 1
? {
images: loadedReferenceImages.map((entry) => ({
image: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(loadedReferenceVideos.length === 1
? {
video: loadedReferenceVideos[0]?.resolvedInput,
...(loadedReferenceVideos[0]?.rewrittenFrom
? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
: {}),
}
: loadedReferenceVideos.length > 1
? {
videos: loadedReferenceVideos.map((entry) => ({
video: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(size ? { size } : {}),
...(aspectRatio ? { aspectRatio } : {}),
...(resolution ? { resolution } : {}),
...(typeof normalizedDurationSeconds === "number"
? { durationSeconds: normalizedDurationSeconds }
: {}),
...(typeof requestedDurationSeconds === "number" &&
});
const requestedDurationSeconds =
typeof result.metadata?.requestedDurationSeconds === "number" &&
Number.isFinite(result.metadata.requestedDurationSeconds)
? result.metadata.requestedDurationSeconds
: durationSeconds;
const normalizedDurationSeconds =
typeof result.metadata?.normalizedDurationSeconds === "number" &&
Number.isFinite(result.metadata.normalizedDurationSeconds)
? result.metadata.normalizedDurationSeconds
: requestedDurationSeconds;
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
? result.metadata.supportedDurationSeconds.filter(
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
)
: undefined;
const lines = [
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? { requestedDurationSeconds }
: {}),
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
? { supportedDurationSeconds }
: {}),
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
attempts: result.attempts,
metadata: result.metadata,
},
};
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
: null,
...savedVideos.map((video) => `MEDIA:${video.path}`),
].filter((entry): entry is string => Boolean(entry));
return {
content: [{ type: "text", text: lines.join("\n") }],
details: {
provider: result.provider,
model: result.model,
count: savedVideos.length,
media: {
mediaUrls: savedVideos.map((video) => video.path),
},
paths: savedVideos.map((video) => video.path),
...(taskHandle
? {
task: {
taskId: taskHandle.taskId,
runId: taskHandle.runId,
},
}
: {}),
...(loadedReferenceImages.length === 1
? {
image: loadedReferenceImages[0]?.resolvedInput,
...(loadedReferenceImages[0]?.rewrittenFrom
? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
: {}),
}
: loadedReferenceImages.length > 1
? {
images: loadedReferenceImages.map((entry) => ({
image: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(loadedReferenceVideos.length === 1
? {
video: loadedReferenceVideos[0]?.resolvedInput,
...(loadedReferenceVideos[0]?.rewrittenFrom
? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
: {}),
}
: loadedReferenceVideos.length > 1
? {
videos: loadedReferenceVideos.map((entry) => ({
video: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(size ? { size } : {}),
...(aspectRatio ? { aspectRatio } : {}),
...(resolution ? { resolution } : {}),
...(typeof normalizedDurationSeconds === "number"
? { durationSeconds: normalizedDurationSeconds }
: {}),
...(typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? { requestedDurationSeconds }
: {}),
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
? { supportedDurationSeconds }
: {}),
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
attempts: result.attempts,
metadata: result.metadata,
},
};
} catch (error) {
failVideoGenerationTaskRun({
handle: taskHandle,
error,
});
throw error;
}
},
};
}