Files
openclaw/src/agents/tools/video-generate-tool.ts
2026-04-11 03:08:30 -07:00

1078 lines
38 KiB
TypeScript

import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { loadConfig } from "../../config/config.js";
import { formatErrorMessage } from "../../infra/errors.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { readSnakeCaseParamRaw } from "../../param-key.js";
import { resolveUserPath } from "../../utils.js";
import type { DeliveryContext } from "../../utils/delivery-context.js";
import {
resolveVideoGenerationMode,
resolveVideoGenerationModeCapabilities,
} from "../../video-generation/capabilities.js";
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
import {
generateVideo,
listRuntimeVideoGenerationProviders,
} from "../../video-generation/runtime.js";
import type {
VideoGenerationIgnoredOverride,
VideoGenerationProvider,
VideoGenerationResolution,
VideoGenerationSourceAsset,
} from "../../video-generation/types.js";
import { ToolInputError, readNumberParam, readStringParam } from "./common.js";
import { decodeDataUrl } from "./image-tool.helpers.js";
import {
applyVideoGenerationModelConfigDefaults,
buildMediaReferenceDetails,
buildTaskRunDetails,
normalizeMediaReferenceInputs,
readBooleanToolParam,
resolveCapabilityModelConfigForTool,
resolveGenerateAction,
resolveMediaToolLocalRoots,
resolveSelectedCapabilityProvider,
} from "./media-tool-shared.js";
import { type ToolModelConfig } from "./model-config.helpers.js";
import {
createSandboxBridgeReadFile,
resolveSandboxedBridgeMediaPath,
type AnyAgentTool,
type SandboxFsBridge,
type ToolFsPolicy,
} from "./tool-runtime.helpers.js";
import {
completeVideoGenerationTaskRun,
createVideoGenerationTaskRun,
failVideoGenerationTaskRun,
recordVideoGenerationTaskProgress,
type VideoGenerationTaskHandle,
wakeVideoGenerationTaskCompletion,
} from "./video-generate-background.js";
import {
createVideoGenerateDuplicateGuardResult,
createVideoGenerateListActionResult,
createVideoGenerateStatusActionResult,
} from "./video-generate-tool.actions.js";
const log = createSubsystemLogger("agents/tools/video-generate");
const MAX_INPUT_IMAGES = 9;
const MAX_INPUT_VIDEOS = 4;
const MAX_INPUT_AUDIOS = 3;
const SUPPORTED_ASPECT_RATIOS = new Set([
"1:1",
"2:3",
"3:2",
"3:4",
"4:3",
"4:5",
"5:4",
"9:16",
"16:9",
"21:9",
// Provider-specific sentinel: accepted at the tool boundary, then forwarded
// to the active provider only if that provider declares "adaptive" in its
// capabilities.aspectRatios list. Providers that do not declare it see the
// value pushed into `ignoredOverrides` in the normalization layer so the
// tool surfaces a user-visible "ignored override" warning rather than
// silently dropping the request. Seedance uses this to auto-detect the
// ratio from input image dimensions.
"adaptive",
]);
const VideoGenerateToolSchema = Type.Object({
action: Type.Optional(
Type.String({
description:
'Optional action: "generate" (default), "status" to inspect the active session task, or "list" to inspect available providers/models.',
}),
),
prompt: Type.Optional(Type.String({ description: "Video generation prompt." })),
image: Type.Optional(
Type.String({
description: "Optional single reference image path or URL.",
}),
),
images: Type.Optional(
Type.Array(Type.String(), {
description: `Optional reference images (up to ${MAX_INPUT_IMAGES}).`,
}),
),
imageRoles: Type.Optional(
Type.Array(Type.String(), {
description:
"Optional semantic roles for the combined reference image list, parallel by index. " +
"The list is `image` (if provided) followed by each entry in `images`, in order, " +
"after de-duplication. " +
'Canonical values: "first_frame", "last_frame", "reference_image". ' +
"Providers may accept additional role strings. " +
"Must not have more entries than the combined image list; use an empty string to leave a position unset.",
}),
),
video: Type.Optional(
Type.String({
description: "Optional single reference video path or URL.",
}),
),
videos: Type.Optional(
Type.Array(Type.String(), {
description: `Optional reference videos (up to ${MAX_INPUT_VIDEOS}).`,
}),
),
videoRoles: Type.Optional(
Type.Array(Type.String(), {
description:
"Optional semantic roles for the combined reference video list, parallel by index. " +
"The list is `video` (if provided) followed by each entry in `videos`, in order, " +
"after de-duplication. " +
'Canonical value: "reference_video". Providers may accept additional role strings. ' +
"Must not have more entries than the combined video list; use an empty string to leave a position unset.",
}),
),
audioRef: Type.Optional(
Type.String({
description: "Optional single reference audio path or URL (e.g. background music).",
}),
),
audioRefs: Type.Optional(
Type.Array(Type.String(), {
description: `Optional reference audios (up to ${MAX_INPUT_AUDIOS}).`,
}),
),
audioRoles: Type.Optional(
Type.Array(Type.String(), {
description:
"Optional semantic roles for the combined reference audio list, parallel by index. " +
"The list is `audioRef` (if provided) followed by each entry in `audioRefs`, in order, " +
"after de-duplication. " +
'Canonical value: "reference_audio". Providers may accept additional role strings. ' +
"Must not have more entries than the combined audio list; use an empty string to leave a position unset.",
}),
),
model: Type.Optional(
Type.String({ description: "Optional provider/model override, e.g. qwen/wan2.6-t2v." }),
),
filename: Type.Optional(
Type.String({
description:
"Optional output filename hint. OpenClaw preserves the basename and saves under its managed media directory.",
}),
),
size: Type.Optional(
Type.String({
description: "Optional size hint like 1280x720 or 1920x1080 when the provider supports it.",
}),
),
aspectRatio: Type.Optional(
Type.String({
description:
'Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or "adaptive".',
}),
),
resolution: Type.Optional(
Type.String({
description: "Optional resolution hint: 480P, 720P, 768P, or 1080P.",
}),
),
durationSeconds: Type.Optional(
Type.Number({
description:
"Optional target duration in seconds. OpenClaw may round this to the nearest provider-supported duration.",
minimum: 1,
}),
),
audio: Type.Optional(
Type.Boolean({
description: "Optional audio toggle when the provider supports generated audio.",
}),
),
watermark: Type.Optional(
Type.Boolean({
description: "Optional watermark toggle when the provider supports it.",
}),
),
providerOptions: Type.Optional(
Type.Record(Type.String(), Type.Unknown(), {
description:
'Optional provider-specific options as a JSON object, e.g. `{"seed": 42, "draft": true}`. ' +
"Each provider declares its own accepted keys and primitive types (number/boolean/string) " +
"via its capabilities; unknown keys or type mismatches skip the candidate during fallback " +
"and never silently reach the wrong provider. Run `video_generate action=list` to see which " +
"keys each provider accepts.",
}),
),
});
export function resolveVideoGenerationModelConfigForTool(params: {
cfg?: OpenClawConfig;
agentDir?: string;
}): ToolModelConfig | null {
return resolveCapabilityModelConfigForTool({
cfg: params.cfg,
agentDir: params.agentDir,
modelConfig: params.cfg?.agents?.defaults?.videoGenerationModel,
providers: listRuntimeVideoGenerationProviders({ config: params.cfg }),
});
}
function resolveAction(args: Record<string, unknown>): "generate" | "list" | "status" {
return resolveGenerateAction({
args,
allowed: ["generate", "status", "list"],
defaultAction: "generate",
});
}
function normalizeResolution(raw: string | undefined): VideoGenerationResolution | undefined {
const normalized = raw?.trim().toUpperCase();
if (!normalized) {
return undefined;
}
if (
normalized === "480P" ||
normalized === "720P" ||
normalized === "768P" ||
normalized === "1080P"
) {
return normalized;
}
throw new ToolInputError("resolution must be one of 480P, 720P, 768P, or 1080P");
}
function normalizeAspectRatio(raw: string | undefined): string | undefined {
const normalized = raw?.trim();
if (!normalized) {
return undefined;
}
if (SUPPORTED_ASPECT_RATIOS.has(normalized)) {
return normalized;
}
throw new ToolInputError(
"aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, 21:9, or adaptive",
);
}
/**
* Parse a `*Roles` parallel string array for `video_generate`. Throws when
* the caller supplies more roles than assets so off-by-one alignment bugs
* fail loudly at the tool boundary instead of silently dropping the
* trailing roles. Empty strings in the array are allowed and mean "no
* role at this position". Non-string entries are coerced to empty strings
* and treated as "unset" so providers can leave individual slots empty.
*/
function parseRoleArray(params: {
raw: unknown;
kind: "imageRoles" | "videoRoles" | "audioRoles";
assetCount: number;
}): string[] {
if (params.raw === undefined || params.raw === null) {
return [];
}
if (!Array.isArray(params.raw)) {
throw new ToolInputError(
`${params.kind} must be a JSON array of role strings, parallel to the reference list.`,
);
}
const roles = params.raw.map((entry) => (typeof entry === "string" ? entry.trim() : ""));
if (roles.length > params.assetCount) {
throw new ToolInputError(
`${params.kind} has ${roles.length} entries but only ${params.assetCount} reference ${params.kind === "imageRoles" ? "image" : params.kind === "videoRoles" ? "video" : "audio"}${params.assetCount === 1 ? "" : "s"} were provided; extra roles cannot be aligned positionally.`,
);
}
return roles;
}
function normalizeReferenceInputs(params: {
args: Record<string, unknown>;
singularKey: "image" | "video" | "audioRef";
pluralKey: "images" | "videos" | "audioRefs";
maxCount: number;
}): string[] {
return normalizeMediaReferenceInputs({
args: params.args,
singularKey: params.singularKey,
pluralKey: params.pluralKey,
maxCount: params.maxCount,
label: `reference ${params.pluralKey}`,
});
}
function resolveSelectedVideoGenerationProvider(params: {
config?: OpenClawConfig;
videoGenerationModelConfig: ToolModelConfig;
modelOverride?: string;
}): VideoGenerationProvider | undefined {
return resolveSelectedCapabilityProvider({
providers: listRuntimeVideoGenerationProviders({ config: params.config }),
modelConfig: params.videoGenerationModelConfig,
modelOverride: params.modelOverride,
parseModelRef: parseVideoGenerationModelRef,
});
}
function validateVideoGenerationCapabilities(params: {
provider: VideoGenerationProvider | undefined;
model?: string;
inputImageCount: number;
inputVideoCount: number;
inputAudioCount: number;
size?: string;
aspectRatio?: string;
resolution?: VideoGenerationResolution;
durationSeconds?: number;
audio?: boolean;
watermark?: boolean;
}) {
const provider = params.provider;
if (!provider) {
return;
}
const mode = resolveVideoGenerationMode({
inputImageCount: params.inputImageCount,
inputVideoCount: params.inputVideoCount,
});
const { capabilities: caps } = resolveVideoGenerationModeCapabilities({
provider,
inputImageCount: params.inputImageCount,
inputVideoCount: params.inputVideoCount,
});
if (!caps && mode === "imageToVideo" && params.inputVideoCount === 0) {
throw new ToolInputError(`${provider.id} does not support image-to-video reference inputs.`);
}
if (!caps && mode === "videoToVideo" && params.inputImageCount === 0) {
throw new ToolInputError(`${provider.id} does not support video-to-video reference inputs.`);
}
if (!caps) {
return;
}
if (
mode === "imageToVideo" &&
"enabled" in caps &&
!caps.enabled &&
params.inputVideoCount === 0
) {
throw new ToolInputError(`${provider.id} does not support image-to-video reference inputs.`);
}
if (
mode === "videoToVideo" &&
"enabled" in caps &&
!caps.enabled &&
params.inputImageCount === 0
) {
throw new ToolInputError(`${provider.id} does not support video-to-video reference inputs.`);
}
if (params.inputImageCount > 0) {
const maxInputImages = caps.maxInputImages ?? MAX_INPUT_IMAGES;
if (params.inputImageCount > maxInputImages) {
throw new ToolInputError(
`${provider.id} supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`,
);
}
}
if (params.inputVideoCount > 0) {
const maxInputVideos = caps.maxInputVideos ?? MAX_INPUT_VIDEOS;
if (params.inputVideoCount > maxInputVideos) {
throw new ToolInputError(
`${provider.id} supports at most ${maxInputVideos} reference video${maxInputVideos === 1 ? "" : "s"}.`,
);
}
}
// Audio-count validation is intentionally deferred to runtime.ts (generateVideo).
// The runtime guard skips per-candidate providers that lack audio support, allowing
// fallback candidates that do support audio to run. A ToolInputError here would fire
// against only the primary provider and prevent valid fallback-based audio requests.
// maxDurationSeconds validation is intentionally deferred to runtime.ts (generateVideo).
// The runtime guard skips per-candidate providers whose hard cap is below the requested
// duration, allowing a fallback with a higher cap to run — same rationale as the audio
// check above. When providers declare an explicit supportedDurationSeconds list, runtime
// normalization snaps to the nearest valid value instead of skipping.
}
function formatIgnoredVideoGenerationOverride(override: VideoGenerationIgnoredOverride): string {
return `${override.key}=${String(override.value)}`;
}
type VideoGenerateSandboxConfig = {
root: string;
bridge: SandboxFsBridge;
};
type VideoGenerateBackgroundScheduler = (work: () => Promise<void>) => void;
function defaultScheduleVideoGenerateBackgroundWork(work: () => Promise<void>) {
queueMicrotask(() => {
void work().catch((error) => {
log.error("Detached video generation job crashed", {
error,
});
});
});
}
async function loadReferenceAssets(params: {
inputs: string[];
expectedKind: "image" | "video" | "audio";
maxBytes?: number;
workspaceDir?: string;
sandboxConfig: { root: string; bridge: SandboxFsBridge; workspaceOnly: boolean } | null;
}): Promise<
Array<{
sourceAsset: VideoGenerationSourceAsset;
resolvedInput: string;
rewrittenFrom?: string;
}>
> {
const loaded: Array<{
sourceAsset: VideoGenerationSourceAsset;
resolvedInput: string;
rewrittenFrom?: string;
}> = [];
for (const rawInput of params.inputs) {
const trimmed = rawInput.trim();
const inputRaw = trimmed.startsWith("@") ? trimmed.slice(1).trim() : trimmed;
if (!inputRaw) {
throw new ToolInputError(`${params.expectedKind} required (empty string in array)`);
}
const looksLikeWindowsDrivePath = /^[a-zA-Z]:[\\/]/.test(inputRaw);
const hasScheme = /^[a-z][a-z0-9+.-]*:/i.test(inputRaw);
const isFileUrl = /^file:/i.test(inputRaw);
const isHttpUrl = /^https?:\/\//i.test(inputRaw);
const isDataUrl = /^data:/i.test(inputRaw);
if (hasScheme && !looksLikeWindowsDrivePath && !isFileUrl && !isHttpUrl && !isDataUrl) {
throw new ToolInputError(
`Unsupported ${params.expectedKind} reference: ${rawInput}. Use a file path, a file:// URL, a data: URL, or an http(s) URL.`,
);
}
if (params.sandboxConfig && isHttpUrl) {
throw new ToolInputError(
`Sandboxed video_generate does not allow remote ${params.expectedKind} URLs.`,
);
}
const resolvedInput = (() => {
if (params.sandboxConfig) {
return inputRaw;
}
if (inputRaw.startsWith("~")) {
return resolveUserPath(inputRaw);
}
return inputRaw;
})();
if (isHttpUrl && !params.sandboxConfig) {
loaded.push({
sourceAsset: { url: resolvedInput },
resolvedInput,
});
continue;
}
const resolvedPathInfo: { resolved: string; rewrittenFrom?: string } = isDataUrl
? { resolved: "" }
: params.sandboxConfig
? await resolveSandboxedBridgeMediaPath({
sandbox: params.sandboxConfig,
mediaPath: resolvedInput,
inboundFallbackDir: "media/inbound",
})
: {
resolved: resolvedInput.startsWith("file://")
? resolvedInput.slice("file://".length)
: resolvedInput,
};
const resolvedPath = isDataUrl ? null : resolvedPathInfo.resolved;
const localRoots = resolveMediaToolLocalRoots(
params.workspaceDir,
{
workspaceOnly: params.sandboxConfig?.workspaceOnly === true,
},
resolvedPath ? [resolvedPath] : undefined,
);
const media = isDataUrl
? params.expectedKind === "image"
? decodeDataUrl(resolvedInput)
: (() => {
throw new ToolInputError(
`${params.expectedKind} data: URLs are not supported for video_generate.`,
);
})()
: params.sandboxConfig
? await loadWebMedia(resolvedPath ?? resolvedInput, {
maxBytes: params.maxBytes,
sandboxValidated: true,
readFile: createSandboxBridgeReadFile({ sandbox: params.sandboxConfig }),
})
: await loadWebMedia(resolvedPath ?? resolvedInput, {
maxBytes: params.maxBytes,
localRoots,
});
if (media.kind !== params.expectedKind) {
throw new ToolInputError(`Unsupported media type: ${media.kind ?? "unknown"}`);
}
const mimeType = "mimeType" in media ? media.mimeType : media.contentType;
const fileName = "fileName" in media ? media.fileName : undefined;
loaded.push({
sourceAsset: {
buffer: media.buffer,
mimeType,
fileName,
},
resolvedInput,
...(resolvedPathInfo.rewrittenFrom ? { rewrittenFrom: resolvedPathInfo.rewrittenFrom } : {}),
});
}
return loaded;
}
type LoadedReferenceAsset = Awaited<ReturnType<typeof loadReferenceAssets>>[number];
type ExecutedVideoGeneration = {
provider: string;
model: string;
savedPaths: string[];
/** URLs of url-only assets that were not saved locally. */
urlOnlyUrls: string[];
/** Total generated video count, including url-only assets. */
count: number;
contentText: string;
details: Record<string, unknown>;
wakeResult: string;
};
async function executeVideoGenerationJob(params: {
effectiveCfg: OpenClawConfig;
prompt: string;
agentDir?: string;
model?: string;
size?: string;
aspectRatio?: string;
resolution?: VideoGenerationResolution;
durationSeconds?: number;
audio?: boolean;
watermark?: boolean;
filename?: string;
loadedReferenceImages: LoadedReferenceAsset[];
loadedReferenceVideos: LoadedReferenceAsset[];
loadedReferenceAudios: LoadedReferenceAsset[];
taskHandle?: VideoGenerationTaskHandle | null;
providerOptions?: Record<string, unknown>;
}): Promise<ExecutedVideoGeneration> {
if (params.taskHandle) {
recordVideoGenerationTaskProgress({
handle: params.taskHandle,
progressSummary: "Generating video",
});
}
const result = await generateVideo({
cfg: params.effectiveCfg,
prompt: params.prompt,
agentDir: params.agentDir,
modelOverride: params.model,
size: params.size,
aspectRatio: params.aspectRatio,
resolution: params.resolution,
durationSeconds: params.durationSeconds,
audio: params.audio,
watermark: params.watermark,
inputImages: params.loadedReferenceImages.map((entry) => entry.sourceAsset),
inputVideos: params.loadedReferenceVideos.map((entry) => entry.sourceAsset),
inputAudios: params.loadedReferenceAudios.map((entry) => entry.sourceAsset),
providerOptions: params.providerOptions,
});
if (params.taskHandle) {
recordVideoGenerationTaskProgress({
handle: params.taskHandle,
progressSummary: "Saving generated video",
});
}
const urlOnlyVideos: Array<{ url: string; mimeType: string; fileName?: string }> = [];
const bufferVideos: Array<(typeof result.videos)[number] & { buffer: Buffer }> = [];
for (const video of result.videos) {
if (video.buffer) {
bufferVideos.push(video as (typeof result.videos)[number] & { buffer: Buffer });
continue;
}
if (video.url) {
urlOnlyVideos.push({
url: video.url,
mimeType: video.mimeType,
fileName: video.fileName,
});
continue;
}
throw new Error(
`Provider ${result.provider} returned a video asset with neither buffer nor url — cannot deliver.`,
);
}
const savedVideos = await Promise.all(
bufferVideos.map((video) =>
saveMediaBuffer(
video.buffer,
video.mimeType,
"tool-video-generation",
undefined,
params.filename || video.fileName,
),
),
);
const totalCount = savedVideos.length + urlOnlyVideos.length;
const requestedDurationSeconds =
result.normalization?.durationSeconds?.requested ??
(typeof result.metadata?.requestedDurationSeconds === "number" &&
Number.isFinite(result.metadata.requestedDurationSeconds)
? result.metadata.requestedDurationSeconds
: params.durationSeconds);
const ignoredOverrides = result.ignoredOverrides ?? [];
const ignoredOverrideKeys = new Set(ignoredOverrides.map((entry) => entry.key));
const warning =
ignoredOverrides.length > 0
? `Ignored unsupported overrides for ${result.provider}/${result.model}: ${ignoredOverrides.map(formatIgnoredVideoGenerationOverride).join(", ")}.`
: undefined;
const normalizedDurationSeconds =
result.normalization?.durationSeconds?.applied ??
(typeof result.metadata?.normalizedDurationSeconds === "number" &&
Number.isFinite(result.metadata.normalizedDurationSeconds)
? result.metadata.normalizedDurationSeconds
: requestedDurationSeconds);
const supportedDurationSeconds =
result.normalization?.durationSeconds?.supportedValues ??
(Array.isArray(result.metadata?.supportedDurationSeconds)
? result.metadata.supportedDurationSeconds.filter(
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
)
: undefined);
const normalizedSize =
result.normalization?.size?.applied ??
(typeof result.metadata?.normalizedSize === "string" && result.metadata.normalizedSize.trim()
? result.metadata.normalizedSize
: undefined);
const normalizedAspectRatio =
result.normalization?.aspectRatio?.applied ??
(typeof result.metadata?.normalizedAspectRatio === "string" &&
result.metadata.normalizedAspectRatio.trim()
? result.metadata.normalizedAspectRatio
: undefined);
const normalizedResolution =
result.normalization?.resolution?.applied ??
(typeof result.metadata?.normalizedResolution === "string" &&
result.metadata.normalizedResolution.trim()
? result.metadata.normalizedResolution
: undefined);
const sizeTranslatedToAspectRatio =
result.normalization?.aspectRatio?.derivedFrom === "size" ||
(!normalizedSize &&
typeof result.metadata?.requestedSize === "string" &&
result.metadata.requestedSize === params.size &&
Boolean(normalizedAspectRatio));
const allMediaUrls = [
...savedVideos.map((video) => video.path),
...urlOnlyVideos.map((video) => video.url),
];
const lines = [
`Generated ${totalCount} video${totalCount === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
...(warning ? [`Warning: ${warning}`] : []),
typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
: null,
...savedVideos.map((video) => `MEDIA:${video.path}`),
...urlOnlyVideos.map((video) => `MEDIA:${video.url}`),
].filter((entry): entry is string => Boolean(entry));
return {
provider: result.provider,
model: result.model,
savedPaths: savedVideos.map((video) => video.path),
urlOnlyUrls: urlOnlyVideos.map((video) => video.url),
count: totalCount,
contentText: lines.join("\n"),
wakeResult: lines.join("\n"),
details: {
provider: result.provider,
model: result.model,
count: totalCount,
media: {
mediaUrls: allMediaUrls,
},
paths: allMediaUrls,
...buildTaskRunDetails(params.taskHandle),
...buildMediaReferenceDetails({
entries: params.loadedReferenceImages,
singleKey: "image",
pluralKey: "images",
getResolvedInput: (entry) => entry.resolvedInput,
}),
...buildMediaReferenceDetails({
entries: params.loadedReferenceVideos,
singleKey: "video",
pluralKey: "videos",
getResolvedInput: (entry) => entry.resolvedInput,
singleRewriteKey: "videoRewrittenFrom",
}),
...(normalizedSize ||
(!ignoredOverrideKeys.has("size") && params.size && !sizeTranslatedToAspectRatio)
? { size: normalizedSize ?? params.size }
: {}),
...(normalizedAspectRatio || (!ignoredOverrideKeys.has("aspectRatio") && params.aspectRatio)
? { aspectRatio: normalizedAspectRatio ?? params.aspectRatio }
: {}),
...(normalizedResolution || (!ignoredOverrideKeys.has("resolution") && params.resolution)
? { resolution: normalizedResolution ?? params.resolution }
: {}),
...(typeof normalizedDurationSeconds === "number"
? { durationSeconds: normalizedDurationSeconds }
: {}),
...(typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? { requestedDurationSeconds }
: {}),
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
? { supportedDurationSeconds }
: {}),
...(!ignoredOverrideKeys.has("audio") && typeof params.audio === "boolean"
? { audio: params.audio }
: {}),
...(!ignoredOverrideKeys.has("watermark") && typeof params.watermark === "boolean"
? { watermark: params.watermark }
: {}),
...(params.filename ? { filename: params.filename } : {}),
attempts: result.attempts,
...(result.normalization ? { normalization: result.normalization } : {}),
metadata: result.metadata,
...(warning ? { warning } : {}),
...(ignoredOverrides.length > 0 ? { ignoredOverrides } : {}),
},
};
}
export function createVideoGenerateTool(options?: {
config?: OpenClawConfig;
agentDir?: string;
agentSessionKey?: string;
requesterOrigin?: DeliveryContext;
workspaceDir?: string;
sandbox?: VideoGenerateSandboxConfig;
fsPolicy?: ToolFsPolicy;
scheduleBackgroundWork?: VideoGenerateBackgroundScheduler;
}): AnyAgentTool | null {
const cfg: OpenClawConfig = options?.config ?? loadConfig();
const videoGenerationModelConfig = resolveVideoGenerationModelConfigForTool({
cfg,
agentDir: options?.agentDir,
});
if (!videoGenerationModelConfig) {
return null;
}
const sandboxConfig = options?.sandbox
? {
root: options.sandbox.root,
bridge: options.sandbox.bridge,
workspaceOnly: options.fsPolicy?.workspaceOnly === true,
}
: null;
const scheduleBackgroundWork =
options?.scheduleBackgroundWork ?? defaultScheduleVideoGenerateBackgroundWork;
return {
label: "Video Generation",
name: "video_generate",
displaySummary: "Generate videos",
description:
"Generate videos using configured providers. Generated videos are saved under OpenClaw-managed media storage and delivered automatically as attachments. Duration requests may be rounded to the nearest provider-supported value.",
parameters: VideoGenerateToolSchema,
execute: async (_toolCallId, rawArgs) => {
const args = rawArgs as Record<string, unknown>;
const action = resolveAction(args);
const effectiveCfg =
applyVideoGenerationModelConfigDefaults(cfg, videoGenerationModelConfig) ?? cfg;
if (action === "list") {
return createVideoGenerateListActionResult(effectiveCfg);
}
if (action === "status") {
return createVideoGenerateStatusActionResult(options?.agentSessionKey);
}
const duplicateGuardResult = createVideoGenerateDuplicateGuardResult(
options?.agentSessionKey,
);
if (duplicateGuardResult) {
return duplicateGuardResult;
}
const prompt = readStringParam(args, "prompt", { required: true });
const model = readStringParam(args, "model");
const filename = readStringParam(args, "filename");
const size = readStringParam(args, "size");
const aspectRatio = normalizeAspectRatio(readStringParam(args, "aspectRatio"));
const resolution = normalizeResolution(readStringParam(args, "resolution"));
const durationSeconds = readNumberParam(args, "durationSeconds", {
integer: true,
strict: true,
});
const audio = readBooleanToolParam(args, "audio");
const watermark = readBooleanToolParam(args, "watermark");
// providerOptions must be a plain object. Arrays are objects in JS, so
// exclude them explicitly — a bogus call like `providerOptions: ["seed", 42]`
// would otherwise be cast to `Record<string, unknown>` with numeric-string
// keys and silently forwarded to the provider.
const providerOptionsRaw = readSnakeCaseParamRaw(args, "providerOptions");
if (
providerOptionsRaw != null &&
(typeof providerOptionsRaw !== "object" || Array.isArray(providerOptionsRaw))
) {
throw new ToolInputError(
"providerOptions must be a JSON object keyed by provider-specific option name.",
);
}
const providerOptions =
providerOptionsRaw != null ? (providerOptionsRaw as Record<string, unknown>) : undefined;
const imageInputs = normalizeReferenceInputs({
args,
singularKey: "image",
pluralKey: "images",
maxCount: MAX_INPUT_IMAGES,
});
// *Roles: parallel string arrays giving each asset a semantic role hint.
// Use readSnakeCaseParamRaw so both camelCase and snake_case keys are accepted.
const imageRoles = parseRoleArray({
raw: readSnakeCaseParamRaw(args, "imageRoles"),
kind: "imageRoles",
assetCount: imageInputs.length,
});
const videoInputs = normalizeReferenceInputs({
args,
singularKey: "video",
pluralKey: "videos",
maxCount: MAX_INPUT_VIDEOS,
});
const videoRoles = parseRoleArray({
raw: readSnakeCaseParamRaw(args, "videoRoles"),
kind: "videoRoles",
assetCount: videoInputs.length,
});
const audioInputs = normalizeReferenceInputs({
args,
singularKey: "audioRef",
pluralKey: "audioRefs",
maxCount: MAX_INPUT_AUDIOS,
});
const audioRoles = parseRoleArray({
raw: readSnakeCaseParamRaw(args, "audioRoles"),
kind: "audioRoles",
assetCount: audioInputs.length,
});
const selectedProvider = resolveSelectedVideoGenerationProvider({
config: effectiveCfg,
videoGenerationModelConfig,
modelOverride: model,
});
const loadedReferenceImages = await loadReferenceAssets({
inputs: imageInputs,
expectedKind: "image",
workspaceDir: options?.workspaceDir,
sandboxConfig,
});
// Attach roles to the loaded image assets (positional, by index into images[]).
for (let i = 0; i < loadedReferenceImages.length; i++) {
const role = imageRoles[i];
if (role) {
loadedReferenceImages[i].sourceAsset.role = role;
}
}
const loadedReferenceVideos = await loadReferenceAssets({
inputs: videoInputs,
expectedKind: "video",
workspaceDir: options?.workspaceDir,
sandboxConfig,
});
for (let i = 0; i < loadedReferenceVideos.length; i++) {
const role = videoRoles[i];
if (role) {
loadedReferenceVideos[i].sourceAsset.role = role;
}
}
const loadedReferenceAudios = await loadReferenceAssets({
inputs: audioInputs,
expectedKind: "audio",
workspaceDir: options?.workspaceDir,
sandboxConfig,
});
for (let i = 0; i < loadedReferenceAudios.length; i++) {
const role = audioRoles[i];
if (role) {
loadedReferenceAudios[i].sourceAsset.role = role;
}
}
validateVideoGenerationCapabilities({
provider: selectedProvider,
model:
parseVideoGenerationModelRef(model)?.model ?? model ?? selectedProvider?.defaultModel,
inputImageCount: loadedReferenceImages.length,
inputVideoCount: loadedReferenceVideos.length,
inputAudioCount: loadedReferenceAudios.length,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
});
const taskHandle = createVideoGenerationTaskRun({
sessionKey: options?.agentSessionKey,
requesterOrigin: options?.requesterOrigin,
prompt,
providerId: selectedProvider?.id,
});
const shouldDetach = Boolean(taskHandle && options?.agentSessionKey?.trim());
if (shouldDetach) {
scheduleBackgroundWork(async () => {
try {
const executed = await executeVideoGenerationJob({
effectiveCfg,
prompt,
agentDir: options?.agentDir,
model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
filename,
loadedReferenceImages,
loadedReferenceVideos,
loadedReferenceAudios,
taskHandle,
providerOptions,
});
completeVideoGenerationTaskRun({
handle: taskHandle,
provider: executed.provider,
model: executed.model,
count: executed.count,
paths: executed.savedPaths,
});
try {
await wakeVideoGenerationTaskCompletion({
config: effectiveCfg,
handle: taskHandle,
status: "ok",
statusLabel: "completed successfully",
result: executed.wakeResult,
mediaUrls: [...executed.savedPaths, ...executed.urlOnlyUrls],
});
} catch (error) {
log.warn("Video generation completion wake failed after successful generation", {
taskId: taskHandle?.taskId,
runId: taskHandle?.runId,
error,
});
}
} catch (error) {
failVideoGenerationTaskRun({
handle: taskHandle,
error,
});
await wakeVideoGenerationTaskCompletion({
config: effectiveCfg,
handle: taskHandle,
status: "error",
statusLabel: "failed",
result: formatErrorMessage(error),
});
return;
}
});
return {
content: [
{
type: "text",
text: `Background task started for video generation (${taskHandle?.taskId ?? "unknown"}). Do not call video_generate again for this request. Wait for the completion event; I'll post the finished video here when it's ready.`,
},
],
details: {
async: true,
status: "started",
...buildTaskRunDetails(taskHandle),
...buildMediaReferenceDetails({
entries: loadedReferenceImages,
singleKey: "image",
pluralKey: "images",
getResolvedInput: (entry) => entry.resolvedInput,
}),
...buildMediaReferenceDetails({
entries: loadedReferenceVideos,
singleKey: "video",
pluralKey: "videos",
getResolvedInput: (entry) => entry.resolvedInput,
singleRewriteKey: "videoRewrittenFrom",
}),
...(model ? { model } : {}),
...(size ? { size } : {}),
...(aspectRatio ? { aspectRatio } : {}),
...(resolution ? { resolution } : {}),
...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
},
};
}
try {
const executed = await executeVideoGenerationJob({
effectiveCfg,
prompt,
agentDir: options?.agentDir,
model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
filename,
loadedReferenceImages,
loadedReferenceVideos,
loadedReferenceAudios,
taskHandle,
providerOptions,
});
completeVideoGenerationTaskRun({
handle: taskHandle,
provider: executed.provider,
model: executed.model,
count: executed.count,
paths: executed.savedPaths,
});
return {
content: [{ type: "text", text: executed.contentText }],
details: executed.details,
};
} catch (error) {
failVideoGenerationTaskRun({
handle: taskHandle,
error,
});
throw error;
}
},
};
}