feat(agents): track video generation tasks

This commit is contained in:
Peter Steinberger
2026-04-06 00:09:48 +01:00
parent 6d34a1c814
commit 40c499d489
5 changed files with 328 additions and 103 deletions

View File

@@ -34,6 +34,7 @@ Docs: https://docs.openclaw.ai
- Agents/cache: stabilize cache-relevant system prompt fingerprints by normalizing equivalent structured prompt whitespace, line endings, hook-added system context, and runtime capability ordering so semantically unchanged prompts reuse KV/cache more reliably. Thanks @vincentkoc.
- Agents/tool prompts: remove the duplicate in-band tool inventory from agent system prompts so tool-calling models rely on the structured tool definitions as the single source of truth, improving prompt stability and reducing stale tool guidance.
- Tools/video generation: add bundled xAI (`grok-imagine-video`) and Alibaba Model Studio Wan video providers, plus live-test/default model wiring for both.
- Agents/video generation: register `video_generate` runs in the task ledger with task/run ids and lifecycle updates so long-running generations can be tracked more reliably.
- Providers/CLI: remove bundled CLI text-provider backends and the `agents.defaults.cliBackends` surface, while keeping ACP harness sessions and Gemini media understanding on the native bundled providers.
- Matrix/exec approvals: clarify unavailable-approval replies so Matrix no longer claims chat approvals are unsupported when native exec approvals are merely unconfigured. (#61424) Thanks @gumadeiras.
- Docs/IRC: replace public IRC hostname examples with `irc.example.com` and recommend private servers for bot coordination while listing common public networks for intentional use.
@@ -108,6 +109,7 @@ Docs: https://docs.openclaw.ai
- Exec approvals: remove heuristic command-obfuscation gating from host exec so gateway and node runs rely on explicit policy, allowlist, and strict inline-eval rules only.
- Agents/tool results: cap live tool-result persistence and overflow-recovery truncation at 40k characters so oversized tool output stays bounded without discarding recent context entirely.
- Discord/video replies: split text-plus-video deliveries into a text reply followed by a media-only send, and let live provider auth checks honor manifest-declared API key env vars like `MODELSTUDIO_API_KEY`.
- Providers/fal video: switch long-running fal video generation to the queue-backed submit/status/result flow, and accept `FAL_API_KEY` as a compatibility alias for the canonical `FAL_KEY`.
- Config/All Settings: keep the raw config view intact when sensitive fields are blank instead of corrupting or dropping the rendered snapshot. (#28214) Thanks @solodmd.
- Plugin SDK/facades: back-fill bundled plugin facade sentinels before plugin-id tracking re-enters config loading, so CLI/provider startup no longer crashes with `shouldNormalizeGoogleProviderConfig is not a function` or other empty-facade reads during bundled plugin re-entry. Thanks @adam91holt.
- Plugins/facades: back-fill facade sentinels before tracked-plugin resolution re-enters config loading, so facade exports stay defined during circular provider normalization. (#61180) Thanks @adam91holt.

View File

@@ -15,6 +15,10 @@ The `video_generate` tool lets the agent create videos using your configured pro
The tool only appears when at least one video-generation provider is available. If you don't see `video_generate` in your agent's tools, configure `agents.defaults.videoGenerationModel` or set up a provider API key.
</Note>
<Note>
OpenClaw now records `video_generate` runs in the task ledger when the agent has a session key, so long-running generations can be tracked with task/run ids even though the tool still waits for completion in the current turn.
</Note>
## Quick start
1. Set an API key for at least one provider (for example `OPENAI_API_KEY`, `GEMINI_API_KEY`, `MODELSTUDIO_API_KEY`, or `QWEN_API_KEY`).
@@ -115,6 +119,7 @@ If a provider fails, the next candidate is tried automatically. If all fail, the
- OpenAI uses the native video endpoint and currently defaults to `sora-2`.
- Qwen supports image/video references, but the upstream DashScope video endpoint currently requires remote `http(s)` URLs for those references.
- xAI uses the native xAI video API and supports text-to-video, image-to-video, and remote video edit/extend flows.
- fal uses the queue-backed fal video flow for long-running jobs instead of a single blocking inference request.
## Qwen reference inputs

View File

@@ -164,6 +164,8 @@ export function createOpenClawTools(
const videoGenerateTool = createVideoGenerateTool({
config: options?.config,
agentDir: options?.agentDir,
agentSessionKey: options?.agentSessionKey,
requesterOrigin: deliveryContext ?? undefined,
workspaceDir,
sandbox,
fsPolicy: options?.fsPolicy,

View File

@@ -4,6 +4,14 @@ import * as mediaStore from "../../media/store.js";
import * as videoGenerationRuntime from "../../video-generation/runtime.js";
import { createVideoGenerateTool } from "./video-generate-tool.js";
// Mock fns for the task-ledger API. `vi.hoisted` is required so these exist
// before the `vi.mock` factory below runs (mock factories are hoisted above
// imports and would otherwise capture an uninitialized binding).
const taskExecutorMocks = vi.hoisted(() => ({
createRunningTaskRun: vi.fn(),
completeTaskRunByRunId: vi.fn(),
failTaskRunByRunId: vi.fn(),
}));
// Swap in the mocks so tests can observe ledger lifecycle calls made by the tool.
vi.mock("../../tasks/task-executor.js", () => taskExecutorMocks);
/** Casts an arbitrary test fixture literal to OpenClawConfig (runtime identity). */
function asConfig(value: unknown): OpenClawConfig {
  const cfg = value as OpenClawConfig;
  return cfg;
}
@@ -12,6 +20,9 @@ describe("createVideoGenerateTool", () => {
beforeEach(() => {
  // Start every test from pristine spies and ledger mocks.
  vi.restoreAllMocks();
  vi.spyOn(videoGenerationRuntime, "listRuntimeVideoGenerationProviders").mockReturnValue([]);
  const ledgerMocks = [
    taskExecutorMocks.createRunningTaskRun,
    taskExecutorMocks.completeTaskRunByRunId,
    taskExecutorMocks.failTaskRunByRunId,
  ];
  for (const ledgerMock of ledgerMocks) {
    ledgerMock.mockReset();
  }
});
afterEach(() => {
@@ -39,6 +50,19 @@ describe("createVideoGenerateTool", () => {
});
it("generates videos, saves them, and emits MEDIA paths", async () => {
taskExecutorMocks.createRunningTaskRun.mockReturnValue({
taskId: "task-123",
runtime: "cli",
requesterSessionKey: "agent:main:discord:direct:123",
ownerKey: "agent:main:discord:direct:123",
scopeKind: "session",
task: "friendly lobster surfing",
status: "running",
deliveryStatus: "not_applicable",
notifyPolicy: "silent",
createdAt: Date.now(),
});
taskExecutorMocks.completeTaskRunByRunId.mockReturnValue(undefined);
vi.spyOn(videoGenerationRuntime, "generateVideo").mockResolvedValue({
provider: "qwen",
model: "wan2.6-t2v",
@@ -67,6 +91,11 @@ describe("createVideoGenerateTool", () => {
},
},
}),
agentSessionKey: "agent:main:discord:direct:123",
requesterOrigin: {
channel: "discord",
to: "channel:1",
},
});
expect(tool).not.toBeNull();
if (!tool) {
@@ -82,12 +111,71 @@ describe("createVideoGenerateTool", () => {
provider: "qwen",
model: "wan2.6-t2v",
count: 1,
task: {
taskId: "task-123",
},
media: {
mediaUrls: ["/tmp/generated-lobster.mp4"],
},
paths: ["/tmp/generated-lobster.mp4"],
metadata: { taskId: "task-1" },
});
expect(taskExecutorMocks.createRunningTaskRun).toHaveBeenCalledWith(
expect.objectContaining({
runtime: "cli",
requesterSessionKey: "agent:main:discord:direct:123",
ownerKey: "agent:main:discord:direct:123",
label: "Video generation",
task: "friendly lobster surfing",
}),
);
expect(taskExecutorMocks.completeTaskRunByRunId).toHaveBeenCalledWith(
expect.objectContaining({
runId: expect.stringMatching(/^tool:video_generate:/),
}),
);
});
it("marks the task failed when provider generation throws", async () => {
  // A provider rejection must surface to the caller AND be mirrored into the ledger.
  vi.spyOn(videoGenerationRuntime, "generateVideo").mockRejectedValue(new Error("queue boom"));
  const ledgerRun = {
    taskId: "task-fail",
    runtime: "cli",
    requesterSessionKey: "agent:main:discord:direct:123",
    ownerKey: "agent:main:discord:direct:123",
    scopeKind: "session",
    task: "broken lobster",
    status: "running",
    deliveryStatus: "not_applicable",
    notifyPolicy: "silent",
    createdAt: Date.now(),
  };
  taskExecutorMocks.createRunningTaskRun.mockReturnValue(ledgerRun);
  taskExecutorMocks.failTaskRunByRunId.mockReturnValue(undefined);
  const videoTool = createVideoGenerateTool({
    config: asConfig({
      agents: {
        defaults: {
          videoGenerationModel: { primary: "qwen/wan2.6-t2v" },
        },
      },
    }),
    agentSessionKey: "agent:main:discord:direct:123",
  });
  expect(videoTool).not.toBeNull();
  if (!videoTool) {
    throw new Error("expected video_generate tool");
  }
  await expect(videoTool.execute("call-2", { prompt: "broken lobster" })).rejects.toThrow(
    "queue boom",
  );
  expect(taskExecutorMocks.failTaskRunByRunId).toHaveBeenCalledWith(
    expect.objectContaining({
      runId: expect.stringMatching(/^tool:video_generate:/),
      error: "queue boom",
    }),
  );
});
it("shows duration normalization details from runtime metadata", async () => {

View File

@@ -1,11 +1,19 @@
import crypto from "node:crypto";
import { Type } from "@sinclair/typebox";
import type { OpenClawConfig } from "../../config/config.js";
import { loadConfig } from "../../config/config.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { saveMediaBuffer } from "../../media/store.js";
import { loadWebMedia } from "../../media/web-media.js";
import { readSnakeCaseParamRaw } from "../../param-key.js";
import { getProviderEnvVars } from "../../secrets/provider-env-vars.js";
import {
completeTaskRunByRunId,
createRunningTaskRun,
failTaskRunByRunId,
} from "../../tasks/task-executor.js";
import { resolveUserPath } from "../../utils.js";
import type { DeliveryContext } from "../../utils/delivery-context.js";
import { resolveVideoGenerationSupportedDurations } from "../../video-generation/duration-support.js";
import { parseVideoGenerationModelRef } from "../../video-generation/model-ref.js";
import {
@@ -45,6 +53,7 @@ import {
type ToolFsPolicy,
} from "./tool-runtime.helpers.js";
const log = createSubsystemLogger("agents/tools/video-generate");
const MAX_INPUT_IMAGES = 5;
const MAX_INPUT_VIDEOS = 4;
const SUPPORTED_ASPECT_RATIOS = new Set([
@@ -398,6 +407,94 @@ type VideoGenerateSandboxConfig = {
bridge: SandboxFsBridge;
};
// Identifiers linking an in-flight video generation to its task-ledger record.
type VideoGenerationTaskHandle = {
taskId: string; // ledger task id returned by createRunningTaskRun
runId: string; // locally generated `tool:video_generate:<uuid>` run id
};
/**
 * Registers a running `video_generate` invocation in the task ledger.
 *
 * Returns the ledger handle (task id plus generated run id), or null when the
 * agent has no session key or when ledger registration throws — tracking is
 * best-effort and must never block the actual generation.
 */
function createVideoGenerationTaskRun(params: {
  sessionKey?: string;
  requesterOrigin?: DeliveryContext;
  prompt: string;
  providerId?: string;
}): VideoGenerationTaskHandle | null {
  const trimmedKey = params.sessionKey?.trim();
  if (!trimmedKey) {
    // No session key means no ledger scope to attach the run to.
    return null;
  }
  const runId = `tool:video_generate:${crypto.randomUUID()}`;
  const sourceId = params.providerId
    ? `video_generate:${params.providerId}`
    : "video_generate";
  try {
    const ledgerTask = createRunningTaskRun({
      runtime: "cli",
      sourceId,
      requesterSessionKey: trimmedKey,
      ownerKey: trimmedKey,
      scopeKind: "session",
      requesterOrigin: params.requesterOrigin,
      childSessionKey: trimmedKey,
      runId,
      label: "Video generation",
      task: params.prompt,
      deliveryStatus: "not_applicable",
      notifyPolicy: "silent",
      startedAt: Date.now(),
      lastEventAt: Date.now(),
      progressSummary: "Generating video",
    });
    return { taskId: ledgerTask.taskId, runId };
  } catch (error) {
    // Best-effort tracking: log the failure and let generation proceed untracked.
    log.warn("Failed to create video generation task ledger record", {
      sessionKey: trimmedKey,
      providerId: params.providerId,
      error,
    });
    return null;
  }
}
/**
 * Marks a tracked video generation run as completed in the task ledger.
 * No-op when the run was never registered (null handle).
 */
function completeVideoGenerationTaskRun(params: {
  handle: VideoGenerationTaskHandle | null;
  provider: string;
  model: string;
  count: number;
  paths: string[];
}) {
  const { handle, provider, model, count, paths } = params;
  if (!handle) {
    return;
  }
  const finishedAt = Date.now();
  const singular = count === 1;
  // For a single video point at its path; otherwise summarize as "<n> files".
  const target = singular ? paths[0] : `${count} files`;
  const targetSuffix = target ? ` -> ${target}` : "";
  completeTaskRunByRunId({
    runId: handle.runId,
    runtime: "cli",
    endedAt: finishedAt,
    lastEventAt: finishedAt,
    terminalSummary: `Generated ${count} video${singular ? "" : "s"} with ${provider}/${model}${targetSuffix}.`,
  });
}
/**
 * Marks a tracked video generation run as failed in the task ledger.
 * No-op when the run was never registered (null handle).
 */
function failVideoGenerationTaskRun(params: {
  handle: VideoGenerationTaskHandle | null;
  error: unknown;
}) {
  const { handle, error } = params;
  if (!handle) {
    return;
  }
  // Normalize non-Error throwables into a readable message.
  const message = error instanceof Error ? error.message : String(error);
  const finishedAt = Date.now();
  failTaskRunByRunId({
    runId: handle.runId,
    runtime: "cli",
    endedAt: finishedAt,
    lastEventAt: finishedAt,
    error: message,
    terminalSummary: message,
  });
}
async function loadReferenceAssets(params: {
inputs: string[];
expectedKind: "image" | "video";
@@ -516,6 +613,8 @@ async function loadReferenceAssets(params: {
export function createVideoGenerateTool(options?: {
config?: OpenClawConfig;
agentDir?: string;
agentSessionKey?: string;
requesterOrigin?: DeliveryContext;
workspaceDir?: string;
sandbox?: VideoGenerateSandboxConfig;
fsPolicy?: ToolFsPolicy;
@@ -668,118 +767,147 @@ export function createVideoGenerateTool(options?: {
audio,
watermark,
});
const result = await generateVideo({
cfg: effectiveCfg,
const taskHandle = createVideoGenerationTaskRun({
sessionKey: options?.agentSessionKey,
requesterOrigin: options?.requesterOrigin,
prompt,
agentDir: options?.agentDir,
modelOverride: model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
providerId: selectedProvider?.id,
});
const savedVideos = await Promise.all(
result.videos.map((video) =>
saveMediaBuffer(
video.buffer,
video.mimeType,
"tool-video-generation",
undefined,
filename || video.fileName,
),
),
);
const requestedDurationSeconds =
typeof result.metadata?.requestedDurationSeconds === "number" &&
Number.isFinite(result.metadata.requestedDurationSeconds)
? result.metadata.requestedDurationSeconds
: durationSeconds;
const normalizedDurationSeconds =
typeof result.metadata?.normalizedDurationSeconds === "number" &&
Number.isFinite(result.metadata.normalizedDurationSeconds)
? result.metadata.normalizedDurationSeconds
: requestedDurationSeconds;
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
? result.metadata.supportedDurationSeconds.filter(
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
)
: undefined;
const lines = [
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
: null,
...savedVideos.map((video) => `MEDIA:${video.path}`),
].filter((entry): entry is string => Boolean(entry));
return {
content: [{ type: "text", text: lines.join("\n") }],
details: {
try {
const result = await generateVideo({
cfg: effectiveCfg,
prompt,
agentDir: options?.agentDir,
modelOverride: model,
size,
aspectRatio,
resolution,
durationSeconds,
audio,
watermark,
inputImages: loadedReferenceImages.map((entry) => entry.sourceAsset),
inputVideos: loadedReferenceVideos.map((entry) => entry.sourceAsset),
});
const savedVideos = await Promise.all(
result.videos.map((video) =>
saveMediaBuffer(
video.buffer,
video.mimeType,
"tool-video-generation",
undefined,
filename || video.fileName,
),
),
);
completeVideoGenerationTaskRun({
handle: taskHandle,
provider: result.provider,
model: result.model,
count: savedVideos.length,
media: {
mediaUrls: savedVideos.map((video) => video.path),
},
paths: savedVideos.map((video) => video.path),
...(loadedReferenceImages.length === 1
? {
image: loadedReferenceImages[0]?.resolvedInput,
...(loadedReferenceImages[0]?.rewrittenFrom
? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
: {}),
}
: loadedReferenceImages.length > 1
? {
images: loadedReferenceImages.map((entry) => ({
image: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(loadedReferenceVideos.length === 1
? {
video: loadedReferenceVideos[0]?.resolvedInput,
...(loadedReferenceVideos[0]?.rewrittenFrom
? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
: {}),
}
: loadedReferenceVideos.length > 1
? {
videos: loadedReferenceVideos.map((entry) => ({
video: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(size ? { size } : {}),
...(aspectRatio ? { aspectRatio } : {}),
...(resolution ? { resolution } : {}),
...(typeof normalizedDurationSeconds === "number"
? { durationSeconds: normalizedDurationSeconds }
: {}),
...(typeof requestedDurationSeconds === "number" &&
});
const requestedDurationSeconds =
typeof result.metadata?.requestedDurationSeconds === "number" &&
Number.isFinite(result.metadata.requestedDurationSeconds)
? result.metadata.requestedDurationSeconds
: durationSeconds;
const normalizedDurationSeconds =
typeof result.metadata?.normalizedDurationSeconds === "number" &&
Number.isFinite(result.metadata.normalizedDurationSeconds)
? result.metadata.normalizedDurationSeconds
: requestedDurationSeconds;
const supportedDurationSeconds = Array.isArray(result.metadata?.supportedDurationSeconds)
? result.metadata.supportedDurationSeconds.filter(
(entry): entry is number => typeof entry === "number" && Number.isFinite(entry),
)
: undefined;
const lines = [
`Generated ${savedVideos.length} video${savedVideos.length === 1 ? "" : "s"} with ${result.provider}/${result.model}.`,
typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? { requestedDurationSeconds }
: {}),
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
? { supportedDurationSeconds }
: {}),
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
attempts: result.attempts,
metadata: result.metadata,
},
};
? `Duration normalized: requested ${requestedDurationSeconds}s; used ${normalizedDurationSeconds}s.`
: null,
...savedVideos.map((video) => `MEDIA:${video.path}`),
].filter((entry): entry is string => Boolean(entry));
return {
content: [{ type: "text", text: lines.join("\n") }],
details: {
provider: result.provider,
model: result.model,
count: savedVideos.length,
media: {
mediaUrls: savedVideos.map((video) => video.path),
},
paths: savedVideos.map((video) => video.path),
...(taskHandle
? {
task: {
taskId: taskHandle.taskId,
runId: taskHandle.runId,
},
}
: {}),
...(loadedReferenceImages.length === 1
? {
image: loadedReferenceImages[0]?.resolvedInput,
...(loadedReferenceImages[0]?.rewrittenFrom
? { rewrittenFrom: loadedReferenceImages[0].rewrittenFrom }
: {}),
}
: loadedReferenceImages.length > 1
? {
images: loadedReferenceImages.map((entry) => ({
image: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(loadedReferenceVideos.length === 1
? {
video: loadedReferenceVideos[0]?.resolvedInput,
...(loadedReferenceVideos[0]?.rewrittenFrom
? { videoRewrittenFrom: loadedReferenceVideos[0].rewrittenFrom }
: {}),
}
: loadedReferenceVideos.length > 1
? {
videos: loadedReferenceVideos.map((entry) => ({
video: entry.resolvedInput,
...(entry.rewrittenFrom ? { rewrittenFrom: entry.rewrittenFrom } : {}),
})),
}
: {}),
...(size ? { size } : {}),
...(aspectRatio ? { aspectRatio } : {}),
...(resolution ? { resolution } : {}),
...(typeof normalizedDurationSeconds === "number"
? { durationSeconds: normalizedDurationSeconds }
: {}),
...(typeof requestedDurationSeconds === "number" &&
typeof normalizedDurationSeconds === "number" &&
requestedDurationSeconds !== normalizedDurationSeconds
? { requestedDurationSeconds }
: {}),
...(supportedDurationSeconds && supportedDurationSeconds.length > 0
? { supportedDurationSeconds }
: {}),
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
attempts: result.attempts,
metadata: result.metadata,
},
};
} catch (error) {
failVideoGenerationTaskRun({
handle: taskHandle,
error,
});
throw error;
}
},
};
}