feat(agents): add generation tool timeouts

This commit is contained in:
Peter Steinberger
2026-04-24 00:03:59 +01:00
parent bd49117a50
commit f0a7a85e7a
23 changed files with 141 additions and 7 deletions

View File

@@ -6,6 +6,7 @@ Docs: https://docs.openclaw.ai
### Changes
- Agents/tools: add optional per-call `timeoutMs` support for image, video, music, and TTS generation tools so agents can extend provider request timeouts only when a specific generation needs it.
- Agents/subagents: add optional forked context for native `sessions_spawn` runs so agents can let a child inherit the requester transcript when needed, while keeping clean isolated sessions as the default; includes prompt guidance, context-engine hook metadata, docs, and QA coverage.
- Codex harness: add structured debug logging for embedded harness selection decisions so `/status` stays simple while gateway logs explain auto-selection and Pi fallback reasons. (#70760) Thanks @100yenadmin.
- Providers/OpenAI: add forward-compatible `gpt-5.5` and `gpt-5.5-pro` support for OpenAI API keys, OpenAI Codex OAuth, and the Codex CLI default model.

View File

@@ -96,6 +96,10 @@ Resolution hint.
Number of images to generate (1–4).
</ParamField>
<ParamField path="timeoutMs" type="number">
Optional provider request timeout in milliseconds.
</ParamField>
<ParamField path="filename" type="string">
Output filename hint.
</ParamField>

View File

@@ -125,6 +125,7 @@ Direct generation example:
| `image` | string | Single reference image path or URL |
| `images` | string[] | Multiple reference images (up to 10) |
| `durationSeconds` | number | Target duration in seconds when the provider supports duration hints |
| `timeoutMs` | number | Optional provider request timeout in milliseconds |
| `format` | string | Output format hint (`mp3` or `wav`) when the provider supports it |
| `filename` | string | Output filename hint |

View File

@@ -507,6 +507,8 @@ Notes:
The `tts` tool converts text to speech and returns an audio attachment for
reply delivery. When the channel is Feishu, Matrix, Telegram, or WhatsApp,
the audio is delivered as a voice message rather than a file attachment.
It accepts optional `channel` and `timeoutMs` fields; `timeoutMs` is a
per-call provider request timeout in milliseconds.
## Gateway RPC

View File

@@ -170,6 +170,7 @@ dimensions). Providers that do not declare it surface the value via
| `action` | string | `"generate"` (default), `"status"`, or `"list"` |
| `model` | string | Provider/model override (e.g. `runway/gen4.5`) |
| `filename` | string | Output filename hint |
| `timeoutMs` | number | Optional provider request timeout in milliseconds |
| `providerOptions` | object | Provider-specific options as a JSON object (e.g. `{"seed": 42, "draft": true}`). Providers that declare a typed schema validate the keys and types; unknown keys or mismatches skip the candidate during fallback. Providers without a declared schema receive the options as-is. Run `video_generate action=list` to see what each provider accepts |
Not all providers support all parameters. OpenClaw already normalizes duration to the closest provider-supported value, and it also remaps translated geometry hints such as size-to-aspect-ratio when a fallback provider exposes a different control surface. Truly unsupported overrides are ignored on a best-effort basis and reported as warnings in the tool result. Hard capability limits (such as too many reference inputs) fail before submission.

View File

@@ -165,7 +165,7 @@ export function buildGoogleImageGenerationProvider(): ImageGenerationProvider {
: {}),
},
},
timeoutMs: 60_000,
timeoutMs: req.timeoutMs ?? 60_000,
fetchFn: fetch,
pinDns: false,
allowPrivateNetwork,

View File

@@ -753,6 +753,7 @@ export async function textToSpeech(params: {
channel?: string;
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
}): Promise<TtsResult> {
const synthesis = await synthesizeSpeech(params);
if (!synthesis.success || !synthesis.audioBuffer || !synthesis.fileExtension) {
@@ -791,6 +792,7 @@ export async function synthesizeSpeech(params: {
channel?: string;
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
}): Promise<TtsSynthesisResult> {
const setup = resolveTtsRequestSetup({
text: params.text,
@@ -804,6 +806,7 @@ export async function synthesizeSpeech(params: {
}
const { config, providers } = setup;
const timeoutMs = params.timeoutMs ?? config.timeoutMs;
const target = supportsNativeVoiceNoteTts(params.channel) ? "voice-note" : "audio-file";
const errors: string[] = [];
@@ -840,7 +843,7 @@ export async function synthesizeSpeech(params: {
providerConfig: resolvedProvider.providerConfig,
target,
providerOverrides: params.overrides?.providerOverrides?.[resolvedProvider.provider.id],
timeoutMs: config.timeoutMs,
timeoutMs,
});
const latencyMs = Date.now() - providerStart;
attempts.push({

View File

@@ -25,6 +25,7 @@ import {
buildMediaReferenceDetails,
isCapabilityProviderConfigured,
normalizeMediaReferenceInputs,
readGenerationTimeoutMs,
resolveCapabilityModelConfigForTool,
resolveGenerateAction,
resolveMediaToolLocalRoots,
@@ -108,6 +109,12 @@ const ImageGenerateToolSchema = Type.Object({
maximum: MAX_COUNT,
}),
),
timeoutMs: Type.Optional(
Type.Number({
description: "Optional provider request timeout in milliseconds.",
minimum: 1,
}),
),
});
function getImageGenerationProviderAuthEnvVars(providerId: string): string[] {
@@ -490,6 +497,7 @@ export function createImageGenerateTool(options?: {
const size = readStringParam(params, "size");
const aspectRatio = normalizeAspectRatio(readStringParam(params, "aspectRatio"));
const explicitResolution = normalizeResolution(readStringParam(params, "resolution"));
const timeoutMs = readGenerationTimeoutMs(params);
const selectedProvider = resolveSelectedImageGenerationProvider({
config: effectiveCfg,
imageGenerationModelConfig,
@@ -535,6 +543,7 @@ export function createImageGenerateTool(options?: {
resolution,
count,
inputImages,
timeoutMs,
});
const ignoredOverrides = result.ignoredOverrides ?? [];
const displayProvider = sanitizeInlineDirectiveText(result.provider);
@@ -617,6 +626,7 @@ export function createImageGenerateTool(options?: {
? { aspectRatio: normalizedAspectRatio ?? aspectRatio }
: {}),
...(filename ? { filename } : {}),
...(timeoutMs !== undefined ? { timeoutMs } : {}),
attempts: result.attempts,
...(result.normalization ? { normalization: result.normalization } : {}),
metadata: result.metadata,

View File

@@ -9,7 +9,12 @@ import {
} from "../../shared/string-coerce.js";
import { normalizeModelRef } from "../model-selection.js";
import { normalizeProviderId } from "../provider-id.js";
import { ToolInputError, readStringArrayParam, readStringParam } from "./common.js";
import {
ToolInputError,
readNumberParam,
readStringArrayParam,
readStringParam,
} from "./common.js";
import type { ImageModelConfig } from "./image-tool.helpers.js";
import {
buildToolModelConfigFromCandidates,
@@ -78,6 +83,20 @@ export function applyMusicGenerationModelConfigDefaults(
return applyAgentDefaultModelConfig(cfg, "musicGenerationModel", musicGenerationModelConfig);
}
/**
 * Reads the optional per-call `timeoutMs` argument of a generation tool.
 *
 * The value is fetched through `readNumberParam` with the `integer` and
 * `strict` options enabled.
 *
 * @param args - Raw tool-call arguments.
 * @returns The timeout in milliseconds, or `undefined` when `readNumberParam`
 *   yields no value.
 * @throws ToolInputError when the value read is zero or negative.
 */
export function readGenerationTimeoutMs(args: Record<string, unknown>): number | undefined {
  const requested = readNumberParam(args, "timeoutMs", { integer: true, strict: true });
  // An absent value passes straight through; only a present, non-positive one is rejected.
  if (requested !== undefined && requested <= 0) {
    throw new ToolInputError("timeoutMs must be a positive integer in milliseconds.");
  }
  return requested;
}
function applyAgentDefaultModelConfig(
cfg: OpenClawConfig | undefined,
key: "imageModel" | "imageGenerationModel" | "videoGenerationModel" | "musicGenerationModel",

View File

@@ -28,6 +28,7 @@ import {
buildTaskRunDetails,
normalizeMediaReferenceInputs,
readBooleanToolParam,
readGenerationTimeoutMs,
resolveCapabilityModelConfigForTool,
resolveGenerateAction,
resolveMediaToolLocalRoots,
@@ -98,6 +99,12 @@ const MusicGenerateToolSchema = Type.Object({
minimum: 1,
}),
),
timeoutMs: Type.Optional(
Type.Number({
description: "Optional provider request timeout in milliseconds.",
minimum: 1,
}),
),
format: Type.Optional(
Type.String({
description: 'Optional output format hint: "mp3" or "wav" when the provider supports it.',
@@ -336,6 +343,7 @@ async function executeMusicGenerationJob(params: {
filename?: string;
loadedReferenceImages: LoadedReferenceImage[];
taskHandle?: MusicGenerationTaskHandle | null;
timeoutMs?: number;
}): Promise<ExecutedMusicGeneration> {
if (params.taskHandle) {
recordMusicGenerationTaskProgress({
@@ -353,6 +361,7 @@ async function executeMusicGenerationJob(params: {
durationSeconds: params.durationSeconds,
format: params.format,
inputImages: params.loadedReferenceImages.map((entry) => entry.sourceImage),
timeoutMs: params.timeoutMs,
});
if (params.taskHandle) {
recordMusicGenerationTaskProgress({
@@ -437,6 +446,7 @@ async function executeMusicGenerationJob(params: {
: {}),
...(!ignoredOverrideKeys.has("format") && params.format ? { format: params.format } : {}),
...(params.filename ? { filename: params.filename } : {}),
...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}),
...buildMediaReferenceDetails({
entries: params.loadedReferenceImages,
singleKey: "image",
@@ -520,6 +530,7 @@ export function createMusicGenerateTool(options?: {
});
const format = normalizeOutputFormat(readStringParam(args, "format"));
const filename = readStringParam(args, "filename");
const timeoutMs = readGenerationTimeoutMs(args);
const imageInputs = normalizeReferenceImageInputs(args);
const selectedProvider = resolveSelectedMusicGenerationProvider({
config: effectiveCfg,
@@ -564,6 +575,7 @@ export function createMusicGenerateTool(options?: {
filename,
loadedReferenceImages,
taskHandle,
timeoutMs,
});
completeMusicGenerationTaskRun({
handle: taskHandle,
@@ -627,6 +639,7 @@ export function createMusicGenerateTool(options?: {
...(typeof durationSeconds === "number" ? { durationSeconds } : {}),
...(format ? { format } : {}),
...(filename ? { filename } : {}),
...(timeoutMs !== undefined ? { timeoutMs } : {}),
},
};
}
@@ -644,6 +657,7 @@ export function createMusicGenerateTool(options?: {
filename,
loadedReferenceImages,
taskHandle,
timeoutMs,
});
completeMusicGenerationTaskRun({
handle: taskHandle,

View File

@@ -43,6 +43,26 @@ describe("createTtsTool", () => {
expect(JSON.stringify(result.content)).not.toContain("MEDIA:");
});
// Verifies that a caller-supplied `timeoutMs` is forwarded to the speech
// generation call and echoed back in the tool result details.
it("passes an optional timeout to speech generation", async () => {
// Stub a successful synthesis so the tool takes its success path.
textToSpeechSpy.mockResolvedValue({
success: true,
audioPath: "/tmp/reply.opus",
provider: "test",
voiceCompatible: true,
});
const tool = createTtsTool();
const result = await tool.execute("call-1", { text: "hello", timeoutMs: 12_345 });
// The timeout must reach the speech call unchanged alongside the text.
expect(textToSpeechSpy).toHaveBeenCalledWith(
expect.objectContaining({
text: "hello",
timeoutMs: 12_345,
}),
);
// The requested timeout is also surfaced in the result details.
expect(result.details).toMatchObject({ timeoutMs: 12_345 });
});
it("echoes longer utterances verbatim into the tool-result content", async () => {
textToSpeechSpy.mockResolvedValue({
success: true,

View File

@@ -5,15 +5,35 @@ import type { OpenClawConfig } from "../../config/types.openclaw.js";
import { textToSpeech } from "../../tts/tts.js";
import type { GatewayMessageChannel } from "../../utils/message-channel.js";
import type { AnyAgentTool } from "./common.js";
import { readStringParam } from "./common.js";
import { ToolInputError, readNumberParam, readStringParam } from "./common.js";
/**
 * Parameter schema for the TTS tool: a required `text` string plus optional
 * `channel` and `timeoutMs` fields.
 */
const TtsToolSchema = Type.Object({
// Required: the text to synthesize.
text: Type.String({ description: "Text to convert to speech." }),
// Optional channel id.
channel: Type.Optional(
Type.String({ description: "Optional channel id to pick output format." }),
),
// Optional per-call timeout; the schema rejects values below 1.
timeoutMs: Type.Optional(
Type.Number({
description: "Optional provider request timeout in milliseconds.",
minimum: 1,
}),
),
});
/**
 * Reads the optional `timeoutMs` argument of a TTS tool call.
 *
 * The value is fetched through `readNumberParam` with the `integer` and
 * `strict` options enabled.
 *
 * @param args - Raw tool-call arguments.
 * @returns The timeout in milliseconds, or `undefined` when `readNumberParam`
 *   yields no value.
 * @throws ToolInputError when the value read is zero or negative.
 */
function readTtsTimeoutMs(args: Record<string, unknown>): number | undefined {
  const value = readNumberParam(args, "timeoutMs", { integer: true, strict: true });
  // Happy path first: nothing supplied, or a value that is not <= 0.
  if (value === undefined || !(value <= 0)) {
    return value;
  }
  throw new ToolInputError("timeoutMs must be a positive integer in milliseconds.");
}
/**
* Defuse reply-directive tokens inside spoken transcripts before they flow
* through tool-result content. When verbose tool output is enabled,
@@ -48,11 +68,13 @@ export function createTtsTool(opts?: {
const params = args as Record<string, unknown>;
const text = readStringParam(params, "text", { required: true });
const channel = readStringParam(params, "channel");
const timeoutMs = readTtsTimeoutMs(params);
const cfg = opts?.config ?? loadConfig();
const result = await textToSpeech({
text,
cfg,
channel: channel ?? opts?.agentChannel,
timeoutMs,
});
if (result.success && result.audioPath) {
@@ -66,6 +88,7 @@ export function createTtsTool(opts?: {
details: {
audioPath: result.audioPath,
provider: result.provider,
...(timeoutMs !== undefined ? { timeoutMs } : {}),
media: {
mediaUrl: result.audioPath,
trustedLocalMedia: true,

View File

@@ -32,6 +32,7 @@ import {
buildTaskRunDetails,
normalizeMediaReferenceInputs,
readBooleanToolParam,
readGenerationTimeoutMs,
resolveCapabilityModelConfigForTool,
resolveGenerateAction,
resolveMediaToolLocalRoots,
@@ -205,6 +206,12 @@ const VideoGenerateToolSchema = Type.Object({
"keys each provider accepts.",
}),
),
timeoutMs: Type.Optional(
Type.Number({
description: "Optional provider request timeout in milliseconds.",
minimum: 1,
}),
),
});
export function resolveVideoGenerationModelConfigForTool(params: {
@@ -562,6 +569,7 @@ async function executeVideoGenerationJob(params: {
loadedReferenceAudios: LoadedReferenceAsset[];
taskHandle?: VideoGenerationTaskHandle | null;
providerOptions?: Record<string, unknown>;
timeoutMs?: number;
}): Promise<ExecutedVideoGeneration> {
if (params.taskHandle) {
recordVideoGenerationTaskProgress({
@@ -584,6 +592,7 @@ async function executeVideoGenerationJob(params: {
inputVideos: params.loadedReferenceVideos.map((entry) => entry.sourceAsset),
inputAudios: params.loadedReferenceAudios.map((entry) => entry.sourceAsset),
providerOptions: params.providerOptions,
timeoutMs: params.timeoutMs,
});
if (params.taskHandle) {
recordVideoGenerationTaskProgress({
@@ -747,6 +756,7 @@ async function executeVideoGenerationJob(params: {
? { watermark: params.watermark }
: {}),
...(params.filename ? { filename: params.filename } : {}),
...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}),
attempts: result.attempts,
...(result.normalization ? { normalization: result.normalization } : {}),
metadata: result.metadata,
@@ -825,6 +835,7 @@ export function createVideoGenerateTool(options?: {
});
const audio = readBooleanToolParam(args, "audio");
const watermark = readBooleanToolParam(args, "watermark");
const timeoutMs = readGenerationTimeoutMs(args);
// providerOptions must be a plain object. Arrays are objects in JS, so
// exclude them explicitly — a bogus call like `providerOptions: ["seed", 42]`
// would otherwise be cast to `Record<string, unknown>` with numeric-string
@@ -960,6 +971,7 @@ export function createVideoGenerateTool(options?: {
loadedReferenceAudios,
taskHandle,
providerOptions,
timeoutMs,
});
completeVideoGenerationTaskRun({
handle: taskHandle,
@@ -1032,6 +1044,7 @@ export function createVideoGenerateTool(options?: {
...(typeof audio === "boolean" ? { audio } : {}),
...(typeof watermark === "boolean" ? { watermark } : {}),
...(filename ? { filename } : {}),
...(timeoutMs !== undefined ? { timeoutMs } : {}),
},
};
}
@@ -1054,6 +1067,7 @@ export function createVideoGenerateTool(options?: {
loadedReferenceAudios,
taskHandle,
providerOptions,
timeoutMs,
});
completeVideoGenerationTaskRun({
handle: taskHandle,

View File

@@ -21,6 +21,8 @@ export type GenerateImageParams = {
aspectRatio?: string;
resolution?: ImageGenerationResolution;
inputImages?: ImageGenerationSourceImage[];
/** Optional per-request provider timeout in milliseconds. */
timeoutMs?: number;
};
export type GenerateImageRuntimeResult = {

View File

@@ -26,6 +26,7 @@ describe("image-generation runtime", () => {
it("generates images through the active image-generation provider", async () => {
const authStore = { version: 1, profiles: {} } as const;
let seenAuthStore: unknown;
let seenTimeoutMs: number | undefined;
mocks.resolveAgentModelPrimaryValue.mockReturnValue("image-plugin/img-v1");
const provider: ImageGenerationProvider = {
id: "image-plugin",
@@ -33,8 +34,9 @@ describe("image-generation runtime", () => {
generate: {},
edit: { enabled: false },
},
async generateImage(req: { authStore?: unknown }) {
async generateImage(req: { authStore?: unknown; timeoutMs?: number }) {
seenAuthStore = req.authStore;
seenTimeoutMs = req.timeoutMs;
return {
images: [
{
@@ -60,12 +62,14 @@ describe("image-generation runtime", () => {
prompt: "draw a cat",
agentDir: "/tmp/agent",
authStore,
timeoutMs: 12_345,
});
expect(result.provider).toBe("image-plugin");
expect(result.model).toBe("img-v1");
expect(result.attempts).toEqual([]);
expect(seenAuthStore).toEqual(authStore);
expect(seenTimeoutMs).toBe(12_345);
expect(result.images).toEqual([
{
buffer: Buffer.from("png-bytes"),

View File

@@ -85,6 +85,7 @@ export async function generateImage(
aspectRatio: sanitized.aspectRatio,
resolution: sanitized.resolution,
inputImages: params.inputImages,
...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}),
});
if (!Array.isArray(result.images) || result.images.length === 0) {
throw new Error("Image generation provider returned no images.");

View File

@@ -21,6 +21,8 @@ export type GenerateMusicParams = {
durationSeconds?: number;
format?: MusicGenerationOutputFormat;
inputImages?: MusicGenerationSourceImage[];
/** Optional per-request provider timeout in milliseconds. */
timeoutMs?: number;
};
export type GenerateMusicRuntimeResult = {

View File

@@ -26,12 +26,14 @@ describe("music-generation runtime", () => {
it("generates tracks through the active music-generation provider", async () => {
const authStore = { version: 1, profiles: {} } as const;
let seenAuthStore: unknown;
let seenTimeoutMs: number | undefined;
mocks.resolveAgentModelPrimaryValue.mockReturnValue("music-plugin/track-v1");
const provider: MusicGenerationProvider = {
id: "music-plugin",
capabilities: {},
async generateMusic(req: { authStore?: unknown }) {
async generateMusic(req: { authStore?: unknown; timeoutMs?: number }) {
seenAuthStore = req.authStore;
seenTimeoutMs = req.timeoutMs;
return {
tracks: [
{
@@ -57,6 +59,7 @@ describe("music-generation runtime", () => {
prompt: "play a synth line",
agentDir: "/tmp/agent",
authStore,
timeoutMs: 12_345,
});
expect(result.provider).toBe("music-plugin");
@@ -64,6 +67,7 @@ describe("music-generation runtime", () => {
expect(result.attempts).toEqual([]);
expect(result.ignoredOverrides).toEqual([]);
expect(seenAuthStore).toEqual(authStore);
expect(seenTimeoutMs).toBe(12_345);
expect(result.tracks).toEqual([
{
buffer: Buffer.from("mp3-bytes"),

View File

@@ -82,6 +82,7 @@ export async function generateMusic(
durationSeconds: sanitized.durationSeconds,
format: sanitized.format,
inputImages: params.inputImages,
...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}),
});
if (!Array.isArray(result.tracks) || result.tracks.length === 0) {
throw new Error("Music generation provider returned no tracks.");

View File

@@ -69,6 +69,7 @@ export type TtsRequestParams = {
channel?: string;
overrides?: TtsDirectiveOverrides;
disableFallback?: boolean;
timeoutMs?: number;
};
export type TtsTelephonyRequestParams = {

View File

@@ -27,6 +27,8 @@ export type GenerateVideoParams = {
inputAudios?: VideoGenerationSourceAsset[];
/** Arbitrary provider-specific options forwarded as-is to provider.generateVideo. */
providerOptions?: Record<string, unknown>;
/** Optional per-request provider timeout in milliseconds. */
timeoutMs?: number;
};
export type GenerateVideoRuntimeResult = {

View File

@@ -43,12 +43,14 @@ describe("video-generation runtime", () => {
it("generates videos through the active video-generation provider", async () => {
const authStore = { version: 1, profiles: {} } as const;
let seenAuthStore: unknown;
let seenTimeoutMs: number | undefined;
mocks.resolveAgentModelPrimaryValue.mockReturnValue("video-plugin/vid-v1");
const provider: VideoGenerationProvider = {
id: "video-plugin",
capabilities: {},
async generateVideo(req: { authStore?: unknown }) {
async generateVideo(req: { authStore?: unknown; timeoutMs?: number }) {
seenAuthStore = req.authStore;
seenTimeoutMs = req.timeoutMs;
return {
videos: [
{
@@ -74,6 +76,7 @@ describe("video-generation runtime", () => {
prompt: "animate a cat",
agentDir: "/tmp/agent",
authStore,
timeoutMs: 12_345,
});
expect(result.provider).toBe("video-plugin");
@@ -81,6 +84,7 @@ describe("video-generation runtime", () => {
expect(result.attempts).toEqual([]);
expect(result.ignoredOverrides).toEqual([]);
expect(seenAuthStore).toEqual(authStore);
expect(seenTimeoutMs).toBe(12_345);
expect(result.videos).toEqual([
{
buffer: Buffer.from("mp4-bytes"),

View File

@@ -260,6 +260,7 @@ export async function generateVideo(
inputVideos: params.inputVideos,
inputAudios: params.inputAudios,
providerOptions: params.providerOptions,
...(params.timeoutMs !== undefined ? { timeoutMs: params.timeoutMs } : {}),
});
if (!Array.isArray(result.videos) || result.videos.length === 0) {
throw new Error("Video generation provider returned no videos.");