mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 11:10:45 +00:00
pdf: add Codex instructions for extraction fallback (#51329)
* Fix Codex PDF extraction fallback missing instructions
  - add a Codex-specific systemPrompt on the PDF extraction fallback path
  - keep non-Codex PDF fallback requests unchanged
  - add regression coverage proving openai-codex-responses requests include instructions for PDF tool calls

* test: cover Codex text-only extraction fallback
  - add regression coverage for the branch where PDF extraction includes images but the selected Codex model only accepts text input
  - assert Codex-specific extraction instructions are still attached in that path

* test: fix extracted image mock shape
  - add the required `type: "image"` field to the text-only fallback regression mock
  - keep the new Codex coverage test aligned with PdfExtractedImage

* test: align Codex PDF fallback tests

* docs(changelog): note PDF Codex fallback fix

---------

Co-authored-by: Dr JCai <jingxiao.cai@gmail.com>
Co-authored-by: anyech <8743351+anyech@users.noreply.github.com>
This commit is contained in:
@@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai
|
||||
### Fixes
|
||||
|
||||
- Google Meet/Voice Call: wait longer before playing PIN-derived Twilio DTMF for Meet dial-in prompts and retire stale delegated phone sessions instead of reusing completed calls.
|
||||
- PDF/Codex: include extraction-fallback instructions for `openai-codex/*` PDF tool requests so Codex Responses receives its required system prompt. Fixes #77872. Thanks @anyech.
|
||||
- Onboard/channels: recover externalized channel plugins from stale `channels.<id>` config by falling back to `ensureChannelSetupPluginInstalled` via the trusted catalog when the plugin is missing on disk, so leftover `appId`/token entries no longer dead-end onboard with "<channel> plugin not available." (#78328) Thanks @sliverp.
|
||||
- Codex/app-server: forward the OpenClaw workspace bootstrap block through Codex `developerInstructions` instead of `config.instructions`, so persona/style guidance reaches the behavior-shaping app-server lane. Fixes #77363. Thanks @lonexreb.
|
||||
- CLI/infer: pass minimal instructions to local `openai-codex/*` model probes and surface provider error details when `infer model run` returns no text. Fixes #76464. Thanks @lilesjtu.
|
||||
|
||||
@@ -36,6 +36,7 @@ async function loadCreatePdfTool() {
|
||||
|
||||
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
|
||||
const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
|
||||
const CODEX_PDF_MODEL = "openai-codex/gpt-5.4";
|
||||
const FAKE_PDF_MEDIA = {
|
||||
kind: "document",
|
||||
buffer: Buffer.from("%PDF-1.4 fake"),
|
||||
@@ -85,6 +86,7 @@ async function stubPdfToolInfra(
|
||||
mockLoad?: boolean;
|
||||
provider?: string;
|
||||
input?: string[];
|
||||
api?: string;
|
||||
modelFound?: boolean;
|
||||
},
|
||||
) {
|
||||
@@ -102,6 +104,13 @@ async function stubPdfToolInfra(
|
||||
: () =>
|
||||
({
|
||||
provider: params?.provider ?? "anthropic",
|
||||
api:
|
||||
params?.api ??
|
||||
(params?.provider === "openai-codex"
|
||||
? "openai-codex-responses"
|
||||
: params?.provider === "openai"
|
||||
? "openai-responses"
|
||||
: "anthropic-messages"),
|
||||
maxTokens: 8192,
|
||||
input: params?.input ?? ["text", "document"],
|
||||
}) as never;
|
||||
@@ -469,6 +478,82 @@ describe("createPdfTool", () => {
|
||||
content: [{ type: "text", text: "fallback summary" }],
|
||||
details: { native: false, model: OPENAI_PDF_MODEL },
|
||||
});
|
||||
const [, context] = completeMock.mock.calls[0] ?? [];
|
||||
expect(context?.systemPrompt).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
it("adds Codex instructions for PDF extraction fallback requests", async () => {
|
||||
await withTempPdfAgentDir(async (agentDir) => {
|
||||
await stubPdfToolInfra(agentDir, {
|
||||
provider: "openai-codex",
|
||||
api: "openai-codex-responses",
|
||||
input: ["text", "image"],
|
||||
});
|
||||
|
||||
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
|
||||
text: "Extracted content",
|
||||
images: [],
|
||||
});
|
||||
|
||||
completeMock.mockResolvedValue({
|
||||
role: "assistant",
|
||||
stopReason: "stop",
|
||||
content: [{ type: "text", text: "codex summary" }],
|
||||
} as never);
|
||||
|
||||
const cfg = withPdfModel(CODEX_PDF_MODEL);
|
||||
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
|
||||
|
||||
const result = await tool.execute("t1", {
|
||||
prompt: "summarize",
|
||||
pdf: "/tmp/doc.pdf",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
content: [{ type: "text", text: "codex summary" }],
|
||||
details: { native: false, model: CODEX_PDF_MODEL },
|
||||
});
|
||||
expect(completeMock).toHaveBeenCalledTimes(1);
|
||||
const [, context] = completeMock.mock.calls[0] ?? [];
|
||||
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
|
||||
});
|
||||
});
|
||||
|
||||
it("adds Codex instructions when extraction has images but the model only accepts text", async () => {
|
||||
await withTempPdfAgentDir(async (agentDir) => {
|
||||
await stubPdfToolInfra(agentDir, {
|
||||
provider: "openai-codex",
|
||||
api: "openai-codex-responses",
|
||||
input: ["text"],
|
||||
});
|
||||
|
||||
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
|
||||
text: "Extracted content",
|
||||
images: [{ type: "image", data: "base64img", mimeType: "image/png" }],
|
||||
});
|
||||
|
||||
completeMock.mockResolvedValue({
|
||||
role: "assistant",
|
||||
stopReason: "stop",
|
||||
content: [{ type: "text", text: "codex summary" }],
|
||||
} as never);
|
||||
|
||||
const cfg = withPdfModel(CODEX_PDF_MODEL);
|
||||
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
|
||||
|
||||
const result = await tool.execute("t1", {
|
||||
prompt: "summarize",
|
||||
pdf: "/tmp/doc.pdf",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
content: [{ type: "text", text: "codex summary" }],
|
||||
details: { native: false, model: CODEX_PDF_MODEL },
|
||||
});
|
||||
expect(completeMock).toHaveBeenCalledTimes(1);
|
||||
const [, context] = completeMock.mock.calls[0] ?? [];
|
||||
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@@ -90,7 +90,14 @@ function hasExplicitPdfToolModelConfig(config?: OpenClawConfig): boolean {
|
||||
// Build context for extraction fallback path
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context {
|
||||
const CODEX_PDF_INSTRUCTIONS =
|
||||
"Analyze the provided PDF content and answer the user's request accurately.";
|
||||
|
||||
function buildPdfExtractionContext(
|
||||
prompt: string,
|
||||
extractions: PdfExtractedContent[],
|
||||
model?: { api?: string },
|
||||
): Context {
|
||||
const content: Array<
|
||||
{ type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
|
||||
> = [];
|
||||
@@ -110,7 +117,10 @@ function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedCont
|
||||
// Add the user prompt
|
||||
content.push({ type: "text", text: prompt });
|
||||
|
||||
const systemPrompt = model?.api === "openai-codex-responses" ? CODEX_PDF_INSTRUCTIONS : undefined;
|
||||
|
||||
return {
|
||||
...(systemPrompt ? { systemPrompt } : {}),
|
||||
messages: [{ role: "user", content, timestamp: Date.now() }],
|
||||
};
|
||||
}
|
||||
@@ -217,7 +227,7 @@ async function runPdfPrompt(params: {
|
||||
text: e.text,
|
||||
images: [],
|
||||
}));
|
||||
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions);
|
||||
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions, model);
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
|
||||
@@ -226,7 +236,7 @@ async function runPdfPrompt(params: {
|
||||
return { text, provider, model: modelId, native: false };
|
||||
}
|
||||
|
||||
const context = buildPdfExtractionContext(params.prompt, extractions);
|
||||
const context = buildPdfExtractionContext(params.prompt, extractions, model);
|
||||
const message = await complete(model, context, {
|
||||
apiKey,
|
||||
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
|
||||
|
||||
Reference in New Issue
Block a user