pdf: add Codex instructions for extraction fallback (#51329)

* Fix missing instructions in the Codex PDF extraction fallback

- add a Codex-specific systemPrompt on the PDF extraction fallback path
- keep non-Codex PDF fallback requests unchanged
- add regression coverage proving openai-codex-responses requests include instructions for PDF tool calls

* test: cover Codex text-only extraction fallback

- add regression coverage for the branch where PDF extraction includes images
  but the selected Codex model only accepts text input
- assert Codex-specific extraction instructions are still attached in that path

* test: fix extracted image mock shape

- add the required `type: "image"` field to the text-only fallback regression mock
- keep the new Codex coverage test aligned with PdfExtractedImage

* test: align Codex PDF fallback tests

* docs(changelog): note PDF Codex fallback fix

---------

Co-authored-by: Dr JCai <jingxiao.cai@gmail.com>
Co-authored-by: anyech <8743351+anyech@users.noreply.github.com>
This commit is contained in:
JC
2026-05-06 01:34:42 -07:00
committed by GitHub
parent 674c447264
commit 85ded4d444
3 changed files with 99 additions and 3 deletions

View File

@@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Google Meet/Voice Call: wait longer before playing PIN-derived Twilio DTMF for Meet dial-in prompts and retire stale delegated phone sessions instead of reusing completed calls.
- PDF/Codex: include extraction-fallback instructions for `openai-codex/*` PDF tool requests so Codex Responses receives its required system prompt. Fixes #77872. Thanks @anyech.
- Onboard/channels: recover externalized channel plugins from stale `channels.<id>` config by falling back to `ensureChannelSetupPluginInstalled` via the trusted catalog when the plugin is missing on disk, so leftover `appId`/token entries no longer dead-end onboard with "<channel> plugin not available." (#78328) Thanks @sliverp.
- Codex/app-server: forward the OpenClaw workspace bootstrap block through Codex `developerInstructions` instead of `config.instructions`, so persona/style guidance reaches the behavior-shaping app-server lane. Fixes #77363. Thanks @lonexreb.
- CLI/infer: pass minimal instructions to local `openai-codex/*` model probes and surface provider error details when `infer model run` returns no text. Fixes #76464. Thanks @lilesjtu.

View File

@@ -36,6 +36,7 @@ async function loadCreatePdfTool() {
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
const CODEX_PDF_MODEL = "openai-codex/gpt-5.4";
const FAKE_PDF_MEDIA = {
kind: "document",
buffer: Buffer.from("%PDF-1.4 fake"),
@@ -85,6 +86,7 @@ async function stubPdfToolInfra(
mockLoad?: boolean;
provider?: string;
input?: string[];
api?: string;
modelFound?: boolean;
},
) {
@@ -102,6 +104,13 @@ async function stubPdfToolInfra(
: () =>
({
provider: params?.provider ?? "anthropic",
api:
params?.api ??
(params?.provider === "openai-codex"
? "openai-codex-responses"
: params?.provider === "openai"
? "openai-responses"
: "anthropic-messages"),
maxTokens: 8192,
input: params?.input ?? ["text", "document"],
}) as never;
@@ -469,6 +478,82 @@ describe("createPdfTool", () => {
content: [{ type: "text", text: "fallback summary" }],
details: { native: false, model: OPENAI_PDF_MODEL },
});
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toBeUndefined();
});
});
it("adds Codex instructions for PDF extraction fallback requests", async () => {
await withTempPdfAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, {
provider: "openai-codex",
api: "openai-codex-responses",
input: ["text", "image"],
});
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
text: "Extracted content",
images: [],
});
completeMock.mockResolvedValue({
role: "assistant",
stopReason: "stop",
content: [{ type: "text", text: "codex summary" }],
} as never);
const cfg = withPdfModel(CODEX_PDF_MODEL);
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
pdf: "/tmp/doc.pdf",
});
expect(result).toMatchObject({
content: [{ type: "text", text: "codex summary" }],
details: { native: false, model: CODEX_PDF_MODEL },
});
expect(completeMock).toHaveBeenCalledTimes(1);
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
});
});
it("adds Codex instructions when extraction has images but the model only accepts text", async () => {
await withTempPdfAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, {
provider: "openai-codex",
api: "openai-codex-responses",
input: ["text"],
});
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
text: "Extracted content",
images: [{ type: "image", data: "base64img", mimeType: "image/png" }],
});
completeMock.mockResolvedValue({
role: "assistant",
stopReason: "stop",
content: [{ type: "text", text: "codex summary" }],
} as never);
const cfg = withPdfModel(CODEX_PDF_MODEL);
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
pdf: "/tmp/doc.pdf",
});
expect(result).toMatchObject({
content: [{ type: "text", text: "codex summary" }],
details: { native: false, model: CODEX_PDF_MODEL },
});
expect(completeMock).toHaveBeenCalledTimes(1);
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
});
});

View File

@@ -90,7 +90,14 @@ function hasExplicitPdfToolModelConfig(config?: OpenClawConfig): boolean {
// Build context for extraction fallback path
// ---------------------------------------------------------------------------
function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context {
const CODEX_PDF_INSTRUCTIONS =
"Analyze the provided PDF content and answer the user's request accurately.";
function buildPdfExtractionContext(
prompt: string,
extractions: PdfExtractedContent[],
model?: { api?: string },
): Context {
const content: Array<
{ type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
> = [];
@@ -110,7 +117,10 @@ function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedCont
// Add the user prompt
content.push({ type: "text", text: prompt });
const systemPrompt = model?.api === "openai-codex-responses" ? CODEX_PDF_INSTRUCTIONS : undefined;
return {
...(systemPrompt ? { systemPrompt } : {}),
messages: [{ role: "user", content, timestamp: Date.now() }],
};
}
@@ -217,7 +227,7 @@ async function runPdfPrompt(params: {
text: e.text,
images: [],
}));
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions);
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions, model);
const message = await complete(model, context, {
apiKey,
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
@@ -226,7 +236,7 @@ async function runPdfPrompt(params: {
return { text, provider, model: modelId, native: false };
}
const context = buildPdfExtractionContext(params.prompt, extractions);
const context = buildPdfExtractionContext(params.prompt, extractions, model);
const message = await complete(model, context, {
apiKey,
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),