pdf: add Codex instructions for extraction fallback (#51329)

* Fix missing instructions in the Codex PDF extraction fallback

- add a Codex-specific systemPrompt on the PDF extraction fallback path
- keep non-Codex PDF fallback requests unchanged
- add regression coverage proving openai-codex-responses requests include instructions for PDF tool calls

* test: cover Codex text-only extraction fallback

- add regression coverage for the branch where PDF extraction includes images
  but the selected Codex model only accepts text input
- assert Codex-specific extraction instructions are still attached in that path

* test: fix extracted image mock shape

- add the required `type: "image"` field to the text-only fallback regression mock
- keep the new Codex coverage test aligned with PdfExtractedImage

* test: align Codex PDF fallback tests

* docs(changelog): note PDF Codex fallback fix

---------

Co-authored-by: Dr JCai <jingxiao.cai@gmail.com>
Co-authored-by: anyech <8743351+anyech@users.noreply.github.com>
This commit is contained in:
JC
2026-05-06 01:34:42 -07:00
committed by GitHub
parent 674c447264
commit 85ded4d444
3 changed files with 99 additions and 3 deletions

View File

@@ -111,6 +111,7 @@ Docs: https://docs.openclaw.ai
### Fixes
- Google Meet/Voice Call: wait longer before playing PIN-derived Twilio DTMF for Meet dial-in prompts and retire stale delegated phone sessions instead of reusing completed calls.
- PDF/Codex: include extraction-fallback instructions for `openai-codex/*` PDF tool requests so Codex Responses receives its required system prompt. Fixes #77872. Thanks @anyech.
- Onboard/channels: recover externalized channel plugins from stale `channels.<id>` config by falling back to `ensureChannelSetupPluginInstalled` via the trusted catalog when the plugin is missing on disk, so leftover `appId`/token entries no longer dead-end onboard with "<channel> plugin not available." (#78328) Thanks @sliverp.
- Codex/app-server: forward the OpenClaw workspace bootstrap block through Codex `developerInstructions` instead of `config.instructions`, so persona/style guidance reaches the behavior-shaping app-server lane. Fixes #77363. Thanks @lonexreb.
- CLI/infer: pass minimal instructions to local `openai-codex/*` model probes and surface provider error details when `infer model run` returns no text. Fixes #76464. Thanks @lilesjtu.

View File

@@ -36,6 +36,7 @@ async function loadCreatePdfTool() {
const ANTHROPIC_PDF_MODEL = "anthropic/claude-opus-4-6";
const OPENAI_PDF_MODEL = "openai/gpt-5.4-mini";
const CODEX_PDF_MODEL = "openai-codex/gpt-5.4";
const FAKE_PDF_MEDIA = {
kind: "document",
buffer: Buffer.from("%PDF-1.4 fake"),
@@ -85,6 +86,7 @@ async function stubPdfToolInfra(
mockLoad?: boolean;
provider?: string;
input?: string[];
api?: string;
modelFound?: boolean;
},
) {
@@ -102,6 +104,13 @@ async function stubPdfToolInfra(
: () =>
({
provider: params?.provider ?? "anthropic",
api:
params?.api ??
(params?.provider === "openai-codex"
? "openai-codex-responses"
: params?.provider === "openai"
? "openai-responses"
: "anthropic-messages"),
maxTokens: 8192,
input: params?.input ?? ["text", "document"],
}) as never;
@@ -469,6 +478,82 @@ describe("createPdfTool", () => {
content: [{ type: "text", text: "fallback summary" }],
details: { native: false, model: OPENAI_PDF_MODEL },
});
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toBeUndefined();
});
});
it("adds Codex instructions for PDF extraction fallback requests", async () => {
await withTempPdfAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, {
provider: "openai-codex",
api: "openai-codex-responses",
input: ["text", "image"],
});
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
text: "Extracted content",
images: [],
});
completeMock.mockResolvedValue({
role: "assistant",
stopReason: "stop",
content: [{ type: "text", text: "codex summary" }],
} as never);
const cfg = withPdfModel(CODEX_PDF_MODEL);
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
pdf: "/tmp/doc.pdf",
});
expect(result).toMatchObject({
content: [{ type: "text", text: "codex summary" }],
details: { native: false, model: CODEX_PDF_MODEL },
});
expect(completeMock).toHaveBeenCalledTimes(1);
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
});
});
it("adds Codex instructions when extraction has images but the model only accepts text", async () => {
await withTempPdfAgentDir(async (agentDir) => {
await stubPdfToolInfra(agentDir, {
provider: "openai-codex",
api: "openai-codex-responses",
input: ["text"],
});
vi.spyOn(pdfExtractModule, "extractPdfContent").mockResolvedValue({
text: "Extracted content",
images: [{ type: "image", data: "base64img", mimeType: "image/png" }],
});
completeMock.mockResolvedValue({
role: "assistant",
stopReason: "stop",
content: [{ type: "text", text: "codex summary" }],
} as never);
const cfg = withPdfModel(CODEX_PDF_MODEL);
const tool = requirePdfTool((await loadCreatePdfTool())({ config: cfg, agentDir }));
const result = await tool.execute("t1", {
prompt: "summarize",
pdf: "/tmp/doc.pdf",
});
expect(result).toMatchObject({
content: [{ type: "text", text: "codex summary" }],
details: { native: false, model: CODEX_PDF_MODEL },
});
expect(completeMock).toHaveBeenCalledTimes(1);
const [, context] = completeMock.mock.calls[0] ?? [];
expect(context?.systemPrompt).toContain("Analyze the provided PDF content");
});
});

View File

@@ -90,7 +90,14 @@ function hasExplicitPdfToolModelConfig(config?: OpenClawConfig): boolean {
// Build context for extraction fallback path
// ---------------------------------------------------------------------------
function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedContent[]): Context {
const CODEX_PDF_INSTRUCTIONS =
"Analyze the provided PDF content and answer the user's request accurately.";
function buildPdfExtractionContext(
prompt: string,
extractions: PdfExtractedContent[],
model?: { api?: string },
): Context {
const content: Array<
{ type: "text"; text: string } | { type: "image"; data: string; mimeType: string }
> = [];
@@ -110,7 +117,10 @@ function buildPdfExtractionContext(prompt: string, extractions: PdfExtractedCont
// Add the user prompt
content.push({ type: "text", text: prompt });
const systemPrompt = model?.api === "openai-codex-responses" ? CODEX_PDF_INSTRUCTIONS : undefined;
return {
...(systemPrompt ? { systemPrompt } : {}),
messages: [{ role: "user", content, timestamp: Date.now() }],
};
}
@@ -217,7 +227,7 @@ async function runPdfPrompt(params: {
text: e.text,
images: [],
}));
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions);
const context = buildPdfExtractionContext(params.prompt, textOnlyExtractions, model);
const message = await complete(model, context, {
apiKey,
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),
@@ -226,7 +236,7 @@ async function runPdfPrompt(params: {
return { text, provider, model: modelId, native: false };
}
const context = buildPdfExtractionContext(params.prompt, extractions);
const context = buildPdfExtractionContext(params.prompt, extractions, model);
const message = await complete(model, context, {
apiKey,
maxTokens: resolvePdfToolMaxTokens(model.maxTokens),