fix(openai-completions): seal native reasoning before the answer under /reasoning on (#95283)

* fix(openai-completions): seal native reasoning before the answer

  DeepSeek-style providers stream reasoning via reasoning_content deltas, then switch to the answer via content deltas with no boundary event. thinking_end was only emitted by the end-of-stream finishBlock loop, so it landed after the answer's text_delta, and channels merged the answer into the reasoning block.

  Seal the open native thinking block when visible text (or a tool call) begins so that thinking_end precedes the answer; tag-based <think> reasoning is unaffected (it is closed by the partitioner). finishBlock is now idempotent, so the end-of-stream loop never re-emits thinking_end.

* fix(openai-completions): preserve co-streamed reasoning
* fix(openai-completions): order co-streamed reasoning
* fix(openai-completions): seal co-streamed reasoning

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
2026-06-26 15:59:31 +00:00 · 2026-06-22 14:33:44 +08:00
parent 984efdb0b6
commit 1bd85e3cc3
2 changed files with 145 additions and 9 deletions
--- a/src/llm/providers/openai-completions.test.ts
+++ b/src/llm/providers/openai-completions.test.ts
@@ -799,6 +799,123 @@ describe("openai-completions stop-reason tool-call guard", () => {
    expect(result.content.some((block) => block.type === "thinking")).toBe(false);
  });

+  it("seals the native reasoning block before the answer text begins", async () => {
+    // Regression test: deepseek streams reasoning_content, then switches to
+    // content with no boundary event; thinking_end must precede the answer so
+    // channels do not merge the answer into the reasoning block.
+    mockChunksRef.chunks = [
+      {
+        id: "chatcmpl-test",
+        choices: [{ index: 0, delta: { reasoning_content: "Let me think." } }],
+      },
+      {
+        id: "chatcmpl-test",
+        choices: [{ index: 0, delta: { reasoning_content: " Still thinking." } }],
+      },
+      makeTextChunk("The answer"), // first content delta: the lane switch under test
+      makeTextChunk(" is 42."),
+      makeFinishChunk("stop"),
+    ];
+
+    const stream = streamOpenAICompletions(reasoningModel, context, {
+      apiKey: "sk-test",
+      reasoningEffort: "medium",
+    });
+    const eventTypes: string[] = []; // event types only, in emission order
+    for await (const event of stream as AsyncIterable<{ type: string }>) {
+      eventTypes.push(event.type);
+    }
+    const result = await stream.result();
+
+    const thinkingEndIndex = eventTypes.indexOf("thinking_end");
+    const textStartIndex = eventTypes.indexOf("text_start");
+    const firstTextDeltaIndex = eventTypes.indexOf("text_delta"); // -1 when no text_delta was seen
+    expect(thinkingEndIndex).toBeGreaterThanOrEqual(0);
+    expect(textStartIndex).toBeGreaterThanOrEqual(0);
+    expect(thinkingEndIndex).toBeLessThan(textStartIndex);
+    expect(thinkingEndIndex).toBeLessThan(firstTextDeltaIndex); // also fails if text_delta is missing (-1)
+    // thinking_end must be emitted exactly once even though the block is also
+    // visited by the end-of-stream finish loop.
+    expect(eventTypes.filter((type) => type === "thinking_end")).toHaveLength(1);
+
+    expect(result.content).toContainEqual({
+      type: "thinking",
+      thinking: "Let me think. Still thinking.", // both reasoning deltas, concatenated
+      thinkingSignature: "reasoning_content",
+    });
+    expect(result.content).toContainEqual({ type: "text", text: "The answer is 42." });
+  });
+
+  it("seals the native reasoning block before a following tool call", async () => { // reasoning -> tool call, no text lane
+    mockChunksRef.chunks = [
+      {
+        id: "chatcmpl-test",
+        choices: [{ index: 0, delta: { reasoning_content: "I should call a tool." } }],
+      },
+      makeToolCallChunk("call_1", "bash", '{"cmd":"ls"}'), // tool-call delta directly after the reasoning delta
+      makeFinishChunk("tool_calls"),
+    ];
+
+    const stream = streamOpenAICompletions(reasoningModel, context, {
+      apiKey: "sk-test",
+      reasoningEffort: "medium",
+    });
+    const eventTypes: string[] = []; // event types only, in emission order
+    for await (const event of stream as AsyncIterable<{ type: string }>) {
+      eventTypes.push(event.type);
+    }
+    await stream.result(); // result is awaited but not inspected; only event order is asserted
+
+    const thinkingEndIndex = eventTypes.indexOf("thinking_end");
+    const toolCallStartIndex = eventTypes.indexOf("toolcall_start");
+    expect(thinkingEndIndex).toBeGreaterThanOrEqual(0);
+    expect(toolCallStartIndex).toBeGreaterThanOrEqual(0);
+    expect(thinkingEndIndex).toBeLessThan(toolCallStartIndex); // thought is closed before the action starts
+    expect(eventTypes.filter((type) => type === "thinking_end")).toHaveLength(1); // no duplicate thinking_end
+  });
+
+  it("keeps one native reasoning block when content and reasoning co-occur", async () => { // one delta carries both lanes
+    mockChunksRef.chunks = [
+      {
+        id: "chatcmpl-test",
+        choices: [{ index: 0, delta: { reasoning_content: "First thought." } }],
+      },
+      {
+        id: "chatcmpl-test",
+        choices: [
+          {
+            index: 0,
+            delta: { // same delta holds visible content and more reasoning
+              content: "Visible text that shares the reasoning chunk.",
+              reasoning_content: " Second thought.",
+            },
+          },
+        ],
+      },
+      makeTextChunk(" Final answer."),
+      makeFinishChunk("stop"),
+    ];
+
+    const stream = streamOpenAICompletions(reasoningModel, context, {
+      apiKey: "sk-test",
+      reasoningEffort: "medium",
+    });
+    const eventTypes: string[] = []; // event types only, in emission order
+    for await (const event of stream as AsyncIterable<{ type: string }>) {
+      eventTypes.push(event.type);
+    }
+    const result = await stream.result();
+
+    expect(eventTypes.filter((type) => type === "thinking_start")).toHaveLength(1); // reasoning is not split into a second block
+    expect(eventTypes.filter((type) => type === "thinking_end")).toHaveLength(1);
+    expect(eventTypes.indexOf("thinking_end")).toBeLessThan(eventTypes.indexOf("text_start")); // also fails if text_start is missing (-1)
+    expect(result.content).toContainEqual({
+      type: "thinking",
+      thinking: "First thought. Second thought.", // the co-streamed reasoning delta is kept in the same block
+      thinkingSignature: "reasoning_content",
+    });
+  });
+
  it("promotes silent tool_calls with finish_reason stop to toolUse", async () => {
    mockChunksRef.chunks = [
      makeToolCallChunk("call_1", "bash", '{"cmd":"ls"}'),
--- a/src/llm/providers/openai-completions.ts
+++ b/src/llm/providers/openai-completions.ts
@@ -187,12 +187,17 @@ export const streamOpenAICompletions: StreamFunction<
      const toolCallBlocksByIndex = new Map<number, StreamingToolCallBlock>();
      const toolCallBlocksById = new Map<string, StreamingToolCallBlock>();
      const blocks = output.content as StreamingBlock[];
+      // A block can be finished mid-stream (native reasoning sealed at the
+      // text-lane transition) and again by the end-of-stream loop; guard so its
+      // *_end event is emitted exactly once.
+      const finishedBlocks = new Set<StreamingBlock>();
      const getContentIndex = (block: StreamingBlock) => blocks.indexOf(block);
      const finishBlock = (block: StreamingBlock) => {
        const contentIndex = getContentIndex(block);
-        if (contentIndex === -1) {
+        if (contentIndex === -1 || finishedBlocks.has(block)) {
          return;
        }
+        finishedBlocks.add(block);
        if (block.type === "text") {
          stream.push({
            type: "text_end",
@@ -249,7 +254,19 @@ export const streamOpenAICompletions: StreamFunction<
        }
        return thinkingBlock;
      };
+      // Native-thinking providers (e.g. deepseek `reasoning_content`) stream the
+      // reasoning lane, then switch to the answer via `content` with no boundary
+      // event. Finish the open thinking block when visible text (or a tool call)
+      // begins so `thinking_end` precedes it; tag-based <think> reasoning has no
+      // native thinking block (the partitioner closes it), so this is a no-op there.
+      const sealNativeReasoningBeforeText = () => {
+        if (thinkingBlock && !reasoningTagTextPartitioner.isInsideReasoning()) { // skip while the partitioner reports it is inside reasoning
+          finishBlock(thinkingBlock);
+          thinkingBlock = null; // drop the reference so this helper is a no-op until a new block is tracked
+        }
+      };
      const appendTextDelta = (delta: string) => {
+        sealNativeReasoningBeforeText();
        const block = ensureTextBlock();
        block.text += delta;
        stream.push({
@@ -382,14 +399,6 @@ export const streamOpenAICompletions: StreamFunction<
          if (foundReasoningField) {
            reasoningTagTextPartitioner.markStrict();
          }
-          if (
-            choice.delta.content !== null &&
-            choice.delta.content !== undefined &&
-            choice.delta.content.length > 0
-          ) {
-            appendPartitionedContent(choice.delta.content, Boolean(foundReasoningField));
-          }
-
          if (shouldEmitReasoning && foundReasoningField) {
            const delta = deltaFields[foundReasoningField];
            if (typeof delta === "string" && delta.length > 0) {
@@ -400,9 +409,19 @@ export const streamOpenAICompletions: StreamFunction<
              appendThinkingDelta(thinkingSignature, delta);
            }
          }
+          if (
+            choice.delta.content !== null &&
+            choice.delta.content !== undefined &&
+            choice.delta.content.length > 0
+          ) {
+            appendPartitionedContent(choice.delta.content, Boolean(foundReasoningField));
+          }

          if (choice?.delta?.tool_calls) {
            flushPartitionedContent();
+            // The tool-call lane is also a reasoning boundary; seal the thought
+            // before toolcall_start so thinking_end never trails the action.
+            sealNativeReasoningBeforeText();
            for (const toolCall of choice.delta.tool_calls) {
              const block = ensureToolCallBlock(toolCall);
              if (!block.id && toolCall.id) {