fix(openai-completions): seal native reasoning before the answer under /reasoning on (#95283)

* fix(openai-completions): seal native reasoning before the answer

DeepSeek-style providers stream reasoning via `reasoning_content` deltas,
then switch to the answer via `content` deltas with no boundary event.
`thinking_end` was only emitted by the end-of-stream finishBlock loop, so
it landed after the answer's `text_delta`, and channels merged the answer
into the reasoning block.

Seal the open native thinking block when visible text (or a tool call)
begins so thinking_end precedes the answer; tag-based <think> reasoning
is unaffected (closed by the partitioner). finishBlock is now idempotent
so the end-of-stream loop never re-emits thinking_end.

* fix(openai-completions): preserve co-streamed reasoning

* fix(openai-completions): order co-streamed reasoning

* fix(openai-completions): seal co-streamed reasoning

---------

Co-authored-by: Vincent Koc <vincentkoc@ieee.org>
This commit is contained in:
Wynne668
2026-06-22 14:33:44 +08:00
committed by GitHub
parent 984efdb0b6
commit 1bd85e3cc3
2 changed files with 145 additions and 9 deletions

View File

@@ -799,6 +799,123 @@ describe("openai-completions stop-reason tool-call guard", () => {
expect(result.content.some((block) => block.type === "thinking")).toBe(false);
});
it("seals the native reasoning block before the answer text begins", async () => {
  // DeepSeek-style streams emit reasoning_content deltas and then move
  // straight to content deltas with nothing marking the boundary. The
  // thinking block has to be closed ahead of the answer, otherwise channels
  // fold the answer into the reasoning block.
  const reasoningChunk = (reasoning_content: string) => ({
    id: "chatcmpl-test",
    choices: [{ index: 0, delta: { reasoning_content } }],
  });
  mockChunksRef.chunks = [
    reasoningChunk("Let me think."),
    reasoningChunk(" Still thinking."),
    makeTextChunk("The answer"),
    makeTextChunk(" is 42."),
    makeFinishChunk("stop"),
  ];

  const stream = streamOpenAICompletions(reasoningModel, context, {
    apiKey: "sk-test",
    reasoningEffort: "medium",
  });

  // Record only the order of event types; payloads are checked via result().
  const seen: string[] = [];
  for await (const { type } of stream as AsyncIterable<{ type: string }>) {
    seen.push(type);
  }
  const result = await stream.result();

  const positionOf = (type: string) => seen.indexOf(type);
  const sealedAt = positionOf("thinking_end");
  const textStartedAt = positionOf("text_start");
  expect(sealedAt).toBeGreaterThanOrEqual(0);
  expect(textStartedAt).toBeGreaterThanOrEqual(0);
  expect(sealedAt).toBeLessThan(textStartedAt);
  expect(sealedAt).toBeLessThan(positionOf("text_delta"));

  // The end-of-stream finish loop visits the same block again; it must not
  // produce a second thinking_end.
  const thinkingEnds = seen.filter((type) => type === "thinking_end");
  expect(thinkingEnds).toHaveLength(1);

  expect(result.content).toContainEqual({
    type: "thinking",
    thinking: "Let me think. Still thinking.",
    thinkingSignature: "reasoning_content",
  });
  expect(result.content).toContainEqual({ type: "text", text: "The answer is 42." });
});
it("seals the native reasoning block before a following tool call", async () => {
  // A tool call arriving right after native reasoning is a boundary as well:
  // thinking_end has to come before toolcall_start, and only once.
  mockChunksRef.chunks = [
    {
      id: "chatcmpl-test",
      choices: [{ index: 0, delta: { reasoning_content: "I should call a tool." } }],
    },
    makeToolCallChunk("call_1", "bash", '{"cmd":"ls"}'),
    makeFinishChunk("tool_calls"),
  ];

  const stream = streamOpenAICompletions(reasoningModel, context, {
    apiKey: "sk-test",
    reasoningEffort: "medium",
  });

  const seen: string[] = [];
  for await (const { type } of stream as AsyncIterable<{ type: string }>) {
    seen.push(type);
  }
  await stream.result();

  const sealedAt = seen.indexOf("thinking_end");
  const toolCallAt = seen.indexOf("toolcall_start");
  expect(sealedAt).toBeGreaterThanOrEqual(0);
  expect(toolCallAt).toBeGreaterThanOrEqual(0);
  expect(sealedAt).toBeLessThan(toolCallAt);

  const thinkingEnds = seen.filter((type) => type === "thinking_end");
  expect(thinkingEnds).toHaveLength(1);
});
it("keeps one native reasoning block when content and reasoning co-occur", async () => {
  // The second chunk carries both visible content and more reasoning. The
  // reasoning must still end up in a single thinking block that is closed
  // before the text block opens.
  mockChunksRef.chunks = [
    {
      id: "chatcmpl-test",
      choices: [{ index: 0, delta: { reasoning_content: "First thought." } }],
    },
    {
      id: "chatcmpl-test",
      choices: [
        {
          index: 0,
          delta: {
            content: "Visible text that shares the reasoning chunk.",
            reasoning_content: " Second thought.",
          },
        },
      ],
    },
    makeTextChunk(" Final answer."),
    makeFinishChunk("stop"),
  ];

  const stream = streamOpenAICompletions(reasoningModel, context, {
    apiKey: "sk-test",
    reasoningEffort: "medium",
  });

  const seen: string[] = [];
  for await (const { type } of stream as AsyncIterable<{ type: string }>) {
    seen.push(type);
  }
  const result = await stream.result();

  const occurrencesOf = (wanted: string) => seen.filter((type) => type === wanted);
  expect(occurrencesOf("thinking_start")).toHaveLength(1);
  expect(occurrencesOf("thinking_end")).toHaveLength(1);

  const sealedAt = seen.indexOf("thinking_end");
  const textStartedAt = seen.indexOf("text_start");
  expect(sealedAt).toBeLessThan(textStartedAt);

  expect(result.content).toContainEqual({
    type: "thinking",
    thinking: "First thought. Second thought.",
    thinkingSignature: "reasoning_content",
  });
});
it("promotes silent tool_calls with finish_reason stop to toolUse", async () => {
mockChunksRef.chunks = [
makeToolCallChunk("call_1", "bash", '{"cmd":"ls"}'),

View File

@@ -187,12 +187,17 @@ export const streamOpenAICompletions: StreamFunction<
const toolCallBlocksByIndex = new Map<number, StreamingToolCallBlock>();
const toolCallBlocksById = new Map<string, StreamingToolCallBlock>();
const blocks = output.content as StreamingBlock[];
// A block can be finished mid-stream (native reasoning sealed at the
// text-lane transition) and again by the end-of-stream loop; guard so its
// *_end event is emitted exactly once.
const finishedBlocks = new Set<StreamingBlock>();
const getContentIndex = (block: StreamingBlock) => blocks.indexOf(block);
const finishBlock = (block: StreamingBlock) => {
const contentIndex = getContentIndex(block);
if (contentIndex === -1) {
if (contentIndex === -1 || finishedBlocks.has(block)) {
return;
}
finishedBlocks.add(block);
if (block.type === "text") {
stream.push({
type: "text_end",
@@ -249,7 +254,19 @@ export const streamOpenAICompletions: StreamFunction<
}
return thinkingBlock;
};
// Closes the currently open native thinking block, if there is one.
//
// Providers with a native reasoning lane (e.g. deepseek `reasoning_content`)
// move from reasoning to the `content` answer without emitting any boundary
// event, so the block is finished here, ahead of the first visible text, to
// make `thinking_end` come before the answer. While the tag partitioner
// reports that it is inside <think> reasoning, nothing is sealed: tag-based
// reasoning is closed by the partitioner instead.
const sealNativeReasoningBeforeText = () => {
  if (!thinkingBlock || reasoningTagTextPartitioner.isInsideReasoning()) {
    return;
  }
  // Finish first, then drop the reference so later reasoning opens a new block.
  finishBlock(thinkingBlock);
  thinkingBlock = null;
};
const appendTextDelta = (delta: string) => {
sealNativeReasoningBeforeText();
const block = ensureTextBlock();
block.text += delta;
stream.push({
@@ -382,14 +399,6 @@ export const streamOpenAICompletions: StreamFunction<
if (foundReasoningField) {
reasoningTagTextPartitioner.markStrict();
}
if (
choice.delta.content !== null &&
choice.delta.content !== undefined &&
choice.delta.content.length > 0
) {
appendPartitionedContent(choice.delta.content, Boolean(foundReasoningField));
}
if (shouldEmitReasoning && foundReasoningField) {
const delta = deltaFields[foundReasoningField];
if (typeof delta === "string" && delta.length > 0) {
@@ -400,9 +409,19 @@ export const streamOpenAICompletions: StreamFunction<
appendThinkingDelta(thinkingSignature, delta);
}
}
if (
choice.delta.content !== null &&
choice.delta.content !== undefined &&
choice.delta.content.length > 0
) {
appendPartitionedContent(choice.delta.content, Boolean(foundReasoningField));
}
if (choice?.delta?.tool_calls) {
flushPartitionedContent();
// The tool-call lane is also a reasoning boundary; seal the thought
// before toolcall_start so thinking_end never trails the action.
sealNativeReasoningBeforeText();
for (const toolCall of choice.delta.tool_calls) {
const block = ensureToolCallBlock(toolCall);
if (!block.id && toolCall.id) {