refactor(vllm): own qwen thinking payloads
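
Qwen thinking payloads are now owned by the provider plugins themselves: the
qwen extension registers a wrapStreamFn that maps reasoning levels onto Qwen's
top-level enable_thinking flag, and the vllm extension adds an equivalent
wrapper behind an opt-in qwenThinkingFormat / qwen_thinking_format extra param
that targets either chat_template_kwargs or the top-level request body.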

Author: Peter Steinberger
Date: 2026-04-27 11:47:54 +01:00
parent 4f7038ae33
commit 836d4b4105
20 changed files with 467 additions and 129 deletions


@@ -31,3 +31,4 @@ export {
MODELSTUDIO_MODEL_CATALOG,
} from "./models.js";
export { buildModelStudioProvider, buildQwenProvider } from "./provider-catalog.js";
export { createQwenThinkingWrapper, wrapQwenProviderStream } from "./stream.js";


@@ -10,6 +10,7 @@ import {
QWEN_DEFAULT_MODEL_REF,
} from "./onboard.js";
import { buildQwenProvider } from "./provider-catalog.js";
import { wrapQwenProviderStream } from "./stream.js";
import { buildQwenVideoGenerationProvider } from "./video-generation-provider.js";
const PROVIDER_ID = "qwen";
@@ -165,6 +166,7 @@ export default defineSingleProviderPluginEntry({
},
applyNativeStreamingUsageCompat: ({ providerConfig }) =>
applyQwenNativeStreamingUsageCompat(providerConfig),
wrapStreamFn: wrapQwenProviderStream,
normalizeConfig: ({ providerConfig }) => {
if (!isQwenCodingPlanBaseUrl(providerConfig.baseUrl)) {
return undefined;


@@ -0,0 +1,94 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import type { Context, Model } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import { createQwenThinkingWrapper, wrapQwenProviderStream } from "./stream.js";
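
// Test helper: runs the wrapper against a stub StreamFn and returns the
// request payload after the wrapper's onPayload patch has been applied.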
function capturePayload(params: {
thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
reasoning?: unknown;
initialPayload?: Record<string, unknown>;
model?: Partial<Model<"openai-completions">>;
}): Record<string, unknown> {
let captured: Record<string, unknown> = {};
const baseStreamFn: StreamFn = (_model, _context, options) => {
const payload = { ...params.initialPayload };
options?.onPayload?.(payload, _model);
captured = payload;
return {} as ReturnType<StreamFn>;
};
const wrapped = createQwenThinkingWrapper(baseStreamFn, params.thinkingLevel ?? "high");
void wrapped(
{
api: "openai-completions",
provider: "qwen",
id: "qwen3.6-plus",
reasoning: true,
...params.model,
} as Model<"openai-completions">,
{ messages: [] } as Context,
params.reasoning === undefined ? {} : ({ reasoning: params.reasoning } as never),
);
return captured;
}

describe("createQwenThinkingWrapper", () => {
it("maps disabled thinking to Qwen top-level enable_thinking", () => {
const payload = capturePayload({
reasoning: "none",
initialPayload: {
reasoning_effort: "high",
reasoning: { effort: "high" },
reasoningEffort: "high",
},
});
expect(payload).toEqual({ enable_thinking: false });
});
it("maps enabled thinking to Qwen top-level enable_thinking", () => {
expect(capturePayload({ reasoning: "medium" })).toEqual({ enable_thinking: true });
});
it("falls back to the session thinking level", () => {
expect(capturePayload({ thinkingLevel: "off" })).toEqual({ enable_thinking: false });
expect(capturePayload({ thinkingLevel: "high" })).toEqual({ enable_thinking: true });
});
it("skips non-reasoning and non-completions models", () => {
expect(capturePayload({ model: { reasoning: false } })).toEqual({});
expect(capturePayload({ model: { api: "openai-responses" as never } })).toEqual({});
});
});
describe("wrapQwenProviderStream", () => {
it("only registers for Qwen-family OpenAI-compatible providers", () => {
expect(
wrapQwenProviderStream({
provider: "qwencloud",
modelId: "qwen3.6-plus",
model: {
api: "openai-completions",
provider: "qwen",
id: "qwen3.6-plus",
reasoning: true,
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeTypeOf("function");
expect(
wrapQwenProviderStream({
provider: "openai",
modelId: "gpt-5.4",
model: {
api: "openai-completions",
provider: "openai",
id: "gpt-5.4",
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeUndefined();
});
});

extensions/qwen/stream.ts (new file, 56 lines)

@@ -0,0 +1,56 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { streamSimple } from "@mariozechner/pi-ai";
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
import { streamWithPayloadPatch } from "openclaw/plugin-sdk/provider-stream-shared";
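
// Qwen's OpenAI-compatible endpoint toggles reasoning with a top-level
// `enable_thinking` boolean rather than OpenAI's `reasoning_effort`, so this
// wrapper rewrites the request payload before it is sent.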
type QwenThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
function isQwenProviderId(providerId: string): boolean {
const normalized = normalizeProviderId(providerId);
return (
normalized === "qwen" ||
normalized === "modelstudio" ||
normalized === "qwencloud" ||
normalized === "dashscope"
);
}
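
// "off"/"none" disable thinking; any other string level, or a non-string
// reasoning value, leaves it enabled. Per-request reasoning options take
// precedence over the session-level thinking setting.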
function resolveOpenAICompatibleThinkingEnabled(params: {
thinkingLevel: QwenThinkingLevel;
options: Parameters<StreamFn>[2];
}): boolean {
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
if (typeof raw !== "string") {
return true;
}
const normalized = raw.trim().toLowerCase();
return normalized !== "off" && normalized !== "none";
}
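
// Wraps a StreamFn so that reasoning-capable openai-completions requests carry
// Qwen's `enable_thinking` flag and drop the OpenAI-style reasoning fields the
// endpoint does not understand.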
export function createQwenThinkingWrapper(
baseStreamFn: StreamFn | undefined,
thinkingLevel: QwenThinkingLevel,
): StreamFn {
const underlying = baseStreamFn ?? streamSimple;
return (model, context, options) => {
if (model.api !== "openai-completions" || !model.reasoning) {
return underlying(model, context, options);
}
const enableThinking = resolveOpenAICompatibleThinkingEnabled({ thinkingLevel, options });
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
payloadObj.enable_thinking = enableThinking;
delete payloadObj.reasoning_effort;
delete payloadObj.reasoningEffort;
delete payloadObj.reasoning;
});
};
}
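
// Plugin hook: only Qwen-family OpenAI-compatible providers get the wrapper.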
export function wrapQwenProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
if (!isQwenProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
return undefined;
}
return createQwenThinkingWrapper(ctx.streamFn, ctx.thinkingLevel);
}


@@ -5,3 +5,4 @@ export {
VLLM_PROVIDER_LABEL,
} from "./defaults.js";
export { buildVllmProvider } from "./models.js";
export { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";


@@ -10,6 +10,7 @@ import {
VLLM_MODEL_PLACEHOLDER,
VLLM_PROVIDER_LABEL,
} from "./api.js";
import { wrapVllmProviderStream } from "./stream.js";
const PROVIDER_ID = "vllm";
@@ -89,6 +90,7 @@ export default definePluginEntry({
"vLLM requires authentication to be registered as a provider. " +
'Set VLLM_API_KEY (any value works) or run "openclaw configure". ' +
"See: https://docs.openclaw.ai/providers/vllm",
wrapStreamFn: wrapVllmProviderStream,
});
},
});


@@ -0,0 +1,170 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import type { Context, Model } from "@mariozechner/pi-ai";
import { describe, expect, it } from "vitest";
import { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
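
// Test helper: runs the wrapper against a stub StreamFn and returns the
// request payload after the wrapper's onPayload patch has been applied.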
function capturePayload(params: {
format: "chat-template" | "top-level";
thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
reasoning?: unknown;
initialPayload?: Record<string, unknown>;
model?: Partial<Model<"openai-completions">>;
}): Record<string, unknown> {
let captured: Record<string, unknown> = {};
const baseStreamFn: StreamFn = (_model, _context, options) => {
const payload = { ...params.initialPayload };
options?.onPayload?.(payload, _model);
captured = payload;
return {} as ReturnType<StreamFn>;
};
const wrapped = createVllmQwenThinkingWrapper({
baseStreamFn,
format: params.format,
thinkingLevel: params.thinkingLevel ?? "high",
});
void wrapped(
{
api: "openai-completions",
provider: "vllm",
id: "Qwen/Qwen3-8B",
reasoning: true,
...params.model,
} as Model<"openai-completions">,
{ messages: [] } as Context,
params.reasoning === undefined ? {} : ({ reasoning: params.reasoning } as never),
);
return captured;
}

describe("createVllmQwenThinkingWrapper", () => {
it("maps Qwen chat-template thinking off to chat_template_kwargs", () => {
const payload = capturePayload({
format: "chat-template",
reasoning: "none",
initialPayload: {
reasoning_effort: "high",
reasoning: { effort: "high" },
reasoningEffort: "high",
},
});
expect(payload).toEqual({
chat_template_kwargs: {
enable_thinking: false,
preserve_thinking: true,
},
});
});
it("maps Qwen chat-template thinking on to chat_template_kwargs", () => {
expect(capturePayload({ format: "chat-template", reasoning: "medium" })).toEqual({
chat_template_kwargs: {
enable_thinking: true,
preserve_thinking: true,
},
});
});
it("preserves explicit chat-template kwargs while setting enable_thinking", () => {
expect(
capturePayload({
format: "chat-template",
thinkingLevel: "off",
initialPayload: {
chat_template_kwargs: {
preserve_thinking: false,
force_nonempty_content: true,
},
},
}),
).toEqual({
chat_template_kwargs: {
enable_thinking: false,
preserve_thinking: false,
force_nonempty_content: true,
},
});
});
it("maps Qwen top-level thinking format to enable_thinking", () => {
expect(capturePayload({ format: "top-level", thinkingLevel: "off" })).toEqual({
enable_thinking: false,
});
expect(capturePayload({ format: "top-level", thinkingLevel: "high" })).toEqual({
enable_thinking: true,
});
});
it("skips non-reasoning and non-completions models", () => {
expect(capturePayload({ format: "chat-template", model: { reasoning: false } })).toEqual({});
expect(
capturePayload({ format: "chat-template", model: { api: "openai-responses" as never } }),
).toEqual({});
});
});
describe("wrapVllmProviderStream", () => {
it("registers when vLLM Qwen thinking format params are configured", () => {
expect(
wrapVllmProviderStream({
provider: "vllm",
modelId: "Qwen/Qwen3-8B",
extraParams: { qwenThinkingFormat: "chat-template" },
model: {
api: "openai-completions",
provider: "vllm",
id: "Qwen/Qwen3-8B",
reasoning: true,
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeTypeOf("function");
expect(
wrapVllmProviderStream({
provider: "vllm",
modelId: "Qwen/Qwen3-8B",
extraParams: { qwen_thinking_format: "enable_thinking" },
model: {
api: "openai-completions",
provider: "vllm",
id: "Qwen/Qwen3-8B",
reasoning: true,
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeTypeOf("function");
});
it("skips unconfigured vLLM and non-vLLM providers", () => {
expect(
wrapVllmProviderStream({
provider: "vllm",
modelId: "Qwen/Qwen3-8B",
extraParams: {},
model: {
api: "openai-completions",
provider: "vllm",
id: "Qwen/Qwen3-8B",
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeUndefined();
expect(
wrapVllmProviderStream({
provider: "openai",
modelId: "gpt-5.4",
extraParams: { qwenThinkingFormat: "chat-template" },
model: {
api: "openai-completions",
provider: "openai",
id: "gpt-5.4",
} as Model<"openai-completions">,
streamFn: undefined,
} as never),
).toBeUndefined();
});
});

extensions/vllm/stream.ts (new file, 117 lines)

@@ -0,0 +1,117 @@
import type { StreamFn } from "@mariozechner/pi-agent-core";
import { streamSimple } from "@mariozechner/pi-ai";
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
import { streamWithPayloadPatch } from "openclaw/plugin-sdk/provider-stream-shared";
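
// Qwen models served through vLLM take their thinking toggle either through
// `chat_template_kwargs` (which vLLM forwards to the model's chat template) or
// as a top-level `enable_thinking` request field, depending on the deployment.
// The format is opt-in via provider extra params, since vLLM also serves
// non-Qwen models.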
type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
type VllmQwenThinkingFormat = "chat-template" | "top-level";
function isVllmProviderId(providerId: string): boolean {
return normalizeProviderId(providerId) === "vllm";
}
function normalizeQwenThinkingFormat(value: unknown): VllmQwenThinkingFormat | undefined {
if (typeof value !== "string") {
return undefined;
}
const normalized = value.trim().toLowerCase().replace(/_/g, "-");
if (
normalized === "chat-template" ||
normalized === "chat-template-kwargs" ||
normalized === "chat-template-kwarg" ||
normalized === "chat-template-arguments"
) {
return "chat-template";
}
if (
normalized === "top-level" ||
normalized === "enable-thinking" ||
normalized === "request-body"
) {
return "top-level";
}
return undefined;
}
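
// Accepts both the camelCase and snake_case spellings of the extra param.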
function resolveVllmQwenThinkingFormat(
extraParams: ProviderWrapStreamFnContext["extraParams"],
): VllmQwenThinkingFormat | undefined {
return normalizeQwenThinkingFormat(
extraParams?.qwenThinkingFormat ?? extraParams?.qwen_thinking_format,
);
}
function resolveOpenAICompatibleThinkingEnabled(params: {
thinkingLevel: VllmThinkingLevel;
options: Parameters<StreamFn>[2];
}): boolean {
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
if (typeof raw !== "string") {
return true;
}
const normalized = raw.trim().toLowerCase();
return normalized !== "off" && normalized !== "none";
}
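
// Merges `enable_thinking` into any caller-supplied chat_template_kwargs and
// defaults `preserve_thinking` to true unless the caller set it explicitly.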
function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
const existing = payload.chat_template_kwargs;
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
const next: Record<string, unknown> = {
...(existing as Record<string, unknown>),
enable_thinking: enabled,
};
if (!Object.hasOwn(next, "preserve_thinking")) {
next.preserve_thinking = true;
}
payload.chat_template_kwargs = next;
return;
}
payload.chat_template_kwargs = {
enable_thinking: enabled,
preserve_thinking: true,
};
}
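
// Wraps a StreamFn so that reasoning-capable openai-completions requests carry
// the configured Qwen thinking toggle and drop the OpenAI-style reasoning
// fields the server would otherwise receive.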
export function createVllmQwenThinkingWrapper(params: {
baseStreamFn: StreamFn | undefined;
format: VllmQwenThinkingFormat;
thinkingLevel: VllmThinkingLevel;
}): StreamFn {
const underlying = params.baseStreamFn ?? streamSimple;
return (model, context, options) => {
if (model.api !== "openai-completions" || !model.reasoning) {
return underlying(model, context, options);
}
const enableThinking = resolveOpenAICompatibleThinkingEnabled({
thinkingLevel: params.thinkingLevel,
options,
});
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
if (params.format === "chat-template") {
setQwenChatTemplateThinking(payloadObj, enableThinking);
} else {
payloadObj.enable_thinking = enableThinking;
}
delete payloadObj.reasoning_effort;
delete payloadObj.reasoningEffort;
delete payloadObj.reasoning;
});
};
}
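
// Plugin hook: registers only for vLLM providers that opted into a Qwen
// thinking format via extra params.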
export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
return undefined;
}
const format = resolveVllmQwenThinkingFormat(ctx.extraParams);
if (!format) {
return undefined;
}
return createVllmQwenThinkingWrapper({
baseStreamFn: ctx.streamFn,
format,
thinkingLevel: ctx.thinkingLevel,
});
}