mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 16:50:43 +00:00
refactor(vllm): own qwen thinking payloads
This commit is contained in:
@@ -5,3 +5,4 @@ export {
|
||||
VLLM_PROVIDER_LABEL,
|
||||
} from "./defaults.js";
|
||||
export { buildVllmProvider } from "./models.js";
|
||||
export { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
@@ -10,6 +10,7 @@ import {
|
||||
VLLM_MODEL_PLACEHOLDER,
|
||||
VLLM_PROVIDER_LABEL,
|
||||
} from "./api.js";
|
||||
import { wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
const PROVIDER_ID = "vllm";
|
||||
|
||||
@@ -89,6 +90,7 @@ export default definePluginEntry({
|
||||
"vLLM requires authentication to be registered as a provider. " +
|
||||
'Set VLLM_API_KEY (any value works) or run "openclaw configure". ' +
|
||||
"See: https://docs.openclaw.ai/providers/vllm",
|
||||
wrapStreamFn: wrapVllmProviderStream,
|
||||
});
|
||||
},
|
||||
});
|
||||
|
||||
170
extensions/vllm/stream.test.ts
Normal file
170
extensions/vllm/stream.test.ts
Normal file
@@ -0,0 +1,170 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import type { Context, Model } from "@mariozechner/pi-ai";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { createVllmQwenThinkingWrapper, wrapVllmProviderStream } from "./stream.js";
|
||||
|
||||
function capturePayload(params: {
|
||||
format: "chat-template" | "top-level";
|
||||
thinkingLevel?: "off" | "low" | "medium" | "high" | "xhigh" | "max";
|
||||
reasoning?: unknown;
|
||||
initialPayload?: Record<string, unknown>;
|
||||
model?: Partial<Model<"openai-completions">>;
|
||||
}): Record<string, unknown> {
|
||||
let captured: Record<string, unknown> = {};
|
||||
const baseStreamFn: StreamFn = (_model, _context, options) => {
|
||||
const payload = { ...params.initialPayload };
|
||||
options?.onPayload?.(payload, _model);
|
||||
captured = payload;
|
||||
return {} as ReturnType<StreamFn>;
|
||||
};
|
||||
|
||||
const wrapped = createVllmQwenThinkingWrapper({
|
||||
baseStreamFn,
|
||||
format: params.format,
|
||||
thinkingLevel: params.thinkingLevel ?? "high",
|
||||
});
|
||||
void wrapped(
|
||||
{
|
||||
api: "openai-completions",
|
||||
provider: "vllm",
|
||||
id: "Qwen/Qwen3-8B",
|
||||
reasoning: true,
|
||||
...params.model,
|
||||
} as Model<"openai-completions">,
|
||||
{ messages: [] } as Context,
|
||||
params.reasoning === undefined ? {} : ({ reasoning: params.reasoning } as never),
|
||||
);
|
||||
|
||||
return captured;
|
||||
}
|
||||
|
||||
describe("createVllmQwenThinkingWrapper", () => {
|
||||
it("maps Qwen chat-template thinking off to chat_template_kwargs", () => {
|
||||
const payload = capturePayload({
|
||||
format: "chat-template",
|
||||
reasoning: "none",
|
||||
initialPayload: {
|
||||
reasoning_effort: "high",
|
||||
reasoning: { effort: "high" },
|
||||
reasoningEffort: "high",
|
||||
},
|
||||
});
|
||||
|
||||
expect(payload).toEqual({
|
||||
chat_template_kwargs: {
|
||||
enable_thinking: false,
|
||||
preserve_thinking: true,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("maps Qwen chat-template thinking on to chat_template_kwargs", () => {
|
||||
expect(capturePayload({ format: "chat-template", reasoning: "medium" })).toEqual({
|
||||
chat_template_kwargs: {
|
||||
enable_thinking: true,
|
||||
preserve_thinking: true,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("preserves explicit chat-template kwargs while setting enable_thinking", () => {
|
||||
expect(
|
||||
capturePayload({
|
||||
format: "chat-template",
|
||||
thinkingLevel: "off",
|
||||
initialPayload: {
|
||||
chat_template_kwargs: {
|
||||
preserve_thinking: false,
|
||||
force_nonempty_content: true,
|
||||
},
|
||||
},
|
||||
}),
|
||||
).toEqual({
|
||||
chat_template_kwargs: {
|
||||
enable_thinking: false,
|
||||
preserve_thinking: false,
|
||||
force_nonempty_content: true,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("maps Qwen top-level thinking format to enable_thinking", () => {
|
||||
expect(capturePayload({ format: "top-level", thinkingLevel: "off" })).toEqual({
|
||||
enable_thinking: false,
|
||||
});
|
||||
expect(capturePayload({ format: "top-level", thinkingLevel: "high" })).toEqual({
|
||||
enable_thinking: true,
|
||||
});
|
||||
});
|
||||
|
||||
it("skips non-reasoning and non-completions models", () => {
|
||||
expect(capturePayload({ format: "chat-template", model: { reasoning: false } })).toEqual({});
|
||||
expect(
|
||||
capturePayload({ format: "chat-template", model: { api: "openai-responses" as never } }),
|
||||
).toEqual({});
|
||||
});
|
||||
});
|
||||
|
||||
describe("wrapVllmProviderStream", () => {
|
||||
it("registers when vLLM Qwen thinking format params are configured", () => {
|
||||
expect(
|
||||
wrapVllmProviderStream({
|
||||
provider: "vllm",
|
||||
modelId: "Qwen/Qwen3-8B",
|
||||
extraParams: { qwenThinkingFormat: "chat-template" },
|
||||
model: {
|
||||
api: "openai-completions",
|
||||
provider: "vllm",
|
||||
id: "Qwen/Qwen3-8B",
|
||||
reasoning: true,
|
||||
} as Model<"openai-completions">,
|
||||
streamFn: undefined,
|
||||
} as never),
|
||||
).toBeTypeOf("function");
|
||||
|
||||
expect(
|
||||
wrapVllmProviderStream({
|
||||
provider: "vllm",
|
||||
modelId: "Qwen/Qwen3-8B",
|
||||
extraParams: { qwen_thinking_format: "enable_thinking" },
|
||||
model: {
|
||||
api: "openai-completions",
|
||||
provider: "vllm",
|
||||
id: "Qwen/Qwen3-8B",
|
||||
reasoning: true,
|
||||
} as Model<"openai-completions">,
|
||||
streamFn: undefined,
|
||||
} as never),
|
||||
).toBeTypeOf("function");
|
||||
});
|
||||
|
||||
it("skips unconfigured vLLM and non-vLLM providers", () => {
|
||||
expect(
|
||||
wrapVllmProviderStream({
|
||||
provider: "vllm",
|
||||
modelId: "Qwen/Qwen3-8B",
|
||||
extraParams: {},
|
||||
model: {
|
||||
api: "openai-completions",
|
||||
provider: "vllm",
|
||||
id: "Qwen/Qwen3-8B",
|
||||
} as Model<"openai-completions">,
|
||||
streamFn: undefined,
|
||||
} as never),
|
||||
).toBeUndefined();
|
||||
|
||||
expect(
|
||||
wrapVllmProviderStream({
|
||||
provider: "openai",
|
||||
modelId: "gpt-5.4",
|
||||
extraParams: { qwenThinkingFormat: "chat-template" },
|
||||
model: {
|
||||
api: "openai-completions",
|
||||
provider: "openai",
|
||||
id: "gpt-5.4",
|
||||
} as Model<"openai-completions">,
|
||||
streamFn: undefined,
|
||||
} as never),
|
||||
).toBeUndefined();
|
||||
});
|
||||
});
|
||||
117
extensions/vllm/stream.ts
Normal file
117
extensions/vllm/stream.ts
Normal file
@@ -0,0 +1,117 @@
|
||||
import type { StreamFn } from "@mariozechner/pi-agent-core";
|
||||
import { streamSimple } from "@mariozechner/pi-ai";
|
||||
import type { ProviderWrapStreamFnContext } from "openclaw/plugin-sdk/plugin-entry";
|
||||
import { normalizeProviderId } from "openclaw/plugin-sdk/provider-model-shared";
|
||||
import { streamWithPayloadPatch } from "openclaw/plugin-sdk/provider-stream-shared";
|
||||
|
||||
// Thinking-level setting as forwarded from the provider wrap-stream context.
type VllmThinkingLevel = ProviderWrapStreamFnContext["thinkingLevel"];
// How the Qwen thinking toggle is transported: inside chat_template_kwargs
// ("chat-template") or as a top-level enable_thinking request field ("top-level").
type VllmQwenThinkingFormat = "chat-template" | "top-level";
|
||||
|
||||
function isVllmProviderId(providerId: string): boolean {
|
||||
return normalizeProviderId(providerId) === "vllm";
|
||||
}
|
||||
|
||||
function normalizeQwenThinkingFormat(value: unknown): VllmQwenThinkingFormat | undefined {
|
||||
if (typeof value !== "string") {
|
||||
return undefined;
|
||||
}
|
||||
const normalized = value.trim().toLowerCase().replace(/_/g, "-");
|
||||
if (
|
||||
normalized === "chat-template" ||
|
||||
normalized === "chat-template-kwargs" ||
|
||||
normalized === "chat-template-kwarg" ||
|
||||
normalized === "chat-template-arguments"
|
||||
) {
|
||||
return "chat-template";
|
||||
}
|
||||
if (
|
||||
normalized === "top-level" ||
|
||||
normalized === "enable-thinking" ||
|
||||
normalized === "request-body"
|
||||
) {
|
||||
return "top-level";
|
||||
}
|
||||
return undefined;
|
||||
}
|
||||
|
||||
function resolveVllmQwenThinkingFormat(
|
||||
extraParams: ProviderWrapStreamFnContext["extraParams"],
|
||||
): VllmQwenThinkingFormat | undefined {
|
||||
return normalizeQwenThinkingFormat(
|
||||
extraParams?.qwenThinkingFormat ?? extraParams?.qwen_thinking_format,
|
||||
);
|
||||
}
|
||||
|
||||
function resolveOpenAICompatibleThinkingEnabled(params: {
|
||||
thinkingLevel: VllmThinkingLevel;
|
||||
options: Parameters<StreamFn>[2];
|
||||
}): boolean {
|
||||
const options = (params.options ?? {}) as { reasoningEffort?: unknown; reasoning?: unknown };
|
||||
const raw = options.reasoningEffort ?? options.reasoning ?? params.thinkingLevel ?? "high";
|
||||
if (typeof raw !== "string") {
|
||||
return true;
|
||||
}
|
||||
const normalized = raw.trim().toLowerCase();
|
||||
return normalized !== "off" && normalized !== "none";
|
||||
}
|
||||
|
||||
function setQwenChatTemplateThinking(payload: Record<string, unknown>, enabled: boolean): void {
|
||||
const existing = payload.chat_template_kwargs;
|
||||
if (existing && typeof existing === "object" && !Array.isArray(existing)) {
|
||||
const next: Record<string, unknown> = {
|
||||
...(existing as Record<string, unknown>),
|
||||
enable_thinking: enabled,
|
||||
};
|
||||
if (!Object.hasOwn(next, "preserve_thinking")) {
|
||||
next.preserve_thinking = true;
|
||||
}
|
||||
payload.chat_template_kwargs = next;
|
||||
return;
|
||||
}
|
||||
payload.chat_template_kwargs = {
|
||||
enable_thinking: enabled,
|
||||
preserve_thinking: true,
|
||||
};
|
||||
}
|
||||
|
||||
export function createVllmQwenThinkingWrapper(params: {
|
||||
baseStreamFn: StreamFn | undefined;
|
||||
format: VllmQwenThinkingFormat;
|
||||
thinkingLevel: VllmThinkingLevel;
|
||||
}): StreamFn {
|
||||
const underlying = params.baseStreamFn ?? streamSimple;
|
||||
return (model, context, options) => {
|
||||
if (model.api !== "openai-completions" || !model.reasoning) {
|
||||
return underlying(model, context, options);
|
||||
}
|
||||
const enableThinking = resolveOpenAICompatibleThinkingEnabled({
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
options,
|
||||
});
|
||||
return streamWithPayloadPatch(underlying, model, context, options, (payloadObj) => {
|
||||
if (params.format === "chat-template") {
|
||||
setQwenChatTemplateThinking(payloadObj, enableThinking);
|
||||
} else {
|
||||
payloadObj.enable_thinking = enableThinking;
|
||||
}
|
||||
delete payloadObj.reasoning_effort;
|
||||
delete payloadObj.reasoningEffort;
|
||||
delete payloadObj.reasoning;
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
export function wrapVllmProviderStream(ctx: ProviderWrapStreamFnContext): StreamFn | undefined {
|
||||
if (!isVllmProviderId(ctx.provider) || (ctx.model && ctx.model.api !== "openai-completions")) {
|
||||
return undefined;
|
||||
}
|
||||
const format = resolveVllmQwenThinkingFormat(ctx.extraParams);
|
||||
if (!format) {
|
||||
return undefined;
|
||||
}
|
||||
return createVllmQwenThinkingWrapper({
|
||||
baseStreamFn: ctx.streamFn,
|
||||
format,
|
||||
thinkingLevel: ctx.thinkingLevel,
|
||||
});
|
||||
}
|
||||
Reference in New Issue
Block a user