mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 08:50:43 +00:00
test(qa-lab): cover GPT-style broken turns
This commit is contained in:
@@ -4,6 +4,18 @@ import { resolveProviderVariant, startQaMockOpenAiServer } from "./mock-openai-s
|
||||
/**
 * Teardown callbacks registered by the tests in this file. The `afterEach`
 * hook loops while this list is non-empty.
 */
const cleanups: Array<() => Promise<void>> = [];

/** Base64-encoded PNG image fixture. */
const QA_IMAGE_PNG_BASE64 =
  "iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAT0lEQVR42u3RQQkAMAzAwPg33Wnos+wgBo40dboAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANYADwAAAAAAAAAAAAAAAAAAAAAAAAAAAAC+Azy47PDiI4pA2wAAAABJRU5ErkJggg==";

// --- User prompts that select the scripted "broken turn" scenarios ---------

/** Prompt for the scenario: read, then a reasoning-only turn, then a visible answer on retry. */
const QA_REASONING_ONLY_RECOVERY_PROMPT =
  "Reasoning-only continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly REASONING-RECOVERED-OK.";

/** Prompt for the scenario: write (a side effect), then a reasoning-only turn. */
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT =
  "Reasoning-only after write safety check: write reasoning-only-side-effect.txt, then answer with exactly SIDE-EFFECT-GUARD-OK.";

/** Prompt for the scenario: read, then an empty visible answer, then a real answer on retry. */
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT =
  "Empty response continuation QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-RECOVERED-OK.";

/** Prompt for the scenario: read, then empty visible answers even after the retry instruction. */
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT =
  "Empty response exhaustion QA check: read QA_KICKOFF_TASK.md, then answer with exactly EMPTY-EXHAUSTED-OK.";

// --- Retry instructions the tests inject as an extra user message ----------

/** Retry instruction sent after a turn that produced reasoning but no visible answer. */
const QA_REASONING_ONLY_RETRY_INSTRUCTION =
  "The previous assistant turn recorded reasoning but did not produce a user-visible answer. Continue from that partial turn and produce the visible answer now. Do not restate the reasoning or restart from scratch.";

/** Retry instruction sent after a turn that produced no visible answer at all. */
const QA_EMPTY_RESPONSE_RETRY_INSTRUCTION =
  "The previous attempt did not produce a user-visible answer. Continue from the current state and produce the visible answer now. Do not restart from scratch.";
|
||||
|
||||
afterEach(async () => {
|
||||
while (cleanups.length > 0) {
|
||||
@@ -11,6 +23,46 @@ afterEach(async () => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Starts a QA mock OpenAI server bound to 127.0.0.1 and registers its
 * `stop()` call in `cleanups`.
 *
 * @returns The started server handle as returned by `startQaMockOpenAiServer`.
 */
async function startMockServer() {
  // port 0: presumably lets the OS assign a free port; confirm against
  // startQaMockOpenAiServer.
  const mockServer = await startQaMockOpenAiServer({ host: "127.0.0.1", port: 0 });
  const stopMockServer = async () => {
    await mockServer.stop();
  };
  cleanups.push(stopMockServer);
  return mockServer;
}
|
||||
|
||||
/**
 * POSTs `body` as JSON to the server's `/v1/responses` endpoint.
 *
 * @param server - Object exposing the server's `baseUrl`.
 * @param body - Request payload; serialized with `JSON.stringify`.
 * @returns The `fetch` response, with no status checking applied.
 */
async function postResponses(server: { baseUrl: string }, body: unknown) {
  const endpoint = `${server.baseUrl}/v1/responses`;
  const requestInit = {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: JSON.stringify(body),
  };
  return fetch(endpoint, requestInit);
}
|
||||
|
||||
/**
 * Posts `body` to `/v1/responses`, asserts an HTTP 200 status, and returns
 * the raw response body as text.
 *
 * @param server - Object exposing the server's `baseUrl`.
 * @param body - Request payload forwarded to `postResponses`.
 * @returns The unparsed response body.
 */
async function expectResponsesText(server: { baseUrl: string }, body: unknown) {
  const reply = await postResponses(server, body);
  expect(reply.status).toBe(200);
  const rawBody = await reply.text();
  return rawBody;
}
|
||||
|
||||
/**
 * Posts `body` to `/v1/responses`, asserts an HTTP 200 status, and returns
 * the parsed JSON body.
 *
 * The result is cast to `T` without runtime validation, so callers should
 * treat every field as optional.
 *
 * @param server - Object exposing the server's `baseUrl`.
 * @param body - Request payload forwarded to `postResponses`.
 * @returns The parsed JSON body, typed as `T`.
 */
async function expectResponsesJson<T>(server: { baseUrl: string }, body: unknown) {
  const reply = await postResponses(server, body);
  expect(reply.status).toBe(200);
  const parsed = (await reply.json()) as T;
  return parsed;
}
|
||||
|
||||
/**
 * Wraps plain text in a user-role input item with a single `input_text`
 * content part.
 *
 * @param text - The user message text.
 * @returns `{ role: "user", content: [{ type: "input_text", text }] }`.
 */
function makeUserInput(text: string) {
  const textPart = { type: "input_text" as const, text };
  return { role: "user" as const, content: [textPart] };
}
|
||||
|
||||
describe("qa mock openai server", () => {
|
||||
it("serves health and streamed responses", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
@@ -1750,6 +1802,204 @@ describe("qa mock openai server", () => {
|
||||
const debug = (await debugResponse.json()) as { model: string };
|
||||
expect(debug.model).toBe("claude-opus-4-6");
|
||||
});
|
||||
|
||||
it("scripts a reasoning-only recovery sequence after a replay-safe read", async () => {
  const server = await startMockServer();
  // Tool result replayed on every follow-up request in this scenario.
  const kickoffReadOutput = {
    type: "function_call_output",
    output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
  };

  // Request 1: no tool output yet, so the mock should plan a `read` call.
  const toolPlan = await expectResponsesText(server, {
    stream: true,
    model: "gpt-5.4",
    input: [makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT)],
  });
  expect(toolPlan).toContain('"name":"read"');
  expect(toolPlan).toContain("QA_KICKOFF_TASK.md");

  // Request 2: tool output present, no retry instruction -> reasoning-only turn.
  const reasoningOnlyTurn = await expectResponsesJson<{
    output?: Array<{ type?: string; id?: string; summary?: Array<{ text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT), kickoffReadOutput],
  });
  expect(reasoningOnlyTurn).toMatchObject({
    output: [
      {
        type: "reasoning",
        id: "rs_mock_reasoning_recovery",
        summary: [{ text: expect.stringContaining("Need visible answer") }],
      },
    ],
  });

  // Request 3: retry instruction present -> the visible answer is produced.
  const recoveredTurn = await expectResponsesJson<{
    output?: Array<{ content?: Array<{ text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [
      makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT),
      makeUserInput(QA_REASONING_ONLY_RETRY_INSTRUCTION),
      kickoffReadOutput,
    ],
  });
  expect(recoveredTurn).toMatchObject({
    output: [{ content: [{ text: "REASONING-RECOVERED-OK" }] }],
  });

  // The debug endpoint should have recorded all three requests in order.
  const requests = await fetch(`${server.baseUrl}/debug/requests`);
  expect(requests.status).toBe(200);
  const recordedRequests = await requests.json();
  expect(recordedRequests).toMatchObject([
    { plannedToolName: "read" },
    { allInputText: expect.stringContaining(QA_REASONING_ONLY_RECOVERY_PROMPT) },
    { allInputText: expect.stringContaining(QA_REASONING_ONLY_RETRY_INSTRUCTION) },
  ]);
});
|
||||
|
||||
it("keeps the reasoning-only side-effect path ready for no-auto-retry QA coverage", async () => {
  const server = await startMockServer();
  const sideEffectPrompt = makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT);

  // Request 1: no tool output yet, so the mock should plan a `write` call.
  const toolPlan = await expectResponsesText(server, {
    stream: true,
    model: "gpt-5.4",
    input: [sideEffectPrompt],
  });
  expect(toolPlan).toContain('"name":"write"');
  expect(toolPlan).toContain("reasoning-only-side-effect.txt");

  // Request 2: the write result is replayed without a retry instruction, so
  // the mock answers with a reasoning-only turn.
  const afterWriteTurn = await expectResponsesJson<{
    output?: Array<{ type?: string; id?: string }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [
      sideEffectPrompt,
      {
        type: "function_call_output",
        output: "Successfully wrote 28 bytes to reasoning-only-side-effect.txt.",
      },
    ],
  });
  expect(afterWriteTurn).toMatchObject({
    output: [{ type: "reasoning", id: "rs_mock_reasoning_side_effect" }],
  });

  // Exactly the two requests above should have been recorded.
  const requests = await fetch(`${server.baseUrl}/debug/requests`);
  expect(requests.status).toBe(200);
  const recordedRequests = (await requests.json()) as Array<{ allInputText?: string }>;
  expect(recordedRequests).toHaveLength(2);
});
|
||||
|
||||
it("scripts an empty-response recovery sequence after a replay-safe read", async () => {
  const server = await startMockServer();
  // Tool result replayed on every follow-up request in this scenario.
  const kickoffReadOutput = {
    type: "function_call_output",
    output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
  };

  // Request 1: no tool output yet, so the mock should plan a `read` call.
  const toolPlan = await expectResponsesText(server, {
    stream: true,
    model: "gpt-5.4",
    input: [makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT)],
  });
  expect(toolPlan).toContain('"name":"read"');

  // Request 2: tool output present, no retry instruction -> empty visible text.
  const emptyTurn = await expectResponsesJson<{
    output?: Array<{ content?: Array<{ type?: string; text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT), kickoffReadOutput],
  });
  expect(emptyTurn).toMatchObject({
    output: [{ content: [{ type: "output_text", text: "" }] }],
  });

  // Request 3: retry instruction present -> the visible answer is produced.
  const recoveredTurn = await expectResponsesJson<{
    output?: Array<{ content?: Array<{ text?: string }> }>;
  }>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [
      makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT),
      makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
      kickoffReadOutput,
    ],
  });
  expect(recoveredTurn).toMatchObject({
    output: [{ content: [{ text: "EMPTY-RECOVERED-OK" }] }],
  });
});
|
||||
|
||||
it("can keep emitting empty GPT turns when the single retry budget should exhaust", async () => {
  const server = await startMockServer();
  type VisibleTurn = { output?: Array<{ content?: Array<{ text?: string }> }> };
  // Tool result replayed on every follow-up request in this scenario.
  const kickoffReadOutput = {
    type: "function_call_output",
    output: "QA mission: Understand this OpenClaw repo from source + docs before acting.",
  };

  // Request 1: kick off the scenario; only the HTTP 200 status is checked.
  await expectResponsesText(server, {
    stream: true,
    model: "gpt-5.4",
    input: [makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT)],
  });

  // Request 2: tool output present, no retry instruction -> empty visible text.
  const firstEmpty = await expectResponsesJson<VisibleTurn>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT), kickoffReadOutput],
  });
  expect(firstEmpty.output?.[0]?.content?.[0]?.text).toBe("");

  // Request 3: even with the retry instruction the mock stays empty.
  const secondEmpty = await expectResponsesJson<VisibleTurn>(server, {
    stream: false,
    model: "gpt-5.4",
    input: [
      makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT),
      makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION),
      kickoffReadOutput,
    ],
  });
  expect(secondEmpty.output?.[0]?.content?.[0]?.text).toBe("");
});
|
||||
});
|
||||
|
||||
describe("resolveProviderVariant", () => {
|
||||
|
||||
@@ -124,6 +124,14 @@ type AnthropicMessagesRequest = {
|
||||
|
||||
/** Base64-encoded PNG image fixture. */
const TINY_PNG_BASE64 =
  "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO7Z0nQAAAAASUVORK5CYII=";

// Case-insensitive matchers that pick a scripted "broken turn" scenario from
// the request text.

/** Matches the reasoning-only continuation (recovery) scenario prompt. */
const QA_REASONING_ONLY_RECOVERY_PROMPT_RE = /reasoning-only continuation qa check/i;

/** Matches the reasoning-only-after-write (side-effect) scenario prompt. */
const QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE = /reasoning-only after write safety check/i;

/** Matches the empty-response continuation (recovery) scenario prompt. */
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;

/** Matches the empty-response exhaustion scenario prompt. */
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;

// Case-sensitive substrings used to detect that a retry instruction is
// present in the request text.

/** Substring identifying the reasoning-only retry instruction. */
const QA_REASONING_ONLY_RETRY_NEEDLE =
  "recorded reasoning but did not produce a user-visible answer";

/** Substring identifying the empty-response retry instruction. */
const QA_EMPTY_RESPONSE_RETRY_NEEDLE =
  "The previous attempt did not produce a user-visible answer.";
|
||||
|
||||
type MockScenarioState = {
|
||||
subagentFanoutPhase: number;
|
||||
@@ -718,6 +726,37 @@ function buildAssistantEvents(text: string): StreamEvent[] {
|
||||
];
|
||||
}
|
||||
|
||||
/**
 * Builds the event sequence for a mock turn whose only output item is a
 * reasoning item, with no user-visible message.
 *
 * Three events are emitted in order:
 *  1. `response.output_item.added` with an empty-summary reasoning item,
 *  2. `response.output_item.done` with the completed reasoning item,
 *  3. `response.completed` whose `output` holds that same reasoning item.
 *
 * @param summaryText - Text placed in the reasoning item's single summary part.
 * @param id - Reasoning item id; the response id is derived as `resp_${id}`.
 * @returns The three stream events described above.
 */
function buildReasoningOnlyEvents(summaryText: string, id: string): StreamEvent[] {
  const reasoningItem = {
    type: "reasoning",
    id,
    // Summary parts in the OpenAI Responses API carry a `summary_text`
    // discriminator; include it so clients that filter on the part type
    // still see the summary text.
    summary: [{ type: "summary_text", text: summaryText }],
  } as const;
  return [
    {
      type: "response.output_item.added",
      // The item is announced with an empty summary; the full summary only
      // appears on the `done` and `completed` events.
      item: { type: "reasoning", id, summary: [] },
    },
    {
      type: "response.output_item.done",
      item: reasoningItem,
    },
    {
      type: "response.completed",
      response: {
        id: `resp_${id}`,
        status: "completed",
        output: [reasoningItem],
        // Fixed placeholder token counts.
        usage: { input_tokens: 64, output_tokens: 8, total_tokens: 72 },
      },
    },
  ];
}
|
||||
|
||||
async function buildResponsesPayload(
|
||||
body: Record<string, unknown>,
|
||||
scenarioState: MockScenarioState,
|
||||
@@ -729,12 +768,56 @@ async function buildResponsesPayload(
|
||||
const allInputText = extractAllRequestTexts(input, body);
|
||||
const isGroupChat = allInputText.includes('"is_group_chat": true');
|
||||
const isBaselineUnmentionedChannelChatter = /\bno bot ping here\b/i.test(prompt);
|
||||
const hasReasoningOnlyRetryInstruction = allInputText.includes(QA_REASONING_ONLY_RETRY_NEEDLE);
|
||||
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
|
||||
if (/remember this fact/i.test(prompt)) {
|
||||
return buildAssistantEvents(buildAssistantText(input, body, scenarioState));
|
||||
}
|
||||
if (isHeartbeatPrompt(prompt)) {
|
||||
return buildAssistantEvents("HEARTBEAT_OK");
|
||||
}
|
||||
if (QA_REASONING_ONLY_RECOVERY_PROMPT_RE.test(allInputText)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
}
|
||||
if (!hasReasoningOnlyRetryInstruction) {
|
||||
return buildReasoningOnlyEvents(
|
||||
"Need visible answer after reading the QA kickoff task.",
|
||||
"rs_mock_reasoning_recovery",
|
||||
);
|
||||
}
|
||||
return buildAssistantEvents("REASONING-RECOVERED-OK");
|
||||
}
|
||||
if (QA_REASONING_ONLY_SIDE_EFFECT_PROMPT_RE.test(allInputText)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("write", {
|
||||
path: "reasoning-only-side-effect.txt",
|
||||
content: "side effects already happened\n",
|
||||
});
|
||||
}
|
||||
if (!hasReasoningOnlyRetryInstruction) {
|
||||
return buildReasoningOnlyEvents(
|
||||
"Need visible answer after the write, but the write already happened.",
|
||||
"rs_mock_reasoning_side_effect",
|
||||
);
|
||||
}
|
||||
return buildAssistantEvents("BUG-SHOULD-NOT-AUTO-RETRY");
|
||||
}
|
||||
if (QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE.test(allInputText)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
}
|
||||
if (!hasEmptyResponseRetryInstruction) {
|
||||
return buildAssistantEvents("");
|
||||
}
|
||||
return buildAssistantEvents("EMPTY-RECOVERED-OK");
|
||||
}
|
||||
if (QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE.test(allInputText)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
}
|
||||
return buildAssistantEvents("");
|
||||
}
|
||||
if (/lobster invaders/i.test(prompt)) {
|
||||
if (!toolOutput) {
|
||||
return buildToolCallEventsWithArgs("read", { path: "QA_KICKOFF_TASK.md" });
|
||||
|
||||
Reference in New Issue
Block a user