mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-06 15:50:46 +00:00
qa-lab: generalize tool progress prompts
This commit is contained in:
@@ -206,6 +206,21 @@ describe("qa mock openai server", () => {
|
||||
expect(quietBody).toContain('"phase":"final_answer"');
|
||||
expect(quietBody).toContain("QA_STREAMING_OK");
|
||||
|
||||
const partialResponse = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"content-type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [makeUserInput("Partial streaming QA check: reply exactly `QA_PARTIAL_OK`.")],
|
||||
}),
|
||||
});
|
||||
expect(partialResponse.status).toBe(200);
|
||||
const partialBody = await partialResponse.text();
|
||||
expect(partialBody).toContain('"type":"response.output_text.delta"');
|
||||
expect(partialBody).toContain("QA_PARTIAL_OK");
|
||||
|
||||
const blockResponse = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
@@ -228,6 +243,113 @@ describe("qa mock openai server", () => {
|
||||
expect(blockBody).toContain("BLOCK_TWO_OK");
|
||||
});
|
||||
|
||||
it("plans deterministic tool-progress reads from prompt paths", async () => {
|
||||
const server = await startMockServer();
|
||||
|
||||
const response = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"content-type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [
|
||||
makeUserInput(
|
||||
"Tool progress QA check: read `qa-progress-target.txt` before answering. After the read completes, reply exactly `TOOL_PROGRESS_OK`.",
|
||||
),
|
||||
],
|
||||
}),
|
||||
});
|
||||
|
||||
expect(response.status).toBe(200);
|
||||
const body = await response.text();
|
||||
expect(body).toContain('"name":"read"');
|
||||
expect(body).toContain("qa-progress-target.txt");
|
||||
});
|
||||
|
||||
it("requires deterministic tool-progress error prompts to observe a failed tool", async () => {
|
||||
const server = await startMockServer();
|
||||
const prompt =
|
||||
"Tool progress error QA check: read `missing-tool-progress-target.txt` before answering. After the read fails, reply exactly `TOOL_PROGRESS_ERROR_OK`.";
|
||||
|
||||
const toolPlan = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"content-type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [makeUserInput(prompt)],
|
||||
}),
|
||||
});
|
||||
|
||||
expect(toolPlan.status).toBe(200);
|
||||
const toolPlanBody = await toolPlan.text();
|
||||
expect(toolPlanBody).toContain('"name":"read"');
|
||||
expect(toolPlanBody).toContain("missing-tool-progress-target.txt");
|
||||
|
||||
const successOutput = await expectResponsesJson<{
|
||||
output: Array<{ content?: Array<{ text?: string }> }>;
|
||||
}>(server, {
|
||||
stream: false,
|
||||
input: [
|
||||
makeUserInput(prompt),
|
||||
{
|
||||
type: "function_call_output",
|
||||
call_id: "call_mock_read_1",
|
||||
output: JSON.stringify({ text: "unexpected success" }),
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(successOutput.output[0]?.content?.[0]?.text).toBe("BUG-TOOL-DID-NOT-FAIL");
|
||||
|
||||
const errorOutput = await expectResponsesJson<{
|
||||
output: Array<{ content?: Array<{ text?: string }> }>;
|
||||
}>(server, {
|
||||
stream: false,
|
||||
input: [
|
||||
makeUserInput(prompt),
|
||||
{
|
||||
type: "function_call_output",
|
||||
call_id: "call_mock_read_1",
|
||||
output: JSON.stringify({ error: "ENOENT: no such file or directory" }),
|
||||
},
|
||||
],
|
||||
});
|
||||
expect(errorOutput.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_ERROR_OK");
|
||||
});
|
||||
|
||||
it("uses the latest user prompt path for tool-progress plans", async () => {
|
||||
const server = await startMockServer();
|
||||
|
||||
const response = await fetch(`${server.baseUrl}/v1/responses`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"content-type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
stream: true,
|
||||
input: [
|
||||
makeUserInput(
|
||||
"Tool progress QA check: read `older-progress-target.txt` before answering. After the read completes, reply exactly `OLD_PROGRESS_OK`.",
|
||||
),
|
||||
makeUserInput(
|
||||
"Tool progress error QA check: read `latest-missing-progress-target.txt` before answering. After the read fails, reply exactly `LATEST_PROGRESS_OK`.",
|
||||
),
|
||||
makeUserInput(
|
||||
"Continue with the QA scenario plan and report worked, failed, and blocked items.",
|
||||
),
|
||||
],
|
||||
}),
|
||||
});
|
||||
|
||||
expect(response.status).toBe(200);
|
||||
const body = await response.text();
|
||||
expect(body).toContain('"name":"read"');
|
||||
expect(body).toContain("latest-missing-progress-target.txt");
|
||||
expect(body).not.toContain("older-progress-target.txt");
|
||||
});
|
||||
|
||||
it("prefers path-like refs over generic quoted keys in prompts", async () => {
|
||||
const server = await startQaMockOpenAiServer({
|
||||
host: "127.0.0.1",
|
||||
@@ -1674,7 +1796,7 @@ describe("qa mock openai server", () => {
|
||||
content: [
|
||||
{
|
||||
type: "input_text",
|
||||
text: "@qa-sut:matrix-qa.test reply with only this exact marker: MATRIX_QA_CANARY_TEST",
|
||||
text: "@qa-sut.example.test reply with only this exact marker: QA_CANARY_TEST",
|
||||
},
|
||||
],
|
||||
},
|
||||
@@ -1695,7 +1817,7 @@ describe("qa mock openai server", () => {
|
||||
expect(await response.json()).toMatchObject({
|
||||
output: [
|
||||
{
|
||||
content: [{ text: "MATRIX_QA_CANARY_TEST" }],
|
||||
content: [{ text: "QA_CANARY_TEST" }],
|
||||
},
|
||||
],
|
||||
});
|
||||
@@ -1710,8 +1832,8 @@ describe("qa mock openai server", () => {
|
||||
await server.stop();
|
||||
});
|
||||
|
||||
const matrixPrompt =
|
||||
"@qa-sut:matrix-qa.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
|
||||
const channelPrompt =
|
||||
"@qa-sut.example.test Image generation check: generate a QA lighthouse image and summarize it in one short sentence.";
|
||||
const genericPrompt =
|
||||
"Continue with the QA scenario plan and report worked, failed, and blocked items.";
|
||||
|
||||
@@ -1722,7 +1844,7 @@ describe("qa mock openai server", () => {
|
||||
},
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
input: [makeUserInput(matrixPrompt), makeUserInput(genericPrompt)],
|
||||
input: [makeUserInput(channelPrompt), makeUserInput(genericPrompt)],
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -1745,7 +1867,7 @@ describe("qa mock openai server", () => {
|
||||
body: JSON.stringify({
|
||||
stream: false,
|
||||
input: [
|
||||
makeUserInput(matrixPrompt),
|
||||
makeUserInput(channelPrompt),
|
||||
makeUserInput(genericPrompt),
|
||||
{
|
||||
type: "function_call",
|
||||
|
||||
@@ -145,8 +145,10 @@ const QA_THINKING_VISIBILITY_OFF_PROMPT_RE = /qa thinking visibility check off/i
|
||||
const QA_THINKING_VISIBILITY_MAX_PROMPT_RE = /qa thinking visibility check max/i;
|
||||
const QA_EMPTY_RESPONSE_RECOVERY_PROMPT_RE = /empty response continuation qa check/i;
|
||||
const QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT_RE = /empty response exhaustion qa check/i;
|
||||
const QA_QUIET_STREAMING_PROMPT_RE = /quiet streaming qa check/i;
|
||||
const QA_STREAMING_PROMPT_RE = /(?:partial|quiet) streaming qa check/i;
|
||||
const QA_BLOCK_STREAMING_PROMPT_RE = /block streaming qa check/i;
|
||||
const QA_TOOL_PROGRESS_ERROR_PROMPT_RE = /tool progress error qa check/i;
|
||||
const QA_TOOL_PROGRESS_PROMPT_RE = /tool progress qa check/i;
|
||||
const QA_SUBAGENT_DIRECT_FALLBACK_PROMPT_RE = /subagent direct fallback qa check/i;
|
||||
const QA_SUBAGENT_DIRECT_FALLBACK_WORKER_RE = /subagent direct fallback worker/i;
|
||||
const QA_SUBAGENT_DIRECT_FALLBACK_MARKER = "QA-SUBAGENT-DIRECT-FALLBACK-OK";
|
||||
@@ -530,6 +532,16 @@ function extractLastCapture(text: string, pattern: RegExp) {
|
||||
return lastMatch?.[1]?.trim() || null;
|
||||
}
|
||||
|
||||
/**
 * Returns the last entry of `texts` that matches `pattern`.
 *
 * The list is scanned from its final element backwards, so when several
 * entries match, the one closest to the end wins.
 *
 * @param texts - Candidate texts to scan.
 * @param pattern - Pattern an entry must match to be selected.
 * @returns The matching entry nearest the end of `texts`, or `""` when no entry matches.
 */
function extractLastMatchingUserText(texts: string[], pattern: RegExp): string {
  for (let index = texts.length - 1; index >= 0; index -= 1) {
    const text = texts[index] ?? "";
    // `RegExp.prototype.test` resumes from `lastIndex` when the pattern carries
    // the `g` or `y` flag. Reset it before and after each probe so a reused
    // pattern neither skips a match here nor leaks state to the next caller.
    pattern.lastIndex = 0;
    const matched = pattern.test(text);
    pattern.lastIndex = 0;
    if (matched) {
      return text;
    }
  }
  return "";
}
|
||||
|
||||
function extractExactReplyDirective(text: string) {
|
||||
const backtickedMatch = extractLastCapture(text, /reply(?: with)? exactly\s+`([^`]+)`/i);
|
||||
if (backtickedMatch) {
|
||||
@@ -642,6 +654,19 @@ function extractToolErrorForNamedCall(params: {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
/**
 * Heuristically decides whether a tool result represents a failure.
 *
 * Checks are applied in order and the first hit wins:
 *  1. `toolJson.error` is a string with non-whitespace content;
 *  2. `toolJson.status` is a string containing `error`, `failed` or `failure`
 *     as a whole word (case-insensitive);
 *  3. the raw `toolOutput` text contains one of the failure phrases
 *     (`error`, `failed`, `failure`, `not found`, `no such file`, `enoent`)
 *     as a whole word (case-insensitive).
 *
 * @param toolJson - Tool output as an object, or `null` when none is available.
 * @param toolOutput - Raw tool output text.
 * @returns `true` when any of the checks above matches, otherwise `false`.
 */
function hasToolErrorOutput(toolJson: Record<string, unknown> | null, toolOutput: string) {
  const errorField = toolJson?.error;
  if (typeof errorField === "string" && errorField.trim() !== "") {
    return true;
  }

  const statusField = toolJson?.status;
  const statusSignalsFailure =
    typeof statusField === "string" && /\b(?:error|failed|failure)\b/i.test(statusField);

  // Fall back to scanning the raw text when the structured fields are inconclusive.
  return (
    statusSignalsFailure ||
    /\b(?:error|failed|failure|not found|no such file|enoent)\b/i.test(toolOutput)
  );
}
|
||||
|
||||
function isHeartbeatPrompt(text: string) {
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed || /remember this fact/i.test(trimmed)) {
|
||||
@@ -1165,6 +1190,12 @@ async function buildResponsesPayload(
|
||||
const hasEmptyResponseRetryInstruction = allInputText.includes(QA_EMPTY_RESPONSE_RETRY_NEEDLE);
|
||||
const canCallSessionsSpawn = hasDeclaredTool(body, "sessions_spawn");
|
||||
const canCallSessionsYield = hasDeclaredTool(body, "sessions_yield");
|
||||
const buildToolProgressReadEvents = (pattern: RegExp) => {
|
||||
const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
|
||||
return buildToolCallEventsWithArgs("read", {
|
||||
path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
|
||||
});
|
||||
};
|
||||
if (
|
||||
allInputText.includes(QA_SUBAGENT_DIRECT_FALLBACK_MARKER) &&
|
||||
/Internal task completion event/i.test(allInputText)
|
||||
@@ -1250,7 +1281,7 @@ async function buildResponsesPayload(
|
||||
}
|
||||
return buildAssistantEvents("");
|
||||
}
|
||||
if (QA_QUIET_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
|
||||
if (QA_STREAMING_PROMPT_RE.test(allInputText) && exactReplyDirective) {
|
||||
return buildAssistantEvents([
|
||||
{
|
||||
id: "msg_mock_quiet_stream",
|
||||
@@ -1260,6 +1291,20 @@ async function buildResponsesPayload(
|
||||
},
|
||||
]);
|
||||
}
|
||||
if (QA_TOOL_PROGRESS_ERROR_PROMPT_RE.test(allInputText) && exactReplyDirective) {
|
||||
if (!toolOutput) {
|
||||
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_ERROR_PROMPT_RE);
|
||||
}
|
||||
return buildAssistantEvents(
|
||||
hasToolErrorOutput(toolJson, toolOutput) ? exactReplyDirective : "BUG-TOOL-DID-NOT-FAIL",
|
||||
);
|
||||
}
|
||||
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && exactReplyDirective) {
|
||||
if (!toolOutput) {
|
||||
return buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE);
|
||||
}
|
||||
return buildAssistantEvents(exactReplyDirective);
|
||||
}
|
||||
if (
|
||||
QA_BLOCK_STREAMING_PROMPT_RE.test(allInputText) &&
|
||||
firstExactMarkerDirective &&
|
||||
|
||||
Reference in New Issue
Block a user