mirror of
https://github.com/openclaw/openclaw.git
synced 2026-05-28 03:33:52 +00:00
fix(qa): force Matrix mention progress search
This commit is contained in:
@@ -397,6 +397,29 @@ describe("qa mock openai server", () => {
|
||||
expect(final.output[0]?.content?.[0]?.text).toBe("TOOL_PROGRESS_MARKER_OK");
|
||||
});
|
||||
|
||||
// Verifies that a tool-progress prompt quoting an exact exec command makes the mock
// server answer with an `exec` tool call that carries that command verbatim.
it("plans deterministic tool-progress exec commands from exact command prompts", async () => {
  const server = await startMockServer();
  const command =
    "rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2";
  const prompt = `Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${command}\`. After that exec command completes or fails, reply exactly \`TOOL_PROGRESS_EXEC_OK\`.`;

  const response = await fetch(`${server.baseUrl}/v1/responses`, {
    method: "POST",
    headers: {
      "content-type": "application/json",
    },
    body: JSON.stringify({
      stream: true,
      input: [makeUserInput(prompt)],
    }),
  });

  expect(response.status).toBe(200);
  // The streamed body is checked as raw text: it must name the exec tool and
  // contain the command exactly as it was quoted in the prompt.
  const body = await response.text();
  expect(body).toContain('"name":"exec"');
  expect(body).toContain(command);
});
|
||||
|
||||
it("honors exact replies after QA kickoff reads without marker wording", async () => {
|
||||
const server = await startMockServer();
|
||||
const prompt =
|
||||
|
||||
@@ -635,6 +635,14 @@ function readTargetFromPrompt(prompt: string) {
|
||||
return "repo/package.json";
|
||||
}
|
||||
|
||||
/**
 * Extracts the shell command that a tool-progress QA prompt asks the model to run.
 *
 * The prompt must contain the phrase "call the exec tool exactly once with this
 * exact command before answering:" (matched case-insensitively), followed by the
 * command wrapped in a single pair of backticks.
 *
 * @param prompt - Prompt text to scan.
 * @returns The command with surrounding whitespace trimmed, or `null` when the
 *   phrase is absent or the backtick-quoted command is blank.
 */
function execCommandFromToolProgressPrompt(prompt: string): string | null {
  const match =
    /call the exec tool exactly once with this exact command before answering:\s*`([^`]+)`/i.exec(
      prompt,
    );
  // `||` rather than `??` is deliberate: a whitespace-only command trims to ""
  // and must be reported as "no command" (null), not as an empty command.
  return match?.[1]?.trim() || null;
}
|
||||
|
||||
function buildToolCallEventsWithArgs(name: string, args: Record<string, unknown>): StreamEvent[] {
|
||||
const serialized = JSON.stringify(args);
|
||||
const callSuffix = createHash("sha1")
|
||||
@@ -1626,6 +1634,11 @@ async function buildResponsesPayload(
|
||||
path: readTargetFromPrompt(toolProgressPrompt || prompt || allInputText),
|
||||
});
|
||||
};
|
||||
/**
 * Builds the stream events for an `exec` tool call whose command is quoted in a
 * tool-progress prompt.
 *
 * The prompt text is whatever `extractLastMatchingUserText` selects from the
 * user texts of `input` for `pattern`; when that is empty the lookup falls back
 * to `prompt`, then to `allInputText`.
 *
 * @param pattern - Expression used to select the tool-progress user text.
 * @returns Events calling `exec` with `{ command }`, or `null` when no exact
 *   command can be extracted from the selected text.
 */
const buildToolProgressExecEvents = (pattern: RegExp) => {
  const toolProgressPrompt = extractLastMatchingUserText(extractAllUserTexts(input), pattern);
  const command = execCommandFromToolProgressPrompt(toolProgressPrompt || prompt || allInputText);
  return command ? buildToolCallEventsWithArgs("exec", { command }) : null;
};
|
||||
if (
|
||||
(QA_TOOL_SEARCH_PROMPT_RE.test(allInputText) ||
|
||||
QA_TOOL_SEARCH_FAILURE_PROMPT_RE.test(allInputText)) &&
|
||||
@@ -1812,7 +1825,10 @@ async function buildResponsesPayload(
|
||||
}
|
||||
if (QA_TOOL_PROGRESS_PROMPT_RE.test(allInputText) && toolProgressReplyDirective) {
  if (!toolOutput) {
    // No tool result yet: prefer the exact exec command quoted in the prompt and
    // fall back to the read plan when the prompt does not quote one. An
    // unconditional read return placed ahead of this would make the exec plan
    // unreachable.
    return (
      buildToolProgressExecEvents(QA_TOOL_PROGRESS_PROMPT_RE) ??
      buildToolProgressReadEvents(QA_TOOL_PROGRESS_PROMPT_RE)
    );
  }
  // A tool result is already present, so answer with the requested exact reply.
  return buildAssistantEvents(toolProgressReplyDirective);
}
|
||||
|
||||
@@ -32,7 +32,6 @@ import {
|
||||
createMatrixQaScenarioClient,
|
||||
isMatrixQaExactMarkerReply,
|
||||
isMatrixQaMessageLikeKind,
|
||||
MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME,
|
||||
MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME,
|
||||
primeMatrixQaActorCursor,
|
||||
primeMatrixQaDriverScenarioClient,
|
||||
@@ -870,7 +869,6 @@ async function runMatrixToolProgressScenario(
|
||||
finalText: string;
|
||||
allowFinalOnly?: boolean;
|
||||
allowTopLevelFinalWithProgress?: boolean;
|
||||
taskFilename?: string;
|
||||
label: string;
|
||||
allowGenericProgressLine?: boolean;
|
||||
mentionSafety?: boolean;
|
||||
@@ -880,7 +878,7 @@ async function runMatrixToolProgressScenario(
|
||||
) {
|
||||
const { client, startSince } = await primeMatrixQaDriverScenarioClient(context);
|
||||
const startObservedIndex = context.observedEvents.length;
|
||||
await writeMatrixToolProgressTaskFile(context, params.finalText, params.taskFilename);
|
||||
await writeMatrixToolProgressTaskFile(context, params.finalText);
|
||||
const triggerBody = params.triggerBodyBuilder(context.sutUserId, params.finalText);
|
||||
const driverEventId = await client.sendTextMessage({
|
||||
body: triggerBody,
|
||||
@@ -1188,13 +1186,12 @@ async function runMatrixToolProgressScenario(
|
||||
async function writeMatrixToolProgressTaskFile(
|
||||
context: MatrixQaScenarioContext,
|
||||
finalText: string,
|
||||
taskFilename = MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME,
|
||||
) {
|
||||
if (!context.gatewayWorkspaceDir) {
|
||||
return;
|
||||
}
|
||||
await writeFile(
|
||||
path.join(context.gatewayWorkspaceDir, taskFilename),
|
||||
path.join(context.gatewayWorkspaceDir, MATRIX_QA_TOOL_PROGRESS_TASK_FILENAME),
|
||||
`${buildMatrixToolProgressTaskContent(finalText)}\n`,
|
||||
"utf8",
|
||||
);
|
||||
@@ -1231,7 +1228,6 @@ export async function runToolProgressMentionSafetyScenario(context: MatrixQaScen
|
||||
finalText: buildMatrixQaToken("MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE"),
|
||||
label: "tool progress mention safety",
|
||||
mentionSafety: true,
|
||||
taskFilename: MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME,
|
||||
progressPattern: /@room|@alice:matrix-qa\.test|!room:matrix-qa\.test/i,
|
||||
triggerBodyBuilder: buildMatrixToolProgressMentionSafetyPrompt,
|
||||
});
|
||||
|
||||
@@ -121,10 +121,12 @@ export function buildMatrixToolProgressErrorPrompt(sutUserId: string, text: stri
|
||||
}
|
||||
|
||||
/**
 * Builds the trigger message for the tool-progress mention-safety scenario.
 *
 * The message instructs the agent to run one exact `exec` command (an `rg`
 * search for `MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME` followed by `sleep 2`)
 * and to reply with `text` only after that command completes or fails. It
 * carries a single tool instruction and a single reply directive, so the
 * instructions cannot contradict each other.
 *
 * @param sutUserId - Prefix of the message (presumably the Matrix ID of the
 *   system under test, so that the message addresses it; confirm against callers).
 * @param text - Exact reply the agent must send after the exec command.
 * @returns The prompt sentences joined with single spaces.
 */
export function buildMatrixToolProgressMentionSafetyPrompt(sutUserId: string, text: string): string {
  const command = `rg -n '${MATRIX_QA_TOOL_PROGRESS_MENTION_FILENAME}' . ; sleep 2`;
  return [
    `${sutUserId} Tool progress QA check: call the exec tool exactly once with this exact command before answering: \`${command}\`.`,
    `The QA harness must observe that exec tool call in a Matrix tool-progress preview.`,
    `Do not guess or send any marker before the tool result returns.`,
    `After that exec command completes or fails, reply exactly \`${text}\`.`,
  ].join(" ");
}
|
||||
|
||||
|
||||
@@ -3865,87 +3865,73 @@ describe("matrix live qa scenarios", () => {
|
||||
|
||||
it("keeps Matrix-looking tool progress mentions inert in partial previews", async () => {
|
||||
const previewEventId = "$tool-progress-mention-preview";
|
||||
const gatewayWorkspaceDir = await mkdtemp(path.join(os.tmpdir(), "matrix-qa-workspace-"));
|
||||
try {
|
||||
const { sendTextMessage } = mockMatrixQaRoomClient({
|
||||
driverEventId: "$tool-progress-mention-trigger",
|
||||
events: [
|
||||
{
|
||||
event: matrixQaMessageEvent({
|
||||
kind: "message",
|
||||
const { sendTextMessage } = mockMatrixQaRoomClient({
|
||||
driverEventId: "$tool-progress-mention-trigger",
|
||||
events: [
|
||||
{
|
||||
event: matrixQaMessageEvent({
|
||||
kind: "message",
|
||||
eventId: previewEventId,
|
||||
body: "Working...\n- `tool: exec`",
|
||||
}),
|
||||
since: "driver-sync-preview",
|
||||
},
|
||||
{
|
||||
event: matrixQaMessageEvent({
|
||||
kind: "message",
|
||||
eventId: "$tool-progress-mention-edit",
|
||||
body:
|
||||
'Working...\n- `search "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt" in . -> run sleep 2`',
|
||||
formattedBody:
|
||||
'Working...<br><ul><li><code>search "matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt" in . -> run sleep 2</code></li></ul>',
|
||||
mentions: {},
|
||||
relatesTo: {
|
||||
relType: "m.replace",
|
||||
eventId: previewEventId,
|
||||
body: "Working...\n- `tool: read`",
|
||||
}),
|
||||
since: "driver-sync-preview",
|
||||
},
|
||||
{
|
||||
event: matrixQaMessageEvent({
|
||||
},
|
||||
}),
|
||||
since: "driver-sync-progress",
|
||||
},
|
||||
{
|
||||
event: ({ sendTextMessage }) =>
|
||||
matrixQaMessageEvent({
|
||||
kind: "message",
|
||||
eventId: "$tool-progress-mention-edit",
|
||||
body: "Working...\n- `tool: read`\n- `read from matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`",
|
||||
formattedBody:
|
||||
"Working...<br><ul><li><code>read from matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt</code></li></ul>",
|
||||
mentions: {},
|
||||
eventId: "$tool-progress-mention-final",
|
||||
body: readMatrixQaReplyDirective(
|
||||
mockMessageBody(sendTextMessage, "sendTextMessage"),
|
||||
"MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED",
|
||||
),
|
||||
relatesTo: {
|
||||
relType: "m.replace",
|
||||
eventId: previewEventId,
|
||||
},
|
||||
}),
|
||||
since: "driver-sync-progress",
|
||||
},
|
||||
{
|
||||
event: ({ sendTextMessage }) =>
|
||||
matrixQaMessageEvent({
|
||||
kind: "message",
|
||||
eventId: "$tool-progress-mention-final",
|
||||
body: readMatrixQaReplyDirective(
|
||||
mockMessageBody(sendTextMessage, "sendTextMessage"),
|
||||
"MATRIX_QA_TOOL_PROGRESS_MENTION_SAFE_FIXED",
|
||||
),
|
||||
relatesTo: {
|
||||
relType: "m.replace",
|
||||
eventId: previewEventId,
|
||||
},
|
||||
}),
|
||||
since: "driver-sync-next",
|
||||
},
|
||||
],
|
||||
});
|
||||
since: "driver-sync-next",
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety");
|
||||
const scenario = requireMatrixQaScenario("matrix-room-tool-progress-mention-safety");
|
||||
|
||||
const result = await runMatrixQaScenario(scenario, {
|
||||
...matrixQaScenarioContext(),
|
||||
gatewayWorkspaceDir,
|
||||
});
|
||||
const artifacts = result.artifacts as {
|
||||
driverEventId?: unknown;
|
||||
previewEventId?: unknown;
|
||||
previewMentions?: unknown;
|
||||
reply?: { eventId?: unknown };
|
||||
token?: unknown;
|
||||
};
|
||||
expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger");
|
||||
expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview");
|
||||
expect(artifacts.previewMentions).toEqual({});
|
||||
expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final");
|
||||
const prompt = mockMessageBody(sendTextMessage, "sendTextMessage");
|
||||
expect(prompt).toContain(
|
||||
"call the read tool exactly once on `matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt`",
|
||||
);
|
||||
expect(prompt).toContain("Do not use search for this check.");
|
||||
await expect(
|
||||
readFile(
|
||||
path.join(
|
||||
gatewayWorkspaceDir,
|
||||
"matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt",
|
||||
),
|
||||
"utf8",
|
||||
),
|
||||
).resolves.toContain(String(artifacts.token));
|
||||
} finally {
|
||||
await rm(gatewayWorkspaceDir, { force: true, recursive: true });
|
||||
}
|
||||
const result = await runMatrixQaScenario(scenario, matrixQaScenarioContext());
|
||||
const artifacts = result.artifacts as {
|
||||
driverEventId?: unknown;
|
||||
previewEventId?: unknown;
|
||||
previewMentions?: unknown;
|
||||
reply?: { eventId?: unknown };
|
||||
};
|
||||
expect(artifacts.driverEventId).toBe("$tool-progress-mention-trigger");
|
||||
expect(artifacts.previewEventId).toBe("$tool-progress-mention-preview");
|
||||
expect(artifacts.previewMentions).toEqual({});
|
||||
expect(artifacts.reply?.eventId).toBe("$tool-progress-mention-final");
|
||||
const prompt = mockMessageBody(sendTextMessage, "sendTextMessage");
|
||||
expect(prompt).toContain(
|
||||
"call the exec tool exactly once with this exact command before answering",
|
||||
);
|
||||
expect(prompt).toContain(
|
||||
"`rg -n 'matrix-progress-@room-@alice:matrix-qa.test-!room:matrix-qa.test.txt' . ; sleep 2`",
|
||||
);
|
||||
expect(prompt).toContain("The QA harness must observe that exec tool call");
|
||||
});
|
||||
|
||||
it("preserves separate finalized block events when Matrix block streaming is enabled", async () => {
|
||||
|
||||
Reference in New Issue
Block a user