test: gate qa character concurrency

This commit is contained in:
Shakker
2026-05-08 23:53:22 +01:00
parent 56fa80d334
commit 49e2f91335

View File

@@ -40,6 +40,55 @@ function makeRunSuite(transcriptForModel: (model: string) => string = defaultMod
);
}
/**
 * Builds a test helper that parks every task passed to `run` until the test
 * explicitly opens the gate, while tracking how many tasks are in flight.
 *
 * This lets a test observe the peak number of simultaneously started tasks
 * without relying on timers.
 *
 * @param expectedActive - In-flight task count at which the promise behind
 *   `waitForExpectedActive` is resolved.
 * @returns An object exposing:
 *   - `maxActive`: the highest in-flight count seen so far;
 *   - `run(work)`: registers a task, waits for the gate, then runs `work`;
 *   - `waitForExpectedActive()`: resolves once `expectedActive` tasks have started;
 *   - `releaseStartedTasks()`: opens the gate for parked and future tasks.
 */
function createConcurrencyGate(expectedActive: number) {
  let inFlight = 0;
  let peakInFlight = 0;
  let gateOpen = false;
  // Placeholder until the promise executor below installs the real resolver.
  let signalThresholdReached: () => void = () => {};
  const thresholdReached = new Promise<void>((resolve) => {
    signalThresholdReached = resolve;
  });
  // Resolvers of tasks that have started but are still held at the gate (FIFO).
  const parked: Array<() => void> = [];

  /** Wakes every parked task in arrival order, but only once the gate is open. */
  function drainIfOpen(): void {
    if (gateOpen) {
      for (let wake = parked.shift(); wake; wake = parked.shift()) {
        wake();
      }
    }
  }

  return {
    /** Highest number of tasks that were in flight at the same time. */
    get maxActive() {
      return peakInFlight;
    },

    /**
     * Counts the task as in flight, holds it until the gate is open, then
     * runs `work` and returns (or rethrows) its outcome.
     */
    async run<T>(work: () => T | Promise<T>): Promise<T> {
      inFlight += 1;
      if (inFlight > peakInFlight) {
        peakInFlight = inFlight;
      }
      if (inFlight >= expectedActive) {
        signalThresholdReached();
      }
      await new Promise<void>((wake) => {
        parked.push(wake);
        // If the gate is already open this wakes the task immediately.
        drainIfOpen();
      });
      try {
        return await work();
      } finally {
        // Decrement even when `work` throws so the count stays accurate.
        inFlight -= 1;
      }
    },

    /** Resolves once at least `expectedActive` tasks have been in flight together. */
    async waitForExpectedActive(): Promise<void> {
      await thresholdReached;
    },

    /** Opens the gate: wakes all parked tasks and lets later ones pass straight through. */
    releaseStartedTasks(): void {
      gateOpen = true;
      drainIfOpen();
    },
  };
}
function makeSuiteResult(params: { outputDir: string; model: string; transcript: string }) {
return {
outputDir: params.outputDir,
@@ -265,22 +314,17 @@ describe("runQaCharacterEval", () => {
});
it("runs candidate models with bounded concurrency while preserving result order", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeReplySuiteResult(params);
});
const runGate = createConcurrencyGate(2);
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
runGate.run(() => makeReplySuiteResult(params)),
);
const runJudge = makeRunJudge([
{ model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" },
{ model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
{ model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
]);
const result = await runQaCharacterEval({
const resultPromise = runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: ["openai/gpt-5.5", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
@@ -290,7 +334,10 @@ describe("runQaCharacterEval", () => {
runJudge,
});
expect(maxActiveRuns).toBe(2);
await runGate.waitForExpectedActive();
expect(runGate.maxActive).toBe(2);
runGate.releaseStartedTasks();
const result = await resultPromise;
expect(result.runs.map((run) => run.model)).toEqual([
"openai/gpt-5.5",
"anthropic/claude-sonnet-4-6",
@@ -299,33 +346,25 @@ describe("runQaCharacterEval", () => {
});
it("defaults candidate and judge concurrency to sixteen", async () => {
let activeRuns = 0;
let maxActiveRuns = 0;
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
activeRuns += 1;
maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
await new Promise((resolve) => setTimeout(resolve, 10));
activeRuns -= 1;
return makeReplySuiteResult(params);
});
let activeJudges = 0;
let maxActiveJudges = 0;
const runGate = createConcurrencyGate(16);
const judgeGate = createConcurrencyGate(16);
const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
runGate.run(() => makeReplySuiteResult(params)),
);
const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
activeJudges += 1;
maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
await new Promise((resolve) => setTimeout(resolve, 10));
activeJudges -= 1;
return makeJudgeReply(
Array.from({ length: 20 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
summary: "ok",
})),
return await judgeGate.run(() =>
makeJudgeReply(
Array.from({ length: 20 }, (_, index) => ({
model: `provider/model-${index + 1}`,
rank: index + 1,
score: 10 - index,
summary: "ok",
})),
),
);
});
await runQaCharacterEval({
const resultPromise = runQaCharacterEval({
repoRoot: tempRoot,
outputDir: path.join(tempRoot, "character"),
models: Array.from({ length: 20 }, (_, index) => `provider/model-${index + 1}`),
@@ -334,8 +373,13 @@ describe("runQaCharacterEval", () => {
runJudge,
});
expect(maxActiveRuns).toBe(16);
expect(maxActiveJudges).toBe(16);
await runGate.waitForExpectedActive();
expect(runGate.maxActive).toBe(16);
runGate.releaseStartedTasks();
await judgeGate.waitForExpectedActive();
expect(judgeGate.maxActive).toBe(16);
judgeGate.releaseStartedTasks();
await resultPromise;
});
it("marks raw provider error transcripts as failed output", async () => {