diff --git a/extensions/qa-lab/src/character-eval.test.ts b/extensions/qa-lab/src/character-eval.test.ts
index 8dd8277f7d4..b8bc25e3634 100644
--- a/extensions/qa-lab/src/character-eval.test.ts
+++ b/extensions/qa-lab/src/character-eval.test.ts
@@ -40,6 +40,55 @@ function makeRunSuite(transcriptForModel: (model: string) => string = defaultMod
   );
 }
 
+function createConcurrencyGate(expectedActive: number) {
+  let active = 0;
+  let maxActive = 0;
+  let releaseStartedTasks = false;
+  let resolveExpectedActive: () => void = () => {};
+  const expectedActiveReached = new Promise<void>((resolve) => {
+    resolveExpectedActive = resolve;
+  });
+  const taskReleases: Array<() => void> = [];
+  const releaseQueuedTasks = () => {
+    if (!releaseStartedTasks) {
+      return;
+    }
+    let releaseTask: (() => void) | undefined;
+    while ((releaseTask = taskReleases.shift())) {
+      releaseTask();
+    }
+  };
+
+  return {
+    get maxActive() {
+      return maxActive;
+    },
+    async run<T>(work: () => T | Promise<T>): Promise<T> {
+      active += 1;
+      maxActive = Math.max(maxActive, active);
+      if (active >= expectedActive) {
+        resolveExpectedActive();
+      }
+      await new Promise<void>((resolve) => {
+        taskReleases.push(resolve);
+        releaseQueuedTasks();
+      });
+      try {
+        return await work();
+      } finally {
+        active -= 1;
+      }
+    },
+    async waitForExpectedActive(): Promise<void> {
+      await expectedActiveReached;
+    },
+    releaseStartedTasks(): void {
+      releaseStartedTasks = true;
+      releaseQueuedTasks();
+    },
+  };
+}
+
 function makeSuiteResult(params: { outputDir: string; model: string; transcript: string }) {
   return {
     outputDir: params.outputDir,
@@ -265,22 +314,17 @@ describe("runQaCharacterEval", () => {
   });
 
   it("runs candidate models with bounded concurrency while preserving result order", async () => {
-    let activeRuns = 0;
-    let maxActiveRuns = 0;
-    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
-      activeRuns += 1;
-      maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
-      await new Promise((resolve) => setTimeout(resolve, 10));
-      activeRuns -= 1;
-      return makeReplySuiteResult(params);
-    });
+    const runGate = createConcurrencyGate(2);
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      runGate.run(() => makeReplySuiteResult(params)),
+    );
     const runJudge = makeRunJudge([
       { model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" },
       { model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" },
       { model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" },
     ]);
 
-    const result = await runQaCharacterEval({
+    const resultPromise = runQaCharacterEval({
       repoRoot: tempRoot,
       outputDir: path.join(tempRoot, "character"),
       models: ["openai/gpt-5.5", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"],
@@ -290,7 +334,10 @@ describe("runQaCharacterEval", () => {
       runJudge,
     });
 
-    expect(maxActiveRuns).toBe(2);
+    await runGate.waitForExpectedActive();
+    expect(runGate.maxActive).toBe(2);
+    runGate.releaseStartedTasks();
+    const result = await resultPromise;
     expect(result.runs.map((run) => run.model)).toEqual([
       "openai/gpt-5.5",
       "anthropic/claude-sonnet-4-6",
@@ -299,33 +346,25 @@ describe("runQaCharacterEval", () => {
   });
 
   it("defaults candidate and judge concurrency to sixteen", async () => {
-    let activeRuns = 0;
-    let maxActiveRuns = 0;
-    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) => {
-      activeRuns += 1;
-      maxActiveRuns = Math.max(maxActiveRuns, activeRuns);
-      await new Promise((resolve) => setTimeout(resolve, 10));
-      activeRuns -= 1;
-      return makeReplySuiteResult(params);
-    });
-    let activeJudges = 0;
-    let maxActiveJudges = 0;
+    const runGate = createConcurrencyGate(16);
+    const judgeGate = createConcurrencyGate(16);
+    const runSuite = vi.fn(async (params: CharacterRunSuiteParams) =>
+      runGate.run(() => makeReplySuiteResult(params)),
+    );
     const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => {
-      activeJudges += 1;
-      maxActiveJudges = Math.max(maxActiveJudges, activeJudges);
-      await new Promise((resolve) => setTimeout(resolve, 10));
-      activeJudges -= 1;
-      return makeJudgeReply(
-        Array.from({ length: 20 }, (_, index) => ({
-          model: `provider/model-${index + 1}`,
-          rank: index + 1,
-          score: 10 - index,
-          summary: "ok",
-        })),
+      return await judgeGate.run(() =>
+        makeJudgeReply(
+          Array.from({ length: 20 }, (_, index) => ({
+            model: `provider/model-${index + 1}`,
+            rank: index + 1,
+            score: 10 - index,
+            summary: "ok",
+          })),
+        ),
       );
     });
 
-    await runQaCharacterEval({
+    const resultPromise = runQaCharacterEval({
       repoRoot: tempRoot,
       outputDir: path.join(tempRoot, "character"),
       models: Array.from({ length: 20 }, (_, index) => `provider/model-${index + 1}`),
@@ -334,8 +373,13 @@ describe("runQaCharacterEval", () => {
       runJudge,
     });
 
-    expect(maxActiveRuns).toBe(16);
-    expect(maxActiveJudges).toBe(16);
+    await runGate.waitForExpectedActive();
+    expect(runGate.maxActive).toBe(16);
+    runGate.releaseStartedTasks();
+    await judgeGate.waitForExpectedActive();
+    expect(judgeGate.maxActive).toBe(16);
+    judgeGate.releaseStartedTasks();
+    await resultPromise;
   });
 
   it("marks raw provider error transcripts as failed output", async () => {