test(qa): expand smoke-ci and release categories and coverage (#93175)

* test(qa): add smoke ci primary coverage evidence

* test(qa): remove overstated primary coverage claims

* test(qa): make release profile include smoke ci

* test(qa): trim taxonomy formatting churn

* test(qa): avoid hardcoded profile names in coverage test

* test(qa): make release profile cover taxonomy

* test(qa): type profile fixture all category flag

* test(qa): include channel delivery in smoke ci profile
This commit is contained in:
Dallin Romney
2026-06-15 18:05:52 -07:00
committed by GitHub
parent 6bc57ca73a
commit 450060d7a2
8 changed files with 53 additions and 39 deletions

View File

@@ -18,6 +18,7 @@ const TEST_WEBCHAT_COVERAGE_ID = "ui.webchat";
function testMaturityTaxonomy(params?: {
categoryId?: string;
coverageIds?: readonly string[];
includeAllCategories?: boolean;
profileCategoryIds?: readonly string[];
}) {
const categoryId = params?.categoryId ?? TEST_EXECUTABLE_CATEGORY_ID;
@@ -31,12 +32,16 @@ function testMaturityTaxonomy(params?: {
{
id: "smoke-ci",
description: "Test smoke profile.",
includeAllCategories: false,
categoryIds: [],
},
{
id: "release",
description: "Test release profile.",
categoryIds: [...(params?.profileCategoryIds ?? [categoryId])],
includeAllCategories: params?.includeAllCategories ?? false,
categoryIds: [
...(params?.includeAllCategories ? [] : (params?.profileCategoryIds ?? [categoryId])),
],
},
],
surfaces: [
@@ -115,7 +120,10 @@ describe("qa coverage report", () => {
]);
expect(inventory.scorecardTaxonomy.profileCount).toBe(2);
expect(inventory.scorecardTaxonomy.categoryCount).toBeGreaterThan(200);
expect(inventory.scorecardTaxonomy.requiredCategoryCount).toBe(15);
expect(inventory.scorecardTaxonomy.requiredCategoryCount).toBeGreaterThan(0);
expect(inventory.scorecardTaxonomy.requiredCategoryCount).toBeLessThanOrEqual(
inventory.scorecardTaxonomy.categoryCount,
);
expect(inventory.scorecardTaxonomy.requiredFeatureCount).toBeGreaterThan(0);
expect(inventory.scorecardTaxonomy.fulfilledFeatureCount).toBeGreaterThan(0);
expect(inventory.scorecardTaxonomy.taxonomyFulfillmentPercent).toBeGreaterThan(0);
@@ -124,30 +132,15 @@ describe("qa coverage report", () => {
expect(inventory.scorecardTaxonomy.unknownCoverageIdCount).toBe(0);
expect(inventory.scorecardTaxonomy.validationIssues.length).toBeGreaterThan(0);
expect(
inventory.scorecardTaxonomy.validationIssues.every(
inventory.scorecardTaxonomy.validationIssues.some((issue) =>
issue.code.endsWith("not-found"),
),
).toBe(false);
expect(
inventory.scorecardTaxonomy.validationIssues.some(
(issue) => issue.code === "coverage-id-missing-primary-evidence",
),
).toBe(true);
expect(
inventory.scorecardTaxonomy.profiles
.find((profile) => profile.id === "release")
?.categoryIds.toSorted(),
).toEqual([
"agent-runtime-and-provider-execution.agent-turn-execution",
"automation-cron-hooks-tasks-polling.cron-jobs",
"browser-automation-and-exec-sandbox-tools.tool-invocation-and-execution",
"browser-control-ui-and-webchat.browser-ui",
"media-understanding-and-media-generation.media-generation",
"media-understanding-and-media-generation.media-understanding",
"openai-codex-provider-path.responses-and-tool-compatibility",
"plugin-sdk-and-bundled-plugin-architecture.installing-and-running-plugins",
"security-auth-pairing-and-secrets.approval-policy-and-tool-safeguards",
"security-auth-pairing-and-secrets.credential-and-secret-hygiene",
"session-memory-and-context-engine.diagnostics-maintenance-and-recovery",
"session-memory-and-context-engine.memory",
"session-memory-and-context-engine.token-management",
"telemetry-diagnostics-and-observability.telemetry-export",
]);
expect(
inventory.scorecardTaxonomy.categories.find(
(category) => category.id === TEST_BROWSER_CATEGORY_ID,
@@ -349,6 +342,21 @@ describe("qa coverage report", () => {
);
});
// The fixture's release profile is built with includeAllCategories set and an
// empty categoryIds list, so the category ids asserted below have to come from
// the taxonomy's own categories rather than from the profile definition.
it("resolves all-category profiles from taxonomy categories", () => {
  const taxonomy = testMaturityTaxonomy({ includeAllCategories: true });
  const report = buildQaScorecardTaxonomyReport({
    taxonomy,
    repoRoot: process.cwd(),
    scenarios: [],
  });
  const releaseProfile = report.profiles.find(({ id }) => id === "release");

  expect(releaseProfile?.categoryIds).toStrictEqual([TEST_EXECUTABLE_CATEGORY_ID]);
  expect(report.requiredCategoryCount).toBe(1);
});
it("reports profile categories missing primary coverage evidence", () => {
const report = buildQaScorecardTaxonomyReport({
taxonomy: testMaturityTaxonomy(),

View File

@@ -25,6 +25,7 @@ const qaScorecardProfileSchema = z.object({
id: qaScorecardIdSchema,
description: z.string().trim().min(1),
evidenceMode: qaScorecardEvidenceModeSchema.optional(),
includeAllCategories: z.boolean().default(false),
categoryIds: z.array(qaScorecardIdSchema).default([]),
});
@@ -67,6 +68,14 @@ const qaMaturityTaxonomySchema = z
}
seenProfileIds.add(profile.id);
if (profile.includeAllCategories && profile.categoryIds.length > 0) {
ctx.addIssue({
code: z.ZodIssueCode.custom,
path: ["profiles", profileIndex, "categoryIds"],
message: `profile ${profile.id} cannot set categoryIds when includeAllCategories is true`,
});
}
const seenProfileCategoryIds = new Set<string>();
for (const [categoryIndex, categoryId] of profile.categoryIds.entries()) {
if (seenProfileCategoryIds.has(categoryId)) {
@@ -466,7 +475,10 @@ export function buildQaScorecardTaxonomyReport(params: {
const profiles =
params.taxonomy?.profiles.map((profile) => {
const validCategoryIds: string[] = [];
for (const categoryId of profile.categoryIds) {
const selectedCategoryIds = profile.includeAllCategories
? [...maturityRefs.categories.keys()]
: profile.categoryIds;
for (const categoryId of selectedCategoryIds) {
if (!maturityRefs.categories.has(categoryId)) {
issues.push({
code: "profile-category-ref-not-found",

View File

@@ -6,6 +6,7 @@ scenario:
coverage:
primary:
- channels.threads
- thread-parent-child-placement
secondary:
- channels.qa-channel
objective: Verify the agent can keep follow-up work inside a thread and not leak context into the root channel.

View File

@@ -6,6 +6,7 @@ scenario:
coverage:
primary:
- media.image-generation
- generated-image-persistence-and-delivery
secondary:
- channels.qa-channel
objective: Verify a generated image is saved as media, reattached on the next turn, and described correctly through the vision path.

View File

@@ -6,6 +6,7 @@ scenario:
coverage:
primary:
- telemetry.prometheus
- gateway-authenticated-get-api-diagnostics-prometheus
secondary:
- harness.qa-lab
- docker.e2e

View File

@@ -6,6 +6,8 @@ scenario:
coverage:
primary:
- scheduling.cron
- cron-rpcs
- chat-announce-delivery
secondary:
- channels.qa-channel
objective: Verify the agent can schedule a cron reminder one minute in the future and receive the follow-up in the QA channel.

View File

@@ -6,6 +6,7 @@ scenario:
coverage:
primary:
- ui.control
- dashboard-open-auth-bootstrap
secondary:
- media.image-understanding
- channels.qa-channel

View File

@@ -17,6 +17,8 @@ profiles:
- security-auth-pairing-and-secrets.approval-policy-and-tool-safeguards
- telemetry-diagnostics-and-observability.telemetry-export
- channel-framework.conversation-routing-and-delivery
- channel-framework.outbound-delivery-and-reply-pipeline
- channel-framework.group-thread-and-ambient-room-behavior
- session-memory-and-context-engine.memory
- session-memory-and-context-engine.diagnostics-maintenance-and-recovery
- automation-cron-hooks-tasks-polling.cron-jobs
@@ -29,21 +31,7 @@ profiles:
description: Stable/LTS proof selector for live providers, live channels, package artifacts,
upgrade paths, and platform proof where the claim depends on real upstreams or release
artifacts.
categoryIds:
- agent-runtime-and-provider-execution.agent-turn-execution
- session-memory-and-context-engine.token-management
- browser-automation-and-exec-sandbox-tools.tool-invocation-and-execution
- security-auth-pairing-and-secrets.approval-policy-and-tool-safeguards
- telemetry-diagnostics-and-observability.telemetry-export
- openai-codex-provider-path.responses-and-tool-compatibility
- session-memory-and-context-engine.memory
- session-memory-and-context-engine.diagnostics-maintenance-and-recovery
- automation-cron-hooks-tasks-polling.cron-jobs
- plugin-sdk-and-bundled-plugin-architecture.installing-and-running-plugins
- media-understanding-and-media-generation.media-understanding
- media-understanding-and-media-generation.media-generation
- browser-control-ui-and-webchat.browser-ui
- security-auth-pairing-and-secrets.credential-and-secret-hygiene
includeAllCategories: true
levels:
- id: planned
code: M0