fix(heartbeat): align response tool prompts (#76458)

* fix(heartbeat): align response tool prompts

* docs(changelog): credit heartbeat prompt fix
This commit is contained in:
Vincent Koc
2026-05-03 07:19:56 -07:00
committed by GitHub
parent 103b6d50a5
commit 877eb1cbed
14 changed files with 88 additions and 28 deletions

View File

@@ -11,7 +11,7 @@ coverage:
- runtime.no-meta-leak
objective: Verify the Codex app-server harness keeps coordination/meta chatter out of the visible reply.
successCriteria:
- The scenario forces the Codex embedded harness and disables PI fallback.
- The scenario forces the Codex embedded harness.
- The final visible reply includes the requested confirmation token.
- The visible reply does not include internal coordination or progress chatter.
docsRefs:
@@ -29,7 +29,6 @@ execution:
requiredProvider: codex
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
expectedReply: QA_LEAK_OK
prompt: |-
Think through your answer privately, but do not expose any internal planning, thread-context checks, or progress narration.
@@ -76,8 +75,6 @@ steps:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
- call: waitForGatewayHealthy
args:
- ref: env
@@ -94,11 +91,7 @@ steps:
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
message:
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
- assert:
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
message:
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`"
- name: keeps codex coordination chatter out of the visible reply
actions:
- if:

View File

@@ -12,7 +12,7 @@ coverage:
objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game.
successCriteria:
- A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced.
- The scenario forces the Codex embedded harness and disables PI fallback.
- The scenario forces the Codex embedded harness.
- The prompt explicitly asks the agent to enter plan mode before editing.
- The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart.
docsRefs:
@@ -30,7 +30,6 @@ execution:
requiredProvider: codex
requiredModel: gpt-5.5
harnessRuntime: codex
harnessFallback: none
artifactFile: star-garden-defenders-codex.html
gameTitle: Star Garden Defenders
minBytes: 5000
@@ -81,8 +80,6 @@ steps:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
- call: waitForGatewayHealthy
args:
- ref: env
@@ -99,11 +96,7 @@ steps:
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
message:
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
- assert:
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
message:
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`"
- name: builds the medium game artifact
actions:
- if:

View File

@@ -30,7 +30,6 @@ execution:
requiredProvider: openai
requiredModel: gpt-5.5
harnessRuntime: pi
harnessFallback: pi
artifactFile: star-garden-defenders-pi.html
gameTitle: Star Garden Defenders
minBytes: 5000
@@ -81,8 +80,6 @@ steps:
agentRuntime:
id:
expr: config.harnessRuntime
fallback:
expr: config.harnessFallback
- call: waitForGatewayHealthy
args:
- ref: env