fix: stabilize qa lab memory and thinking scenarios

2026-05-06 07:40:44 +00:00 · 2026-04-25 07:49:52 +01:00
parent b34ece705f
commit d068cb960d
5 changed files with 30 additions and 65 deletions
--- a/extensions/active-memory/index.test.ts
+++ b/extensions/active-memory/index.test.ts
@@ -654,6 +654,9 @@ describe("active-memory plugin", () => {
      "You receive conversation context, including the user's latest message.",
    );
    expect(runParams?.prompt).toContain("Use only memory_search and memory_get.");
+    expect(runParams?.prompt).toContain(
+      "When searching for preference or habit recall, use a permissive memory_search threshold before deciding that no useful memory exists.",
+    );
    expect(runParams?.prompt).toContain(
      "If the user is directly asking about favorites, preferences, habits, routines, or personal facts, treat that as a strong recall signal.",
    );
--- a/extensions/active-memory/index.ts
+++ b/extensions/active-memory/index.ts
@@ -787,6 +787,7 @@ function buildRecallPrompt(params: {
    "Your job is to search memory and return only the most relevant memory context for that model.",
    "You receive conversation context, including the user's latest message.",
    "Use only memory_search and memory_get.",
+    "When searching for preference or habit recall, use a permissive memory_search threshold before deciding that no useful memory exists.",
    "Do not answer the user directly.",
    `Prompt style: ${params.config.promptStyle}.`,
    ...buildPromptStyleLines(params.config.promptStyle),
--- a/qa/scenarios/memory/active-memory-preprompt-recall.md
+++ b/qa/scenarios/memory/active-memory-preprompt-recall.md
@@ -45,7 +45,7 @@ execution:
  config:
    baselineConversationId: qa-active-memory-off
    activeConversationId: qa-active-memory-on
-    memoryFact: "Stable QA movie night snack preference: lemon pepper wings with blue cheese."
+    memoryFact: "Stable QA movie night usual favorite snack preference: lemon pepper wings with blue cheese."
    memoryQuery: "QA movie night snack lemon pepper wings blue cheese"
    expectedNeedle: lemon pepper wings
    prompt: "Silent snack recall check: what snack do I usually want for QA movie night? Reply in one short sentence."
--- a/qa/scenarios/memory/session-memory-ranking.md
+++ b/qa/scenarios/memory/session-memory-ranking.md
@@ -30,7 +30,7 @@ execution:
    transcriptId: qa-session-memory-ranking
    transcriptQuestion: "What is the current Project Nebula codename?"
    transcriptAnswer: "The current Project Nebula codename is ORBIT-10."
-    prompt: "Session memory ranking check: what is the current Project Nebula codename? Use memory tools first. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact."
+    prompt: "Session memory ranking check: what is the current Project Nebula codename? Use memory_search first with corpus=sessions for indexed session transcripts. If durable notes conflict with newer indexed session transcripts, prefer the newer current fact."
    promptSnippet: "Session memory ranking check"
 ```

--- a/qa/scenarios/models/thinking-slash-model-remap.md
+++ b/qa/scenarios/models/thinking-slash-model-remap.md
@@ -10,7 +10,15 @@ coverage:
  secondary:
    - models.switching
    - runtime.session-continuity
-objective: Verify /think lists provider-owned levels and remaps stored thinking levels when /model changes provider capabilities.
+objective: Verify /think lists provider-owned levels and remaps stored thinking levels when the session model changes provider capabilities.
+plugins:
+  - anthropic
+gatewayConfigPatch:
+  agents:
+    defaults:
+      models:
+        anthropic/claude-sonnet-4-6:
+          params: {}
 successCriteria:
  - Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max.
  - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4.
@@ -35,7 +43,8 @@ execution:
    anthropicModelRef: anthropic/claude-sonnet-4-6
    openAiXhighModelRef: openai/gpt-5.4
    noXhighModelRef: anthropic/claude-sonnet-4-6
-    conversationId: qa-thinking-slash-remap
+    conversationId: thinking-slash-remap
+    sessionKey: agent:qa:main
 ```

 ```yaml qa-flow
@@ -55,25 +64,9 @@ steps:
          expr: "env.providerMode === config.requiredProviderMode"
          message:
            expr: "`thinking remap scenario requires ${config.requiredProviderMode}; got ${env.providerMode}`"
-      - set: cursor
+      - set: anthropicModelAck
        value:
-          expr: state.getSnapshot().messages.length
-      - call: state.addInboundMessage
-        args:
-          - conversation:
-              id:
-                expr: config.conversationId
-              kind: direct
-            senderId: qa-operator
-            senderName: QA Operator
-            text:
-              expr: "`/model ${config.anthropicModelRef}`"
-      - call: waitForCondition
-        saveAs: anthropicModelAck
-        args:
-          - lambda:
-              expr: "state.getSnapshot().messages.slice(cursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(`Model set to ${config.anthropicModelRef}`)).at(-1)"
-          - expr: liveTurnTimeoutMs(env, 20000)
+          expr: "await env.gateway.call('sessions.patch', { key: config.sessionKey, model: config.anthropicModelRef }, { timeoutMs: liveTurnTimeoutMs(env, 45000) })"
      - set: cursor
        value:
          expr: state.getSnapshot().messages.length
@@ -100,7 +93,7 @@ steps:
          expr: "!/Options: .*\\bxhigh\\b/i.test(anthropicThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(anthropicThinkStatus.text)"
          message:
            expr: "`expected Sonnet /think options to omit xhigh/max, got ${anthropicThinkStatus.text}`"
-    detailsExpr: "`model=${anthropicModelAck.text}; think=${anthropicThinkStatus.text}`"
+    detailsExpr: "`model=${JSON.stringify(anthropicModelAck.resolved)}; think=${anthropicThinkStatus.text}`"
  - name: maps adaptive to medium when switching to OpenAI
    actions:
      - set: cursor
@@ -121,29 +114,13 @@ steps:
          - lambda:
              expr: "state.getSnapshot().messages.slice(cursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to adaptive/i.test(candidate.text)).at(-1)"
          - expr: liveTurnTimeoutMs(env, 20000)
-      - set: cursor
+      - set: openAiModelAck
        value:
-          expr: state.getSnapshot().messages.length
-      - call: state.addInboundMessage
-        args:
-          - conversation:
-              id:
-                expr: config.conversationId
-              kind: direct
-            senderId: qa-operator
-            senderName: QA Operator
-            text:
-              expr: "`/model ${config.openAiXhighModelRef}`"
-      - call: waitForCondition
-        saveAs: openAiModelAck
-        args:
-          - lambda:
-              expr: "state.getSnapshot().messages.slice(cursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.openAiXhighModelRef) && /Model (set to|reset to default)/i.test(candidate.text)).at(-1)"
-          - expr: liveTurnTimeoutMs(env, 20000)
+          expr: "await env.gateway.call('sessions.patch', { key: config.sessionKey, model: config.openAiXhighModelRef }, { timeoutMs: liveTurnTimeoutMs(env, 45000) })"
      - assert:
-          expr: "/Thinking level set to medium \\(adaptive not supported for openai\\/gpt-5\\.4\\)/i.test(openAiModelAck.text)"
+          expr: "openAiModelAck.entry?.thinkingLevel === 'medium'"
          message:
-            expr: "`expected adaptive->medium remap, got ${openAiModelAck.text}`"
+            expr: "`expected adaptive->medium remap, got ${JSON.stringify(openAiModelAck.entry)}`"
      - set: cursor
        value:
          expr: state.getSnapshot().messages.length
@@ -166,7 +143,7 @@ steps:
          expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)"
          message:
            expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`"
-    detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${openAiModelAck.text}; think=${openAiThinkStatus.text}`"
+    detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${JSON.stringify(openAiModelAck.resolved)}; think=${openAiThinkStatus.text}`"
  - name: maps xhigh to high on a model without xhigh
    actions:
      - set: cursor
@@ -187,29 +164,13 @@ steps:
          - lambda:
              expr: "state.getSnapshot().messages.slice(cursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to xhigh/i.test(candidate.text)).at(-1)"
          - expr: liveTurnTimeoutMs(env, 20000)
-      - set: cursor
+      - set: noXhighModelAck
        value:
-          expr: state.getSnapshot().messages.length
-      - call: state.addInboundMessage
-        args:
-          - conversation:
-              id:
-                expr: config.conversationId
-              kind: direct
-            senderId: qa-operator
-            senderName: QA Operator
-            text:
-              expr: "`/model ${config.noXhighModelRef}`"
-      - call: waitForCondition
-        saveAs: noXhighModelAck
-        args:
-          - lambda:
-              expr: "state.getSnapshot().messages.slice(cursor).filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && candidate.text.includes(config.noXhighModelRef) && /Model (set to|reset to default)/i.test(candidate.text)).at(-1)"
-          - expr: liveTurnTimeoutMs(env, 20000)
+          expr: "await env.gateway.call('sessions.patch', { key: config.sessionKey, model: config.noXhighModelRef }, { timeoutMs: liveTurnTimeoutMs(env, 45000) })"
      - assert:
-          expr: "/Thinking level set to high \\(xhigh not supported for anthropic\\/claude-sonnet-4-6\\)/i.test(noXhighModelAck.text)"
+          expr: "noXhighModelAck.entry?.thinkingLevel === 'high'"
          message:
-            expr: "`expected xhigh->high remap, got ${noXhighModelAck.text}`"
+            expr: "`expected xhigh->high remap, got ${JSON.stringify(noXhighModelAck.entry)}`"
      - set: cursor
        value:
          expr: state.getSnapshot().messages.length
@@ -232,5 +193,5 @@ steps:
          expr: "/Options: .*\\badaptive\\b/i.test(noXhighThinkStatus.text) && !/Options: .*\\bxhigh\\b/i.test(noXhighThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(noXhighThinkStatus.text)"
          message:
            expr: "`expected non-xhigh model /think options to include adaptive and omit xhigh/max, got ${noXhighThinkStatus.text}`"
-    detailsExpr: "`xhigh=${xhighAck.text}; switch=${noXhighModelAck.text}; think=${noXhighThinkStatus.text}`"
+    detailsExpr: "`xhigh=${xhighAck.text}; switch=${JSON.stringify(noXhighModelAck.resolved)}; think=${noXhighThinkStatus.text}`"
 ```