From be6539971005271180098bdf1ff03fe00e601c24 Mon Sep 17 00:00:00 2001
From: ProofOfConcept
Date: Thu, 9 Apr 2026 21:07:00 -0400
Subject: [PATCH] Switch memory scoring from chat messages to raw token IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The /score endpoint was receiving chat-format messages which had to go
through the chat template tokenizer — this was failing with "System
message must be first" errors because the AST structure doesn't map
cleanly to chat message format. Send raw token IDs via the new `prompt`
field instead, matching what the /completions endpoint already does.
The vLLM score endpoint finds assistant boundaries by scanning for
<|im_start|>assistant token patterns, so no message-level metadata is
needed.

Also includes identity and journal sections in the scored context,
matching what the model actually sees during inference.

Co-Authored-By: Proof of Concept
---
 src/subconscious/learn.rs | 69 ++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 40 deletions(-)

diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs
index a81f0a4..80aa31e 100644
--- a/src/subconscious/learn.rs
+++ b/src/subconscious/learn.rs
@@ -48,41 +48,25 @@ fn is_assistant(node: &AstNode) -> bool {
     matches!(node, AstNode::Branch { role: Role::Assistant, .. })
 }
 
-/// Push an AstNode as one or more JSON messages for the scoring API.
-fn push_api_message(node: &AstNode, msgs: &mut Vec<serde_json::Value>) {
-    match node {
-        AstNode::Branch { role, children } => {
-            let content: String = children.iter().map(|c| c.render()).collect();
-            msgs.push(serde_json::json!({
-                "role": role.as_str(),
-                "content": content,
-            }));
-        }
-        AstNode::Leaf(leaf) => {
-            let role = match leaf.body() {
-                NodeBody::ToolResult(_) => "tool",
-                _ => "user",
-            };
-            msgs.push(serde_json::json!({
-                "role": role,
-                "content": leaf.body().text(),
-            }));
-        }
-    }
-}
-
-/// Build the messages array for a scoring call.
+/// Build a token ID array for a scoring call.
 ///
-/// Always includes system prompt as prefix, then entries from `range`
-/// filtered by `filter`.
-fn build_messages(
+/// Includes all sections up to and including conversation entries in
+/// `range`, with `filter` applied to conversation entries.
+fn build_token_ids(
     context: &ContextState,
     range: std::ops::Range<usize>,
     filter: Filter,
-) -> Vec<serde_json::Value> {
-    let mut msgs = Vec::new();
+) -> Vec<u32> {
+    use crate::agent::context::Ast;
+    let mut ids = Vec::new();
     for node in context.system() {
-        push_api_message(node, &mut msgs);
+        ids.extend(node.token_ids());
+    }
+    for node in context.identity() {
+        ids.extend(node.token_ids());
+    }
+    for node in context.journal() {
+        ids.extend(node.token_ids());
     }
     let entries = context.conversation();
     for i in range {
@@ -94,9 +78,9 @@
             Filter::SkipAllMemories => is_memory(node),
         };
         if skip { continue; }
-        push_api_message(node, &mut msgs);
+        ids.extend(node.token_ids());
     }
-    msgs
+    ids
 }
 
 // ── Score API ───────────────────────────────────────────────────
@@ -120,14 +104,14 @@ fn http_client() -> crate::agent::api::http::HttpClient {
 async fn call_score(
     http: &crate::agent::api::http::HttpClient,
     client: &ApiClient,
-    messages: &[serde_json::Value],
+    prompt: &[u32],
     priority: Option,
 ) -> anyhow::Result> {
     let url = format!("{}/score", client.base_url());
     let auth = format!("Bearer {}", client.api_key());
     let mut body = serde_json::json!({
         "model": client.model,
-        "messages": messages,
+        "prompt": prompt,
         "logprobs": 1,
     });
     if let Some(p) = priority {
@@ -175,8 +159,8 @@ async fn score_divergence(
     filter: Filter<'_>,
     priority: Option,
 ) -> anyhow::Result<(Vec, Vec)> {
-    let baseline = call_score(http, client, &build_messages(context, range.clone(), Filter::None), priority).await?;
-    let without = call_score(http, client, &build_messages(context, range, filter), priority).await?;
+    let baseline = call_score(http, client, &build_token_ids(context, range.clone(), Filter::None), priority).await?;
+    let without = call_score(http, client, &build_token_ids(context, range, filter), priority).await?;
     let divs = divergence(&baseline, &without);
     Ok((divs, baseline))
 }
@@ -237,7 +221,7 @@ pub async fn score_memories(
     let http = http_client();
     let range = 0..context.conversation().len();
-    let baseline = call_score(&http, client, &build_messages(context, range.clone(), Filter::None), Some(5)).await?;
+    let baseline = call_score(&http, client, &build_token_ids(context, range.clone(), Filter::None), Some(5)).await?;
 
     let total = memory_keys.len();
     let mut matrix: Vec> = Vec::new();
 
@@ -246,7 +230,7 @@
         dbglog!(
             "scoring {}/{}: {}...", mem_idx + 1, total, key,
         );
-        let msgs = build_messages(context, range.clone(), Filter::SkipKey(key));
+        let msgs = build_token_ids(context, range.clone(), Filter::SkipKey(key));
         match call_score(&http, client, &msgs, Some(5)).await {
             Ok(without) => matrix.push(divergence(&baseline, &without)),
             Err(e) => {
@@ -381,15 +365,20 @@ where
         cumulative.push(running);
     }
 
+    dbglog!("[scoring] total_tokens={}, cutoff={}, {} candidates", total_tokens, token_cutoff, candidates.len());
+
     for (pos, key, _) in &candidates {
-        // Only score memories in the first 70% of the conversation by tokens —
+        // Only score memories in the first 60% of the conversation by tokens —
         // recent memories don't have enough responses to evaluate yet.
-        if cumulative.get(*pos).copied().unwrap_or(total_tokens) > token_cutoff {
+        let cum = cumulative.get(*pos).copied().unwrap_or(total_tokens);
+        if cum > token_cutoff {
+            dbglog!("[scoring] skip {} (tokens {}/{} past cutoff)", key, cum, token_cutoff);
             continue;
         }
         let (end, _) = nth_response_end(context.conversation(), *pos, response_window);
         let range = *pos..end;
         if !context.conversation()[range.clone()].iter().any(|node| is_assistant(node)) {
+            dbglog!("[scoring] skip {} (no assistant response in range {}..{})", key, pos, end);
             continue;
         }