salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug-aid: lets us see what the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-24 20:26:47 -04:00 · 2026-04-24 20:26:47 -04:00 · fe232cf292
commit fe232cf292
parent 4feebb7bc4
12 changed files with 468 additions and 306 deletions
--- a/src/subconscious/generate.rs
+++ b/src/subconscious/generate.rs
@@ -26,7 +26,7 @@ pub async fn gen_continuation<F>(
 ) -> anyhow::Result<String>
 where F: FnMut(&AstNode) -> bool,
 {
-    let mut chunks = context.wire_chunks(0..entry_idx, skip);
+    let (mut chunks, images) = context.wire_chunks(0..entry_idx, skip);

    // Assistant-turn prologue.
    let prologue = {
@@ -50,19 +50,13 @@ where F: FnMut(&AstNode) -> bool,
    // `_guard` drops at function end.
    let session_lock = Arc::new(crate::Mutex::new(None));
    let (mut rx, _guard) = client.stream_session_mm(
-        session_lock, chunks, sampling, Some(-5), None,
+        session_lock, chunks, images, 0, sampling, Some(-5), None,
    );

    let mut tokens = Vec::new();
    while let Some(tok) = rx.recv().await {
        match tok {
            StreamToken::Token { id, .. } => tokens.push(id),
-            StreamToken::ImageAppended { .. } => {
-                // subconscious/generate uses wire_chunks over an AST
-                // slice that shouldn't have unsized images — but if
-                // it ever does, we just don't care about updating the
-                // ephemeral session's AST view.
-            }
            StreamToken::Done { .. } => break,
            StreamToken::Error(e) => anyhow::bail!("generation error: {}", e),
        }
--- a/src/subconscious/learn.rs
+++ b/src/subconscious/learn.rs
@@ -40,14 +40,15 @@ struct ScoreResult {
    total_logprob: f64,
 }

-/// Convert a flat (prompt_tokens, images) pair into the interleaved
-/// chunks the session protocol expects. Tokens up to the next
-/// `<|vision_start|>` become a Tokens chunk; each
-/// `<|vision_start|>..<|vision_end|>` run collapses into one Image
-/// chunk paired by position with the next entry in `images`. The
-/// server re-expands the IMAGE_PADs on AppendImage.
-fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
-    let mut out: Vec<WireChunk> = Vec::new();
+/// Find each <|vision_start|>...<|vision_end|> run in the flat prompt
+/// and pair it with the matching entry in `images`. Returns a list
+/// of `ImageAttachment` with absolute pad-range positions, ready
+/// to drop into `GenerateRequest.images`.
+fn pair_images_to_ranges(
+    prompt: &[u32],
+    images: &[WireImage],
+) -> Vec<pb::ImageAttachment> {
+    let mut out: Vec<pb::ImageAttachment> = Vec::new();
    let mut cur = 0;
    let mut img_idx = 0;
    while cur < prompt.len() {
@@ -60,22 +61,16 @@ fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
            let img = images.get(img_idx)
                .unwrap_or_else(|| panic!(
                    "image index {} out of range for {} images", img_idx, images.len()));
-            out.push(WireChunk::Image {
+            out.push(pb::ImageAttachment {
                bytes: img.bytes.clone(),
                mime: img.mime.clone(),
-                known_expanded_len: (end - cur) as u32,
+                pad_range_start: cur as u32,
+                pad_range_end: end as u32,
            });
            img_idx += 1;
            cur = end;
        } else {
-            let next_vs = prompt[cur..].iter()
-                .position(|&t| t == tokenizer::VISION_START);
-            let end = match next_vs {
-                Some(o) => cur + o,
-                None => prompt.len(),
-            };
-            out.push(WireChunk::Tokens(prompt[cur..end].to_vec()));
-            cur = end;
+            cur += 1;
        }
    }
    out
@@ -95,36 +90,22 @@ async fn call_score(
        return Ok(Vec::new());
    }

-    let chunks = prompt_to_chunks(prompt, images);
+    let images_pb = pair_images_to_ranges(prompt, images);
    let mut handle = SessionHandle::open(client).await?;

-    // Walk chunks: AppendImage for each image, prefill-only Generate
-    // for each text run between images. Accumulate any trailing text
-    // run into `pending` for the final logprob-generating Generate.
-    let mut pending: Vec<u32> = Vec::new();
-    for chunk in chunks {
-        match chunk {
-            WireChunk::Tokens(t) => pending.extend(t),
-            WireChunk::Image { bytes, mime, .. } => {
-                if !pending.is_empty() {
-                    handle.prefill_only(std::mem::take(&mut pending)).await?;
-                }
-                handle.append_image(bytes, mime, false).await?;
-            }
-        }
-    }
-
    // Final Generate: max_tokens=0 so the server runs prefill of the
-    // trailing `pending` tokens and emits Token events for each
-    // position covered by logprobs_ranges, then Done. logprob_top_k=0
-    // means "just the sampled (prompt) token's logprob" — no top-k
-    // alternatives, which is all call_score historically needed.
+    // full prompt and emits Token events for each position covered
+    // by logprobs_ranges, then Done. logprob_top_k=0 means "just
+    // the sampled (prompt) token's logprob" — no top-k alternatives,
+    // which is all call_score historically needed. Images attach
+    // inline via `images`; the prompt already contains their pre-
+    // expanded vision blocks at the declared ranges.
    let logprobs_ranges: Vec<pb::PositionRange> = ranges.iter()
        .map(|(s, e)| pb::PositionRange { start: *s as u32, end: *e as u32 })
        .collect();
    let req = pb::GenerateRequest {
        session_id: handle.session_id.clone(),
-        append_tokens: pending,
+        append_tokens: prompt.to_vec(),
        offset: handle.committed_len,
        truncating: false,
        max_tokens: 0,
@@ -136,6 +117,7 @@ async fn call_score(
        top_k: 0,
        stop_token_ids: Vec::new(),
        priority: priority.unwrap_or(0),
+        images: images_pb,
    };

    let mut stream = handle.generate(req).await?;