salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now
ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image
  leaf computes its IMAGE_PAD count eagerly at construction from
  height/width; `token_count` is no longer "0 until the server
  tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision
  blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`.
  `WireImage` carries `pad_start` / `pad_end` (absolute positions
  in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel
  images list, filter to those past `match_upto`, and pass them
  in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`,
  `ContextState::commit_image_token_counts`,
  `StreamToken::ImageAppended`, the WireChunk::Image branch in
  `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id
  vectors in place of text content (debug-aid: lets us see what
  the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this
  disabled before, but it had inadvertently been re-enabled.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-24 20:26:47 -04:00
commit fe232cf292
12 changed files with 468 additions and 306 deletions

View file

@ -26,7 +26,7 @@ pub async fn gen_continuation<F>(
) -> anyhow::Result<String>
where F: FnMut(&AstNode) -> bool,
{
let mut chunks = context.wire_chunks(0..entry_idx, skip);
let (mut chunks, images) = context.wire_chunks(0..entry_idx, skip);
// Assistant-turn prologue.
let prologue = {
@ -50,19 +50,13 @@ where F: FnMut(&AstNode) -> bool,
// `_guard` drops at function end.
let session_lock = Arc::new(crate::Mutex::new(None));
let (mut rx, _guard) = client.stream_session_mm(
session_lock, chunks, sampling, Some(-5), None,
session_lock, chunks, images, 0, sampling, Some(-5), None,
);
let mut tokens = Vec::new();
while let Some(tok) = rx.recv().await {
match tok {
StreamToken::Token { id, .. } => tokens.push(id),
StreamToken::ImageAppended { .. } => {
// subconscious/generate uses wire_chunks over an AST
// slice that shouldn't have unsized images — but if
// it ever does, we just don't care about updating the
// ephemeral session's AST view.
}
StreamToken::Done { .. } => break,
StreamToken::Error(e) => anyhow::bail!("generation error: {}", e),
}

View file

@ -40,14 +40,15 @@ struct ScoreResult {
total_logprob: f64,
}
/// Convert a flat (prompt_tokens, images) pair into the interleaved
/// chunks the session protocol expects. Tokens up to the next
/// `<|vision_start|>` become a Tokens chunk; each
/// `<|vision_start|>..<|vision_end|>` run collapses into one Image
/// chunk paired by position with the next entry in `images`. The
/// server re-expands the IMAGE_PADs on AppendImage.
fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
let mut out: Vec<WireChunk> = Vec::new();
/// Find each `<|vision_start|>...<|vision_end|>` run in the flat prompt
/// and pair it, by position, with the matching entry in `images`.
/// Returns a list of `ImageAttachment`s with absolute pad-range
/// positions, ready to drop into `GenerateRequest.images`.
fn pair_images_to_ranges(
prompt: &[u32],
images: &[WireImage],
) -> Vec<pb::ImageAttachment> {
let mut out: Vec<pb::ImageAttachment> = Vec::new();
let mut cur = 0;
let mut img_idx = 0;
while cur < prompt.len() {
@ -60,22 +61,16 @@ fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
let img = images.get(img_idx)
.unwrap_or_else(|| panic!(
"image index {} out of range for {} images", img_idx, images.len()));
out.push(WireChunk::Image {
out.push(pb::ImageAttachment {
bytes: img.bytes.clone(),
mime: img.mime.clone(),
known_expanded_len: (end - cur) as u32,
pad_range_start: cur as u32,
pad_range_end: end as u32,
});
img_idx += 1;
cur = end;
} else {
let next_vs = prompt[cur..].iter()
.position(|&t| t == tokenizer::VISION_START);
let end = match next_vs {
Some(o) => cur + o,
None => prompt.len(),
};
out.push(WireChunk::Tokens(prompt[cur..end].to_vec()));
cur = end;
cur += 1;
}
}
out
@ -95,36 +90,22 @@ async fn call_score(
return Ok(Vec::new());
}
let chunks = prompt_to_chunks(prompt, images);
let images_pb = pair_images_to_ranges(prompt, images);
let mut handle = SessionHandle::open(client).await?;
// Walk chunks: AppendImage for each image, prefill-only Generate
// for each text run between images. Accumulate any trailing text
// run into `pending` for the final logprob-generating Generate.
let mut pending: Vec<u32> = Vec::new();
for chunk in chunks {
match chunk {
WireChunk::Tokens(t) => pending.extend(t),
WireChunk::Image { bytes, mime, .. } => {
if !pending.is_empty() {
handle.prefill_only(std::mem::take(&mut pending)).await?;
}
handle.append_image(bytes, mime, false).await?;
}
}
}
// Final Generate: max_tokens=0 so the server runs prefill of the
// trailing `pending` tokens and emits Token events for each
// position covered by logprobs_ranges, then Done. logprob_top_k=0
// means "just the sampled (prompt) token's logprob" — no top-k
// alternatives, which is all call_score historically needed.
// full prompt and emits Token events for each position covered
// by logprobs_ranges, then Done. logprob_top_k=0 means "just
// the sampled (prompt) token's logprob" — no top-k alternatives,
// which is all call_score historically needed. Images attach
// inline via `images`; the prompt already contains their pre-
// expanded vision blocks at the declared ranges.
let logprobs_ranges: Vec<pb::PositionRange> = ranges.iter()
.map(|(s, e)| pb::PositionRange { start: *s as u32, end: *e as u32 })
.collect();
let req = pb::GenerateRequest {
session_id: handle.session_id.clone(),
append_tokens: pending,
append_tokens: prompt.to_vec(),
offset: handle.committed_len,
truncating: false,
max_tokens: 0,
@ -136,6 +117,7 @@ async fn call_score(
top_k: 0,
stop_token_ids: Vec::new(),
priority: priority.unwrap_or(0),
images: images_pb,
};
let mut stream = handle.generate(req).await?;