salience: client-side pad expansion, drop AppendImage
Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). The Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the `WireChunk::Image` branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add a 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug aid: lets us see what the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
4feebb7bc4
commit
fe232cf292
12 changed files with 468 additions and 306 deletions
|
|
@ -40,14 +40,15 @@ struct ScoreResult {
|
|||
total_logprob: f64,
|
||||
}
|
||||
|
||||
/// Convert a flat (prompt_tokens, images) pair into the interleaved
|
||||
/// chunks the session protocol expects. Tokens up to the next
|
||||
/// `<|vision_start|>` become a Tokens chunk; each
|
||||
/// `<|vision_start|>..<|vision_end|>` run collapses into one Image
|
||||
/// chunk paired by position with the next entry in `images`. The
|
||||
/// server re-expands the IMAGE_PADs on AppendImage.
|
||||
fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
|
||||
let mut out: Vec<WireChunk> = Vec::new();
|
||||
/// Find each <|vision_start|>...<|vision_end|> run in the flat prompt
|
||||
/// and pair it with the matching entry in `images`. Returns a list
|
||||
/// of `ImageAttachment` with absolute pad-range positions, ready
|
||||
/// to drop into `GenerateRequest.images`.
|
||||
fn pair_images_to_ranges(
|
||||
prompt: &[u32],
|
||||
images: &[WireImage],
|
||||
) -> Vec<pb::ImageAttachment> {
|
||||
let mut out: Vec<pb::ImageAttachment> = Vec::new();
|
||||
let mut cur = 0;
|
||||
let mut img_idx = 0;
|
||||
while cur < prompt.len() {
|
||||
|
|
@ -60,22 +61,16 @@ fn prompt_to_chunks(prompt: &[u32], images: &[WireImage]) -> Vec<WireChunk> {
|
|||
let img = images.get(img_idx)
|
||||
.unwrap_or_else(|| panic!(
|
||||
"image index {} out of range for {} images", img_idx, images.len()));
|
||||
out.push(WireChunk::Image {
|
||||
out.push(pb::ImageAttachment {
|
||||
bytes: img.bytes.clone(),
|
||||
mime: img.mime.clone(),
|
||||
known_expanded_len: (end - cur) as u32,
|
||||
pad_range_start: cur as u32,
|
||||
pad_range_end: end as u32,
|
||||
});
|
||||
img_idx += 1;
|
||||
cur = end;
|
||||
} else {
|
||||
let next_vs = prompt[cur..].iter()
|
||||
.position(|&t| t == tokenizer::VISION_START);
|
||||
let end = match next_vs {
|
||||
Some(o) => cur + o,
|
||||
None => prompt.len(),
|
||||
};
|
||||
out.push(WireChunk::Tokens(prompt[cur..end].to_vec()));
|
||||
cur = end;
|
||||
cur += 1;
|
||||
}
|
||||
}
|
||||
out
|
||||
|
|
@ -95,36 +90,22 @@ async fn call_score(
|
|||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let chunks = prompt_to_chunks(prompt, images);
|
||||
let images_pb = pair_images_to_ranges(prompt, images);
|
||||
let mut handle = SessionHandle::open(client).await?;
|
||||
|
||||
// Walk chunks: AppendImage for each image, prefill-only Generate
|
||||
// for each text run between images. Accumulate any trailing text
|
||||
// run into `pending` for the final logprob-generating Generate.
|
||||
let mut pending: Vec<u32> = Vec::new();
|
||||
for chunk in chunks {
|
||||
match chunk {
|
||||
WireChunk::Tokens(t) => pending.extend(t),
|
||||
WireChunk::Image { bytes, mime, .. } => {
|
||||
if !pending.is_empty() {
|
||||
handle.prefill_only(std::mem::take(&mut pending)).await?;
|
||||
}
|
||||
handle.append_image(bytes, mime, false).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Final Generate: max_tokens=0 so the server runs prefill of the
|
||||
// trailing `pending` tokens and emits Token events for each
|
||||
// position covered by logprobs_ranges, then Done. logprob_top_k=0
|
||||
// means "just the sampled (prompt) token's logprob" — no top-k
|
||||
// alternatives, which is all call_score historically needed.
|
||||
// full prompt and emits Token events for each position covered
|
||||
// by logprobs_ranges, then Done. logprob_top_k=0 means "just
|
||||
// the sampled (prompt) token's logprob" — no top-k alternatives,
|
||||
// which is all call_score historically needed. Images attach
|
||||
// inline via `images`; the prompt already contains their pre-
|
||||
// expanded vision blocks at the declared ranges.
|
||||
let logprobs_ranges: Vec<pb::PositionRange> = ranges.iter()
|
||||
.map(|(s, e)| pb::PositionRange { start: *s as u32, end: *e as u32 })
|
||||
.collect();
|
||||
let req = pb::GenerateRequest {
|
||||
session_id: handle.session_id.clone(),
|
||||
append_tokens: pending,
|
||||
append_tokens: prompt.to_vec(),
|
||||
offset: handle.committed_len,
|
||||
truncating: false,
|
||||
max_tokens: 0,
|
||||
|
|
@ -136,6 +117,7 @@ async fn call_score(
|
|||
top_k: 0,
|
||||
stop_token_ids: Vec::new(),
|
||||
priority: priority.unwrap_or(0),
|
||||
images: images_pb,
|
||||
};
|
||||
|
||||
let mut stream = handle.generate(req).await?;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue