salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now
ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image
  leaf computes its IMAGE_PAD count eagerly at construction from
  height/width; `token_count` is no longer "0 until the server
  tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision
  blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`.
  `WireImage` carries `pad_start` / `pad_end` (absolute positions
  in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel
  images list, filter to those past `match_upto`, and pass them
  in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`,
  `ContextState::commit_image_token_counts`,
  `StreamToken::ImageAppended`, the WireChunk::Image branch in
  `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id
  vectors in place of text content (debug-aid: lets us see what
  the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this
  disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-24 20:26:47 -04:00
commit fe232cf292
12 changed files with 468 additions and 306 deletions

View file

@ -58,21 +58,26 @@ service Salience {
// boundary).
rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);
// Append an image to the session. Server decodes, runs vLLM's
// multimodal pipeline to compute N (IMAGE_PAD count), and writes
// the whole vision block into session.tokens. Returns N and the
// new total length.
rpc AppendImage(AppendImageRequest) returns (AppendImageResponse);
// Prefill + optionally decode. See GenerateRequest for full
// semantics; stream yields Token events (with optional readouts /
// logprobs per position) followed by a terminating Done.
// Prefill + optionally decode. Images are attached inline via
// `GenerateRequest.images`; the client writes its own pre-expanded
// <|vision_start|> + N*<|image_pad|> + <|vision_end|> runs into
// `append_tokens` and declares each run's range in `images[i]`.
// Server validates run length against the actual vision-encoder
// feature count and returns INVALID_ARGUMENT on mismatch. Stream
// yields Token events (with optional readouts / logprobs per
// position) followed by a terminating Done.
rpc Generate(GenerateRequest) returns (stream GenerateEvent);
// Readout manifest for the currently-loaded model: concept names,
// layer indices, tensor dtype. Stateless; fetch once at client
// startup and cache.
rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);
// Dump the full token stream of a session. Debug-only: used by the
// client to verify its local accounting against the server's
// session.tokens byte-for-byte when divergence is suspected. Not
// cheap: copies the whole sequence across the wire.
rpc DumpSession(DumpSessionRequest) returns (DumpSessionResponse);
}
// ============================================================
@ -106,55 +111,47 @@ message ForkSessionResponse {
string session_id = 1; // new session
}
// ============================================================
// Mutation
// ============================================================
message AppendImageRequest {
  // Session the image is appended to.
  string session_id = 1;

  // Encoded image data (PNG / JPEG / WebP / etc.).
  bytes data = 2;

  // MIME type of the image, e.g. "image/png".
  string mime = 3;

  // Session token length as the client currently sees it. Has to
  // match the server's actual length exactly, unless truncating=true,
  // in which case it must be strictly less. Anything else is rejected
  // with FAILED_PRECONDITION.
  uint32 offset = 4;

  // When set, the server cuts session.tokens back to `offset` before
  // appending. A cut that would split an image block is rejected with
  // FAILED_PRECONDITION.
  bool truncating = 5;
}
message AppendImageResponse {
  // Number of <|image_pad|> tokens inside the vision block. The
  // <|vision_start|> / <|vision_end|> bookends (one token each) are
  // not counted here.
  uint32 placeholder_count = 1;

  // Total session token length once this append is done, both
  // bookends included (= offset + placeholder_count + 2, barring
  // truncation).
  uint32 total_length = 2;
}
// ============================================================
// Inference
// ============================================================
// A single image accompanying a Generate call. The client itself
// writes the expanded placeholder run (VISION_START + N*IMAGE_PAD +
// VISION_END) into `GenerateRequest.append_tokens`, occupying
// positions [pad_range_start, pad_range_end), and pairs it with the
// matching `ImageAttachment` entry. The server checks the declared
// range's pad count against what the vision encoder produces and
// returns INVALID_ARGUMENT when the two disagree.
message ImageAttachment {
  // Encoded image data (PNG / JPEG / WebP / etc.).
  bytes bytes = 1;

  // MIME type of the image, e.g. "image/png".
  string mime = 2;

  // Absolute token positions, measured in `session.tokens` AFTER
  // `append_tokens` has been applied, covering the whole vision block
  // `[vision_start, pad*N, vision_end]`. The end is exclusive, hence
  // pad_range_end - pad_range_start == N + 2.
  uint32 pad_range_start = 3;
  uint32 pad_range_end = 4;
}
message GenerateRequest {
string session_id = 1;
// Tokens to append before prefill. May be empty. Client must NOT
// include vision tokens (<|vision_start|>, <|image_pad|>,
// <|vision_end|>); those live in the session via AppendImage.
// Tokens to append before prefill. May be empty. Client writes the
// full vision block (VISION_START + N*IMAGE_PAD + VISION_END) for
// any newly-attached image directly into this stream; each such
// block must be paired with a matching entry in `images`. The
// server validates that the declared ranges all point at IMAGE_PAD
// runs and that each run's length matches what the vision encoder
// produces for the corresponding image.
repeated uint32 append_tokens = 2;
// Offset / truncating: same semantics as AppendImage. Truncation
// that splits an image block is FAILED_PRECONDITION.
// Client's view of session.tokens length at the time of the call.
// Must equal server's actual length, OR be strictly less when
// truncating=true (server rewinds before appending). Any other
// mismatch is FAILED_PRECONDITION.
uint32 offset = 3;
bool truncating = 4;
@ -185,6 +182,12 @@ message GenerateRequest {
// vLLM scheduler priority (0 = interactive, 10 = batch).
int32 priority = 13;
// Images newly attached on this call. Each entry describes one
// image's binary bytes, its mime type, and the exact token-position
// range of its pre-expanded placeholder run inside `session.tokens`
// after `append_tokens` is applied. See `ImageAttachment`.
repeated ImageAttachment images = 14;
}
message PositionRange {
@ -258,3 +261,16 @@ message ReadoutManifest {
uint32 hidden_size = 3;
string dtype = 4;
}
// ============================================================
// Debug
// ============================================================
message DumpSessionRequest {
  // Session whose token stream is to be dumped.
  string session_id = 1;
}
message DumpSessionResponse {
  // Verbatim copy of the complete session.tokens sequence.
  repeated uint32 tokens = 1 [packed = true];
}