salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug aid: lets us see what the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-24 20:26:47 -04:00 · 2026-04-24 20:26:47 -04:00 · fe232cf292
commit fe232cf292
parent 4feebb7bc4
12 changed files with 468 additions and 306 deletions
--- a/proto/salience.proto
+++ b/proto/salience.proto
@@ -58,21 +58,26 @@ service Salience {
  // boundary).
  rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);

-  // Append an image to the session. Server decodes, runs vLLM's
-  // multimodal pipeline to compute N (IMAGE_PAD count), and writes
-  // the whole vision block into session.tokens. Returns N and the
-  // new total length.
-  rpc AppendImage(AppendImageRequest) returns (AppendImageResponse);
-
-  // Prefill + optionally decode. See GenerateRequest for full
-  // semantics; stream yields Token events (with optional readouts /
-  // logprobs per position) followed by a terminating Done.
+  // Prefill + optionally decode. Images are attached inline via
+  // `GenerateRequest.images`; the client writes its own pre-expanded
+  // <|vision_start|> + N*<|image_pad|> + <|vision_end|> runs into
+  // `append_tokens` and declares each run's range in `images[i]`.
+  // Server validates run length against the actual vision-encoder
+  // feature count and returns INVALID_ARGUMENT on mismatch. Stream
+  // yields Token events (with optional readouts / logprobs per
+  // position) followed by a terminating Done.
  rpc Generate(GenerateRequest) returns (stream GenerateEvent);

  // Readout manifest for the currently-loaded model — concept names,
  // layer indices, tensor dtype. Stateless; fetch once at client
  // startup and cache.
  rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);
+
+  // Dump the full token stream of a session. Debug-only: used by the
+  // client to verify its local accounting against the server's
+  // session.tokens token-for-token when divergence is suspected. Not
+  // cheap — copies the whole sequence across the wire.
+  rpc DumpSession(DumpSessionRequest) returns (DumpSessionResponse);
 }

 // ============================================================
@@ -106,55 +111,47 @@ message ForkSessionResponse {
  string session_id = 1;    // new session
 }

-// ============================================================
-//  Mutation
-// ============================================================
-
-message AppendImageRequest {
-  string session_id = 1;
-
-  // Image bytes (PNG / JPEG / WebP / …).
-  bytes  data = 2;
-
-  // MIME type, e.g. "image/png".
-  string mime = 3;
-
-  // Client's view of the session's current token length. Must equal
-  // the server's actual length, OR be strictly less when
-  // truncating=true. Any mismatch is FAILED_PRECONDITION.
-  uint32 offset = 4;
-
-  // If true, server truncates session.tokens to `offset` before
-  // appending. Rejected with FAILED_PRECONDITION if the truncation
-  // would split an image block.
-  bool   truncating = 5;
-}
-
-message AppendImageResponse {
-  // Count of <|image_pad|> tokens inside the vision block. Does not
-  // include the <|vision_start|> / <|vision_end|> bookends, which
-  // contribute one token each.
-  uint32 placeholder_count = 1;
-
-  // Session's total token length after this append, including both
-  // bookends (= offset + placeholder_count + 2, barring truncation).
-  uint32 total_length = 2;
-}
-
 // ============================================================
 //  Inference
 // ============================================================

+// One image attached to a Generate call. The client is responsible
+// for writing the expanded placeholder run (VISION_START +
+// N*IMAGE_PAD + VISION_END) into `GenerateRequest.append_tokens` at
+// positions [pad_range_start, pad_range_end) and pairing it with
+// the corresponding `ImageAttachment` entry. Server validates that
+// the declared range's pad count matches what the vision encoder
+// produces, and returns INVALID_ARGUMENT if they disagree.
+message ImageAttachment {
+  // Image bytes (PNG / JPEG / WebP / …).
+  bytes  bytes = 1;
+
+  // MIME type, e.g. "image/png".
+  string mime = 2;
+
+  // Absolute token positions (in `session.tokens` AFTER `append_tokens`
+  // is applied) spanning the full vision block — `[vision_start,
+  // pad*N, vision_end]`. End is exclusive, so end - start == N + 2.
+  uint32 pad_range_start = 3;
+  uint32 pad_range_end = 4;
+}
+
 message GenerateRequest {
  string session_id = 1;

-  // Tokens to append before prefill. May be empty. Client must NOT
-  // include vision tokens (<|vision_start|>, <|image_pad|>,
-  // <|vision_end|>) — those live in the session via AppendImage.
+  // Tokens to append before prefill. May be empty. Client writes the
+  // full vision block (VISION_START + N*IMAGE_PAD + VISION_END) for
+  // any newly-attached image directly into this stream; each such
+  // block must be paired with a matching entry in `images`. The
+  // server validates that the declared ranges all point at such
+  // blocks and that each block's IMAGE_PAD run length matches what
+  // the vision encoder produces for the corresponding image.
  repeated uint32 append_tokens = 2;

-  // Offset / truncating — same semantics as AppendImage. Truncation
-  // that splits an image block is FAILED_PRECONDITION.
+  // Client's view of session.tokens length at the time of the call.
+  // Must equal server's actual length, OR be strictly less when
+  // truncating=true (server rewinds before appending). Any other
+  // mismatch is FAILED_PRECONDITION.
  uint32 offset = 3;
  bool   truncating = 4;

@@ -185,6 +182,12 @@ message GenerateRequest {

  // vLLM scheduler priority (0 = interactive, 10 = batch).
  int32 priority = 13;
+
+  // Images newly attached on this call. Each entry describes one
+  // image's binary bytes, its mime type, and the exact token-position
+  // range of its pre-expanded placeholder run inside `session.tokens`
+  // after `append_tokens` is applied. See `ImageAttachment`.
+  repeated ImageAttachment images = 14;
 }

 message PositionRange {
@@ -258,3 +261,16 @@ message ReadoutManifest {
  uint32          hidden_size = 3;
  string          dtype = 4;
 }
+
+// ============================================================
+//  Debug
+// ============================================================
+
+message DumpSessionRequest {
+  string session_id = 1;
+}
+
+message DumpSessionResponse {
+  // The full session.tokens sequence, verbatim.
+  repeated uint32 tokens = 1 [packed = true];
+}