forked from kent/consciousness
salience: client-side pad expansion, drop AppendImage
Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list. - Productionize `qwen3_image_token_count` (was test-only). Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us." - WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream. - `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime. - `assemble_prompt` returns `(chunks, images, match_upto)`. - `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries. - Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper. - Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug-aid: lets us see what the server actually has when output is suspicious). - Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
4feebb7bc4
commit
fe232cf292
12 changed files with 468 additions and 306 deletions
|
|
@ -58,21 +58,26 @@ service Salience {
|
|||
// boundary).
|
||||
rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);
|
||||
|
||||
// Append an image to the session. Server decodes, runs vLLM's
|
||||
// multimodal pipeline to compute N (IMAGE_PAD count), and writes
|
||||
// the whole vision block into session.tokens. Returns N and the
|
||||
// new total length.
|
||||
rpc AppendImage(AppendImageRequest) returns (AppendImageResponse);
|
||||
|
||||
// Prefill + optionally decode. See GenerateRequest for full
|
||||
// semantics; stream yields Token events (with optional readouts /
|
||||
// logprobs per position) followed by a terminating Done.
|
||||
// Prefill + optionally decode. Images are attached inline via
|
||||
// `GenerateRequest.images`; the client writes its own pre-expanded
|
||||
// <|vision_start|> + N*<|image_pad|> + <|vision_end|> runs into
|
||||
// `append_tokens` and declares each run's range in `images[i]`.
|
||||
// Server validates run length against the actual vision-encoder
|
||||
// feature count and returns INVALID_ARGUMENT on mismatch. Stream
|
||||
// yields Token events (with optional readouts / logprobs per
|
||||
// position) followed by a terminating Done.
|
||||
rpc Generate(GenerateRequest) returns (stream GenerateEvent);
|
||||
|
||||
// Readout manifest for the currently-loaded model — concept names,
|
||||
// layer indices, tensor dtype. Stateless; fetch once at client
|
||||
// startup and cache.
|
||||
rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);
|
||||
|
||||
// Dump the full token stream of a session. Debug-only: used by the
|
||||
// client to verify its local accounting against the server's
|
||||
// session.tokens byte-for-byte when divergence is suspected. Not
|
||||
// cheap — copies the whole sequence across the wire.
|
||||
rpc DumpSession(DumpSessionRequest) returns (DumpSessionResponse);
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
|
|
@ -106,55 +111,47 @@ message ForkSessionResponse {
|
|||
string session_id = 1; // new session
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Mutation
|
||||
// ============================================================
|
||||
|
||||
// Request message for the AppendImage RPC.
message AppendImageRequest {
  // Session the image is appended to.
  string session_id = 1;

  // Encoded image payload (PNG / JPEG / WebP / …).
  bytes data = 2;

  // MIME type of `data`, e.g. "image/png".
  string mime = 3;

  // Session token length as the client currently sees it. It has to
  // match the server's actual length exactly, unless truncating=true,
  // in which case it has to be strictly smaller. Any mismatch is
  // FAILED_PRECONDITION.
  uint32 offset = 4;

  // When true, the server cuts session.tokens back to `offset` before
  // appending. A truncation that would split an image block is
  // rejected with FAILED_PRECONDITION.
  bool truncating = 5;
}
|
||||
|
||||
// Response message for the AppendImage RPC.
message AppendImageResponse {
  // Number of <|image_pad|> tokens inside the vision block. The
  // <|vision_start|> / <|vision_end|> bookends, one token each, are
  // not part of this count.
  uint32 placeholder_count = 1;

  // Total session token length once the image has been appended,
  // both bookends included (= offset + placeholder_count + 2, barring
  // truncation).
  uint32 total_length = 2;
}
|
||||
|
||||
// ============================================================
|
||||
// Inference
|
||||
// ============================================================
|
||||
|
||||
// A single image carried by a Generate call.
//
// The client itself writes the expanded placeholder run
// (VISION_START + N*IMAGE_PAD + VISION_END) into
// `GenerateRequest.append_tokens`, such that the run occupies
// positions [pad_range_start, pad_range_end), and pairs that run
// with the corresponding `ImageAttachment` entry. The server checks
// the pad count of the declared range against what the vision
// encoder produces and returns INVALID_ARGUMENT when they disagree.
message ImageAttachment {
  // Encoded image payload (PNG / JPEG / WebP / …).
  bytes bytes = 1;

  // MIME type of the payload, e.g. "image/png".
  string mime = 2;

  // Inclusive start of the vision block, as an absolute token
  // position in `session.tokens` AFTER `append_tokens` is applied.
  // Together with `pad_range_end` it spans the full block —
  // `[vision_start, pad*N, vision_end]`.
  uint32 pad_range_start = 3;

  // Exclusive end of the vision block, in the same absolute
  // coordinates as `pad_range_start`, so
  // pad_range_end - pad_range_start == N + 2.
  uint32 pad_range_end = 4;
}
|
||||
|
||||
message GenerateRequest {
|
||||
string session_id = 1;
|
||||
|
||||
// Tokens to append before prefill. May be empty. Client must NOT
|
||||
// include vision tokens (<|vision_start|>, <|image_pad|>,
|
||||
// <|vision_end|>) — those live in the session via AppendImage.
|
||||
// Tokens to append before prefill. May be empty. Client writes the
|
||||
// full vision block (VISION_START + N*IMAGE_PAD + VISION_END) for
|
||||
// any newly-attached image directly into this stream; each such
|
||||
// block must be paired with a matching entry in `images`. The
|
||||
// server validates that the declared ranges all point at IMAGE_PAD
|
||||
// runs and that each run's length matches what the vision encoder
|
||||
// produces for the corresponding image.
|
||||
repeated uint32 append_tokens = 2;
|
||||
|
||||
// Offset / truncating — same semantics as AppendImage. Truncation
|
||||
// that splits an image block is FAILED_PRECONDITION.
|
||||
// Client's view of session.tokens length at the time of the call.
|
||||
// Must equal server's actual length, OR be strictly less when
|
||||
// truncating=true (server rewinds before appending). Any other
|
||||
// mismatch is FAILED_PRECONDITION.
|
||||
uint32 offset = 3;
|
||||
bool truncating = 4;
|
||||
|
||||
|
|
@ -185,6 +182,12 @@ message GenerateRequest {
|
|||
|
||||
// vLLM scheduler priority (0 = interactive, 10 = batch).
|
||||
int32 priority = 13;
|
||||
|
||||
// Images newly attached on this call. Each entry describes one
|
||||
// image's binary bytes, its mime type, and the exact token-position
|
||||
// range of its pre-expanded placeholder run inside `session.tokens`
|
||||
// after `append_tokens` is applied. See `ImageAttachment`.
|
||||
repeated ImageAttachment images = 14;
|
||||
}
|
||||
|
||||
message PositionRange {
|
||||
|
|
@ -258,3 +261,16 @@ message ReadoutManifest {
|
|||
uint32 hidden_size = 3;
|
||||
string dtype = 4;
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Debug
|
||||
// ============================================================
|
||||
|
||||
// Request message for the DumpSession RPC.
message DumpSessionRequest {
  // Session whose token stream is to be dumped.
  string session_id = 1;
}
|
||||
|
||||
// Response message for the DumpSession RPC.
message DumpSessionResponse {
  // The full session.tokens sequence, verbatim.
  //
  // No explicit `[packed = true]`: proto3 already packs repeated
  // scalar numeric fields by default, so the wire encoding is the
  // same and the declaration matches `GenerateRequest.append_tokens`.
  repeated uint32 tokens = 1;
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue