// salience.proto — stateful generation + per-token concept readout over gRPC.
//
// Shape:
//   - One server-streaming RPC (Generate) for inference. Every other
//     operation is unary. This is the minimum streaming we need —
//     tokens arrive one at a time with optional readouts / logprobs —
//     and keeping everything else unary makes the client dramatically
//     simpler than a single bidi state machine did.
//
//   - Server-side sessions hold the token list and image binaries.
//     Sessions exist for bandwidth: at 200K tokens we'd otherwise
//     re-ship ~800KB every turn, which hurts badly over a WAN link.
//     vLLM's prefix cache holds the KV; the session just gives the
//     client a handle so it can send deltas.
//
//   - The client is the source of truth for prompt content, including
//     the pre-expanded vision block (<|vision_start|> + IMAGE_PAD×N +
//     <|vision_end|>) that it writes into GenerateRequest.append_tokens
//     for every attached image. The server is the source of truth for
//     image token expansion (how many IMAGE_PAD tokens an image becomes
//     under this model): it checks each declared block against the
//     vision encoder's feature count and rejects a mismatch with
//     INVALID_ARGUMENT.
//
//   - Every mutation (currently only Generate) carries
//     (offset, truncating): the client's view of the server's current
//     length, plus whether the client is deliberately rewriting
//     history. Server validates on each call and rejects drift.
//     No silent divergence, no migration bugs.
//
//   - Errors use gRPC status codes. NOT_FOUND for missing sessions,
//     FAILED_PRECONDITION for offset drift or image-block splits,
//     INVALID_ARGUMENT for a vision-block length that disagrees with
//     the vision encoder, RESOURCE_EXHAUSTED for context overflow,
//     ABORTED for "session busy".
//
// Not in v1:
//   - Authentication beyond a shared bearer token in gRPC metadata.
//   - Multi-tenant session namespacing.
//   - Sampling traces beyond top-k logprobs.

syntax = "proto3";

package salience.v1;

// ============================================================
// Service
// ============================================================

service Salience {
  // Create a fresh session. Client uses session_id on every subsequent
  // RPC until CloseSession or TTL eviction (default 30 min idle). To
  // refresh TTL across a long pause, issue a no-op Generate (empty
  // append_tokens, max_tokens=0, no ranges).
  rpc OpenSession(OpenSessionRequest) returns (OpenSessionResponse);

  // Release the session's tokens + images. Idempotent.
  rpc CloseSession(CloseSessionRequest) returns (CloseSessionResponse);

  // Branch a session at a given token position. The new session
  // inherits tokens [0, at_position) and any images whose vision
  // block lies fully in that range. Rejected with FAILED_PRECONDITION
  // if at_position falls inside an image block (client picks a clean
  // boundary).
  rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);

  // Prefill + optionally decode. Images are attached inline via
  // `GenerateRequest.images`; the client writes its own pre-expanded
  // <|vision_start|> + N*<|image_pad|> + <|vision_end|> runs into
  // `append_tokens` and declares each run's range in `images[i]`.
  // Server validates run length against the actual vision-encoder
  // feature count and returns INVALID_ARGUMENT on mismatch. Stream
  // yields Token events (with optional readouts / logprobs per
  // position) followed by a terminating Done.
  rpc Generate(GenerateRequest) returns (stream GenerateEvent);

  // Readout manifest for the currently-loaded model — concept names,
  // layer indices, tensor dtype. Stateless; fetch once at client
  // startup and cache.
  rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);

  // Dump the full token stream of a session. Debug-only: used by the
  // client to verify its local accounting against the server's
  // session.tokens byte-for-byte when divergence is suspected. Not
  // cheap — copies the whole sequence across the wire.
  rpc DumpSession(DumpSessionRequest) returns (DumpSessionResponse);
}

// ============================================================
// Lifecycle
// ============================================================

// Request for OpenSession.
message OpenSessionRequest {
  // Model identifier, must match vLLM's served model. The server
  // only has one model loaded; this is a safety check on what the
  // client thinks it's talking to.
  string model = 1;
}

// Response for OpenSession.
message OpenSessionResponse {
  // Handle the client passes on every subsequent RPC for this session.
  string session_id = 1;

  // NOTE(review): presumably the model's maximum context length in
  // tokens, i.e. the bound behind RESOURCE_EXHAUSTED — confirm.
  uint32 max_model_len = 2;
}

// Request for CloseSession.
message CloseSessionRequest {
  // Session to release.
  string session_id = 1;
}

// Response for CloseSession. Intentionally empty.
message CloseSessionResponse {}

// Request for ForkSession.
message ForkSessionRequest {
  string session_id = 1;   // source session
  uint32 at_position = 2;  // new session inherits tokens [0, at_position)
}

// Response for ForkSession.
message ForkSessionResponse {
  string session_id = 1;  // new session
}

// ============================================================
// Inference
// ============================================================

// One image attached to a Generate call. The client is responsible
// for writing the expanded placeholder run (VISION_START +
// N*IMAGE_PAD + VISION_END) into `GenerateRequest.append_tokens` at
// positions [pad_range_start, pad_range_end) and pairing it with
// the corresponding `ImageAttachment` entry. Server validates that
// the declared range's pad count matches what the vision encoder
// produces, and returns INVALID_ARGUMENT if they disagree.
message ImageAttachment {
  // Image bytes (PNG / JPEG / WebP / …).
  bytes bytes = 1;

  // MIME type, e.g. "image/png".
  string mime = 2;

  // Absolute token positions (in `session.tokens` AFTER `append_tokens`
  // is applied) spanning the full vision block — `[vision_start,
  // pad*N, vision_end]`. end is exclusive, so end - start == N + 2.
  uint32 pad_range_start = 3;
  uint32 pad_range_end = 4;
}

// Request for Generate: append tokens to the session, prefill, and
// optionally decode.
message GenerateRequest {
  // Session to operate on.
  string session_id = 1;

  // Tokens to append before prefill. May be empty. Client writes the
  // full vision block (VISION_START + N*IMAGE_PAD + VISION_END) for
  // any newly-attached image directly into this stream; each such
  // block must be paired with a matching entry in `images`. The
  // server validates that each declared range covers such a vision
  // block and that the block's IMAGE_PAD run length matches what the
  // vision encoder produces for the corresponding image.
  repeated uint32 append_tokens = 2;

  // Client's view of session.tokens length at the time of the call.
  // Must equal server's actual length, OR be strictly less when
  // truncating=true (server rewinds before appending). Any other
  // mismatch is FAILED_PRECONDITION.
  uint32 offset = 3;
  bool truncating = 4;

  // Decode budget. 0 = prefill only (no decode, emit Token events
  // for positions covered by logprobs_ranges / readout_ranges, then
  // Done; replaces the old /score endpoint). >0 = decode up to this
  // many tokens, stopping early on EOS / stop_token_ids.
  uint32 max_tokens = 5;

  // Position ranges (absolute, within the session's post-append
  // token list) at which to emit logprobs on Token events. Empty =
  // no logprobs. `logprob_top_k > 0` returns the top-k alternative
  // tokens at each covered position; `logprob_top_k == 0` returns
  // only the sampled-token's logprob.
  repeated PositionRange logprobs_ranges = 6;
  uint32 logprob_top_k = 7;

  // Position ranges at which to emit concept-readout vectors. Empty
  // = no readouts. Logical shape per position is
  // [n_layers][n_concepts] — see GetReadoutManifest.
  repeated PositionRange readout_ranges = 8;

  // Sampling parameters. Meaningful only when max_tokens > 0.
  float temperature = 9;  // default 1.0 when zero
  float top_p = 10;       // default 1.0 when zero
  uint32 top_k = 11;      // default 0 (disabled)
  repeated uint32 stop_token_ids = 12;

  // vLLM scheduler priority (0 = interactive, 10 = batch).
  int32 priority = 13;

  // Images newly attached on this call. Each entry describes one
  // image's binary bytes, its mime type, and the exact token-position
  // range of its pre-expanded placeholder run inside `session.tokens`
  // after `append_tokens` is applied. See `ImageAttachment`.
  repeated ImageAttachment images = 14;
}

// Half-open range of absolute token positions.
message PositionRange {
  uint32 start = 1;  // inclusive
  uint32 end = 2;    // exclusive
}

// One element of the Generate response stream: either a per-position
// Token or the terminating GenerateDone.
message GenerateEvent {
  oneof event {
    Token token = 1;
    GenerateDone done = 2;
  }
}

// Per-position payload emitted on the Generate stream.
message Token {
  // Token id at this position. For prefill this is the prompt token;
  // for decode it's the sampled token.
  uint32 id = 1;

  // Absolute position in the session's token list.
  uint32 position = 2;

  // True for prefill positions, false for decode.
  bool is_prefill = 3;

  // Concept readout at this position. Empty if the position wasn't
  // covered by readout_ranges.
  // NOTE(review): presumably the [n_layers][n_concepts] readout
  // flattened into one vector; element order is not specified in this
  // file — confirm.
  repeated float readout = 4 [packed = true];

  // Top-k alternative tokens' logprobs at this position — populated
  // when the position is covered by logprobs_ranges and
  // logprob_top_k > 0.
  repeated TokenLogprob logprobs = 5;

  // Logprob of the token at `position` (the prompt token for
  // prefill, the sampled token for decode). Populated when the
  // position is covered by logprobs_ranges.
  float sampled_logprob = 6;

  // Intended as the presence flag for `sampled_logprob`: 0.0 is a
  // valid logprob and a plain proto3 float cannot distinguish it from
  // "unset".
  bool has_sampled_logprob = 7;
}

// One (token id, logprob) pair in a top-k logprob list.
message TokenLogprob {
  uint32 id = 1;
  float logprob = 2;
}

// Terminating event of a Generate stream.
message GenerateDone {
  // Token accounting for the call.
  // NOTE(review): presumably total_tokens == prompt_tokens +
  // completion_tokens; whether prompt_tokens counts the whole session
  // or only this call's prefill is not stated here — confirm.
  uint32 prompt_tokens = 1;
  uint32 completion_tokens = 2;
  uint32 total_tokens = 3;

  // Why generation stopped.
  enum FinishReason {
    FINISH_REASON_UNSPECIFIED = 0;
    FINISH_REASON_EOS = 1;        // emitted EOS / stop token
    FINISH_REASON_LENGTH = 2;     // hit max_tokens
    FINISH_REASON_CANCELLED = 3;  // client cancelled
    // NOTE(review): GenerateRequest carries stop_token_ids but no
    // stop-string field, so nothing in this file can trigger this
    // value — confirm whether it is reserved for later use.
    FINISH_REASON_STOP_STRING = 4;  // matched a stop string
  }
  FinishReason finish_reason = 4;
}

// ============================================================
// Readout manifest
// ============================================================

// Request for GetReadoutManifest. Intentionally empty.
message GetReadoutManifestRequest {}

// Describes the concept readouts produced by the currently-loaded
// model.
message ReadoutManifest {
  // Concept names.
  repeated string concepts = 1;

  // Layer indices.
  repeated uint32 layers = 2;

  // NOTE(review): presumably the model's hidden dimension — confirm.
  uint32 hidden_size = 3;

  // Tensor dtype.
  string dtype = 4;
}

// ============================================================
// Debug
// ============================================================

// Request for DumpSession.
message DumpSessionRequest {
  // Session whose token list is dumped.
  string session_id = 1;
}

// Response for DumpSession.
message DumpSessionResponse {
  // The full session.tokens sequence, verbatim.
  repeated uint32 tokens = 1 [packed = true];
}