// salience.proto — stateful generation + per-token concept readout over gRPC.
//
// Shape:
//   - One server-streaming RPC (Generate) for inference. Every other
//     operation is unary. This is the minimum streaming we need —
//     tokens arrive one at a time with optional readouts / logprobs —
//     and keeping everything else unary makes the client dramatically
//     simpler than a single bidi state machine did.
//
//   - Server-side sessions hold the token list and image binaries.
//     Sessions exist for bandwidth: at 200K tokens we'd otherwise
//     re-ship ~800KB every turn, which hurts badly over a WAN link.
//     vLLM's prefix cache holds the KV; the session just gives the
//     client a handle so it can send deltas.
//
//   - The client is the source of truth for prompt content. The server
//     is the source of truth for image token expansion (how many
//     IMAGE_PAD tokens an image becomes under this model). The client
//     never writes vision tokens itself — AppendImage appends the whole
//     <|vision_start|> + IMAGE_PAD×N + <|vision_end|> block server-side.
//
//   - Every mutation carries (offset, truncating): the client's view of
//     the server's current length, plus whether the client is deliberately
//     rewriting history. Server validates on each call and rejects drift.
//     No silent divergence, no migration bugs.
//
//   - Errors use gRPC status codes. NOT_FOUND for missing sessions,
//     FAILED_PRECONDITION for offset drift or image-block splits,
//     RESOURCE_EXHAUSTED for context overflow, ABORTED for "session busy".
//
// Not in v1:
//   - Authentication beyond a shared bearer token in gRPC metadata.
//   - Multi-tenant session namespacing.
//   - Sampling traces beyond top-k logprobs.

syntax = "proto3";

package salience.v1;

// ============================================================
// Service
// ============================================================

// Session-based generation service. Generate is the only streaming RPC;
// every other operation is unary.
service Salience {
  // Create a fresh session. Client uses session_id on every subsequent
  // RPC until CloseSession or TTL eviction (default 30 min idle). To
  // refresh TTL across a long pause, issue a no-op Generate (empty
  // append_tokens, max_tokens=0, no ranges).
  rpc OpenSession(OpenSessionRequest) returns (OpenSessionResponse);

  // Release the session's tokens + images. Idempotent.
  rpc CloseSession(CloseSessionRequest) returns (CloseSessionResponse);

  // Branch a session at a given token position. The new session
  // inherits tokens [0, at_position) and any images whose vision
  // block lies fully in that range. Rejected with FAILED_PRECONDITION
  // if at_position falls inside an image block (client picks a clean
  // boundary).
  rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);

  // Append an image to the session. Server decodes, runs vLLM's
  // multimodal pipeline to compute N (IMAGE_PAD count), and writes
  // the whole vision block into session.tokens. Returns N and the
  // new total length.
  rpc AppendImage(AppendImageRequest) returns (AppendImageResponse);

  // Prefill + optionally decode. See GenerateRequest for full
  // semantics; stream yields Token events (with optional readouts /
  // logprobs per position) followed by a terminating Done.
  rpc Generate(GenerateRequest) returns (stream GenerateEvent);

  // Readout manifest for the currently-loaded model — concept names,
  // layer indices, tensor dtype. Stateless; fetch once at client
  // startup and cache.
  rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);
}

// ============================================================
// Lifecycle
// ============================================================

// Request for OpenSession.
message OpenSessionRequest {
  // Model identifier, must match vLLM's served model. The server
  // only has one model loaded; this is a safety check on what the
  // client thinks it's talking to.
  string model = 1;
}

// Response for OpenSession.
message OpenSessionResponse {
  // Handle the client passes on every subsequent RPC for this session.
  string session_id = 1;

  // NOTE(review): presumably the served model's maximum context length in
  // tokens (the bound behind RESOURCE_EXHAUSTED) — confirm with the server.
  uint32 max_model_len = 2;
}

// Request for CloseSession.
message CloseSessionRequest {
  // Session to release.
  string session_id = 1;
}

// Response for CloseSession. Intentionally empty.
message CloseSessionResponse {}

// Request for ForkSession.
message ForkSessionRequest {
  // Source session.
  string session_id = 1;

  // New session inherits tokens [0, at_position).
  uint32 at_position = 2;
}

// Response for ForkSession.
message ForkSessionResponse {
  // New session.
  string session_id = 1;
}

// ============================================================
// Mutation
// ============================================================

// Request for AppendImage.
message AppendImageRequest {
  // Session to append the image to.
  string session_id = 1;

  // Image bytes (PNG / JPEG / WebP / …).
  bytes data = 2;

  // MIME type, e.g. "image/png".
  string mime = 3;

  // Client's view of the session's current token length. Must equal
  // the server's actual length, OR be strictly less when
  // truncating=true. Any mismatch is FAILED_PRECONDITION.
  uint32 offset = 4;

  // If true, server truncates session.tokens to `offset` before
  // appending. Rejected with FAILED_PRECONDITION if the truncation
  // would split an image block.
  bool truncating = 5;
}

// Response for AppendImage.
message AppendImageResponse {
  // Count of <|image_pad|> tokens inside the vision block. Does not
  // include the <|vision_start|> / <|vision_end|> bookends, which
  // contribute one token each.
  uint32 placeholder_count = 1;

  // Session's total token length after this append, including both
  // bookends (= offset + placeholder_count + 2, barring truncation).
  uint32 total_length = 2;
}

// ============================================================
// Inference
// ============================================================

// Request for Generate: optional append, prefill, and optional decode.
message GenerateRequest {
  // Session to run against.
  string session_id = 1;

  // Tokens to append before prefill. May be empty. Client must NOT
  // include vision tokens (<|vision_start|>, <|image_pad|>,
  // <|vision_end|>) — those live in the session via AppendImage.
  repeated uint32 append_tokens = 2;

  // Offset / truncating — same semantics as AppendImage. Truncation
  // that splits an image block is FAILED_PRECONDITION.
  uint32 offset = 3;
  bool truncating = 4;

  // Decode budget. 0 = prefill only (no decode, emit Token events
  // for positions covered by logprobs_ranges / readout_ranges, then
  // Done; replaces the old /score endpoint). >0 = decode up to this
  // many tokens, stopping early on EOS / stop_token_ids.
  uint32 max_tokens = 5;

  // Position ranges (absolute, within the session's post-append
  // token list) at which to emit logprobs on Token events. Empty =
  // no logprobs. `logprob_top_k > 0` returns the top-k alternative
  // tokens at each covered position; `logprob_top_k == 0` returns
  // only the sampled-token's logprob.
  repeated PositionRange logprobs_ranges = 6;
  uint32 logprob_top_k = 7;

  // Position ranges at which to emit concept-readout vectors. Empty
  // = no readouts. Logical shape per position is
  // [n_layers][n_concepts] — see GetReadoutManifest.
  repeated PositionRange readout_ranges = 8;

  // Sampling parameters. Meaningful only when max_tokens > 0.
  //
  // NOTE(review): these are implicit-presence proto3 scalars, so an
  // explicit 0 is indistinguishable from "unset" on the wire. With the
  // "default when zero" rule below, a literal temperature of 0 cannot be
  // requested through this field — confirm that is intended.

  // Default 1.0 when zero.
  float temperature = 9;

  // Default 1.0 when zero.
  float top_p = 10;

  // Default 0 (disabled).
  uint32 top_k = 11;

  // Token ids that end decoding early (see max_tokens).
  repeated uint32 stop_token_ids = 12;

  // vLLM scheduler priority (0 = interactive, 10 = batch).
  int32 priority = 13;
}

// Half-open range [start, end) of absolute token positions.
message PositionRange {
  // Inclusive.
  uint32 start = 1;

  // Exclusive.
  uint32 end = 2;
}

// One element of the Generate response stream: either a Token or the
// terminating GenerateDone.
message GenerateEvent {
  oneof event {
    Token token = 1;
    GenerateDone done = 2;
  }
}

// Per-position event emitted during prefill or decode.
message Token {
  // Token id at this position. For prefill this is the prompt token;
  // for decode it's the sampled token.
  uint32 id = 1;

  // Absolute position in the session's token list.
  uint32 position = 2;

  // True for prefill positions, false for decode.
  bool is_prefill = 3;

  // Concept readout at this position. Empty if the position wasn't
  // covered by readout_ranges.
  // NOTE(review): the logical shape is [n_layers][n_concepts] (see
  // GenerateRequest.readout_ranges); the flattening order of this flat
  // list is not stated here — confirm against the server.
  // `packed = true` is already the proto3 default for repeated scalars;
  // the explicit option is kept so the descriptor is unchanged.
  repeated float readout = 4 [packed = true];

  // Top-k alternative tokens' logprobs at this position — populated
  // when the position is covered by logprobs_ranges and
  // logprob_top_k > 0.
  repeated TokenLogprob logprobs = 5;

  // Logprob of the token at `position` (the prompt token for
  // prefill, the sampled token for decode). Populated when the
  // position is covered by logprobs_ranges.
  float sampled_logprob = 6;

  // Presumably the explicit presence flag for sampled_logprob, since a
  // logprob of 0.0 is otherwise indistinguishable from "not populated".
  bool has_sampled_logprob = 7;
}

// A single (token id, logprob) pair, used for top-k alternatives.
message TokenLogprob {
  // Token id.
  uint32 id = 1;

  // Log-probability of that token.
  float logprob = 2;
}

// Terminating event of a Generate stream.
message GenerateDone {
  // Token accounting for the call.
  // NOTE(review): exact definitions (e.g. whether prompt_tokens counts the
  // whole session or only newly prefilled tokens) are not stated here —
  // confirm against the server.
  uint32 prompt_tokens = 1;
  uint32 completion_tokens = 2;
  uint32 total_tokens = 3;

  // Why generation stopped.
  enum FinishReason {
    // Zero value; no reason reported.
    FINISH_REASON_UNSPECIFIED = 0;

    // Emitted EOS / stop token.
    FINISH_REASON_EOS = 1;

    // Hit max_tokens.
    FINISH_REASON_LENGTH = 2;

    // Client cancelled.
    FINISH_REASON_CANCELLED = 3;

    // Matched a stop string.
    // NOTE(review): GenerateRequest exposes only stop_token_ids, with no
    // stop-string field — confirm whether this value is reachable in v1.
    FINISH_REASON_STOP_STRING = 4;
  }

  // Reason this stream terminated.
  FinishReason finish_reason = 4;
}

// ============================================================
// Readout manifest
// ============================================================

// Request for GetReadoutManifest. Intentionally empty.
message GetReadoutManifestRequest {}

// Describes the readout tensors for the currently-loaded model.
message ReadoutManifest {
  // Concept names.
  repeated string concepts = 1;

  // Layer indices.
  repeated uint32 layers = 2;

  // NOTE(review): presumably the model's hidden dimension — confirm.
  uint32 hidden_size = 3;

  // Tensor dtype.
  string dtype = 4;
}