// salience.proto — stateful generation + per-token concept readout over gRPC.
//
// Shape:
//   - One server-streaming RPC (Generate) for inference. Every other
//     operation is unary. This is the minimum streaming we need —
//     tokens arrive one at a time with optional readouts / logprobs —
//     and keeping everything else unary makes the client dramatically
//     simpler than it was with a single bidi state machine.
//
//   - Server-side sessions hold the token list and image binaries.
//     Sessions exist for bandwidth: at 200K tokens we'd otherwise
//     re-ship ~800KB every turn, which hurts badly over a WAN link.
//     vLLM's prefix cache holds the KV; the session just gives the
//     client a handle so it can send deltas.
//
//   - The client is the source of truth for prompt content. The server
//     is the source of truth for image token expansion (how many
//     IMAGE_PAD tokens an image becomes under this model). The client
//     never writes vision tokens itself — AppendImage appends the whole
//     <|vision_start|> + IMAGE_PAD×N + <|vision_end|> block server-side.
//
//   - Every token-list mutation (AppendImage, Generate) carries
//     (offset, truncating): the client's view of the server's current
//     length, plus whether the client is deliberately rewriting history.
//     Server validates on each call and rejects drift. No silent
//     divergence, no migration bugs.
//
//   - Errors use gRPC status codes. NOT_FOUND for missing sessions,
//     FAILED_PRECONDITION for offset drift or image-block splits,
//     RESOURCE_EXHAUSTED for context overflow, ABORTED for "session busy".
//
// Not in v1:
//   - Authentication beyond a shared bearer token in gRPC metadata.
//   - Multi-tenant session namespacing.
//   - Sampling traces beyond top-k logprobs.

syntax = "proto3";

package salience.v1;

// ============================================================
//  Service
// ============================================================

// Stateful generation and per-token concept readout. Generate is the
// only streaming RPC (server-streaming); every other method is unary.
service Salience {
  // Create a fresh session. Client uses session_id on every subsequent
  // RPC until CloseSession or TTL eviction (default 30 min idle). To
  // refresh TTL across a long pause, issue a no-op Generate: empty
  // append_tokens, max_tokens=0, no ranges, truncating=false, and
  // offset set to the session's current length (offset is validated on
  // every Generate, so a stale value is rejected, not ignored).
  rpc OpenSession(OpenSessionRequest) returns (OpenSessionResponse);

  // Release the session's tokens + images. Idempotent.
  rpc CloseSession(CloseSessionRequest) returns (CloseSessionResponse);

  // Branch a session at a given token position. The new session
  // inherits tokens [0, at_position) and any images whose vision
  // block lies fully in that range. Rejected with FAILED_PRECONDITION
  // if at_position falls inside an image block (client picks a clean
  // boundary). The new session's id is returned in the response; the
  // request's session_id names the source session.
  rpc ForkSession(ForkSessionRequest) returns (ForkSessionResponse);

  // Append an image to the session. Server decodes, runs vLLM's
  // multimodal pipeline to compute N (IMAGE_PAD count), and writes
  // the whole vision block into session.tokens. Returns N and the
  // new total length. The request carries (offset, truncating) with
  // the same meaning as on Generate.
  rpc AppendImage(AppendImageRequest) returns (AppendImageResponse);

  // Prefill + optionally decode. See GenerateRequest for full
  // semantics; stream yields Token events (with optional readouts /
  // logprobs per position) followed by a terminating Done.
  rpc Generate(GenerateRequest) returns (stream GenerateEvent);

  // Readout manifest for the currently-loaded model — concept names,
  // layer indices, tensor dtype. Stateless; fetch once at client
  // startup and cache.
  rpc GetReadoutManifest(GetReadoutManifestRequest) returns (ReadoutManifest);
}

// ============================================================
//  Lifecycle
// ============================================================

// Request for OpenSession.
message OpenSessionRequest {
  // Model identifier, must match vLLM's served model. The server
  // only has one model loaded; this is a safety check on what the
  // client thinks it's talking to.
  // NOTE(review): the status code returned on a mismatch is not
  // specified in this file — confirm against the server.
  string model = 1;
}

// Response for OpenSession.
message OpenSessionResponse {
  // Handle for the new session. The client passes it on every
  // subsequent RPC until CloseSession or TTL eviction.
  string session_id = 1;
  // NOTE(review): not described in this file. Presumably the loaded
  // model's maximum context length in tokens (the bound behind
  // RESOURCE_EXHAUSTED on context overflow) — confirm.
  uint32 max_model_len = 2;
}

// Request for CloseSession.
message CloseSessionRequest {
  // Session whose tokens and images should be released.
  // NOTE(review): CloseSession is documented as idempotent, while
  // missing sessions are otherwise NOT_FOUND — confirm that closing an
  // already-closed or evicted session returns OK rather than NOT_FOUND.
  string session_id = 1;
}

// Response for CloseSession. Carries no fields.
message CloseSessionResponse {}

// Request for ForkSession: branch an existing session at a token
// position.
message ForkSessionRequest {
  // Source session to branch from.
  string session_id = 1;

  // Branch point. The new session inherits tokens [0, at_position).
  // A position inside an image block is rejected with
  // FAILED_PRECONDITION.
  uint32 at_position = 2;
}

// Response for ForkSession.
message ForkSessionResponse {
  // Id of the newly created session.
  string session_id = 1;
}

// ============================================================
//  Mutation
// ============================================================

// Request for AppendImage: append one image's vision block to a
// session.
message AppendImageRequest {
  // Session to append to.
  string session_id = 1;

  // Encoded image bytes (PNG / JPEG / WebP / …).
  bytes data = 2;

  // MIME type of `data`, e.g. "image/png".
  string mime = 3;

  // The session's current token length as the client sees it. Must
  // equal the server's actual length, or — only when truncating=true —
  // be strictly less. Any mismatch is FAILED_PRECONDITION.
  uint32 offset = 4;

  // When true, the server cuts session.tokens back to `offset` before
  // appending. A cut that would split an image block is rejected with
  // FAILED_PRECONDITION.
  bool truncating = 5;
}

// Response for AppendImage.
message AppendImageResponse {
  // Count of <|image_pad|> tokens inside the vision block. Does not
  // include the <|vision_start|> / <|vision_end|> bookends, which
  // contribute one token each.
  uint32 placeholder_count = 1;

  // Session's total token length after this append, including both
  // bookends. Equals the request's offset + placeholder_count + 2;
  // this holds with truncating=true as well, because the session is
  // cut back to `offset` before the block is appended.
  uint32 total_length = 2;
}

// ============================================================
//  Inference
// ============================================================

// Request for Generate: optionally append tokens, prefill, and
// optionally decode, emitting per-position logprobs / readouts where
// requested.
message GenerateRequest {
  // Session to run against.
  string session_id = 1;

  // Tokens appended to the session before prefill; may be empty.
  // Vision tokens (<|vision_start|>, <|image_pad|>, <|vision_end|>)
  // must NOT appear here — they enter the session only via AppendImage.
  repeated uint32 append_tokens = 2;

  // Client's view of the session length. Same semantics as
  // AppendImageRequest.offset.
  uint32 offset = 3;

  // Same semantics as AppendImageRequest.truncating. Truncation that
  // splits an image block is FAILED_PRECONDITION.
  bool truncating = 4;

  // Decode budget.
  //   0  = prefill only: no decode; Token events are emitted for the
  //        positions covered by logprobs_ranges / readout_ranges, then
  //        Done. Replaces the old /score endpoint.
  //   >0 = decode up to this many tokens, stopping early on EOS /
  //        stop_token_ids.
  uint32 max_tokens = 5;

  // Position ranges (absolute, within the session's post-append token
  // list) at which Token events carry logprobs. Empty = no logprobs.
  repeated PositionRange logprobs_ranges = 6;

  // Number of top alternative tokens returned at each position covered
  // by logprobs_ranges. 0 = only the logprob of the token at that
  // position is returned (see Token.sampled_logprob).
  uint32 logprob_top_k = 7;

  // Position ranges at which Token events carry concept-readout
  // vectors. Empty = no readouts. Logical shape per position is
  // [n_layers][n_concepts] — see GetReadoutManifest.
  repeated PositionRange readout_ranges = 8;

  // Sampling temperature. Like the other sampling parameters (fields
  // 9–12), meaningful only when max_tokens > 0. Zero selects the
  // default, 1.0.
  // NOTE(review): because zero means "default" here and for top_p, an
  // explicit 0 (e.g. temperature 0 for greedy decoding) cannot be
  // expressed — confirm that is intended.
  float temperature = 9;

  // Top-p sampling parameter. Zero selects the default, 1.0.
  float top_p = 10;

  // Top-k sampling parameter. Default 0 (disabled).
  uint32 top_k = 11;

  // Token ids that end decoding early (see max_tokens).
  repeated uint32 stop_token_ids = 12;

  // vLLM scheduler priority (0 = interactive, 10 = batch).
  int32 priority = 13;
}

// Half-open range of token positions: [start, end).
message PositionRange {
  // First position in the range (inclusive).
  uint32 start = 1;

  // End of the range (exclusive).
  uint32 end = 2;
}

// One event on the Generate stream: Token events, followed by a
// terminating Done.
message GenerateEvent {
  oneof event {
    // One token position, with any requested readout / logprobs.
    Token token = 1;

    // Terminating event of the stream.
    GenerateDone done = 2;
  }
}

// One token position on the Generate stream, either prefill or decode.
message Token {
  // Token id at this position: the prompt token for prefill, the
  // sampled token for decode.
  uint32 id = 1;

  // Absolute position in the session's token list.
  uint32 position = 2;

  // True for prefill positions, false for decode.
  bool is_prefill = 3;

  // Concept readout at this position; empty when the position was not
  // covered by readout_ranges. Logical shape is [n_layers][n_concepts]
  // (see GenerateRequest.readout_ranges).
  // NOTE(review): the order in which that shape is flattened into this
  // list is not stated in this file — presumably layer-major; confirm.
  // The explicit packed option matches the proto3 default for repeated
  // scalar fields.
  repeated float readout = 4 [packed = true];

  // Logprobs of the top-k alternative tokens at this position.
  // Populated when the position is covered by logprobs_ranges and
  // logprob_top_k > 0.
  repeated TokenLogprob logprobs = 5;

  // Logprob of the token at `position` (the prompt token for prefill,
  // the sampled token for decode). Populated when the position is
  // covered by logprobs_ranges.
  float sampled_logprob = 6;

  // NOTE(review): presumably the presence flag for sampled_logprob,
  // distinguishing a real logprob of 0.0 from "not populated" —
  // confirm.
  bool has_sampled_logprob = 7;
}

// A token id paired with its logprob; element of Token.logprobs.
message TokenLogprob {
  // Token id.
  uint32 id = 1;

  // Logprob of that token.
  float logprob = 2;
}

// Terminating event of a Generate stream.
message GenerateDone {
  // Why generation ended.
  enum FinishReason {
    // Zero value; no reason given.
    FINISH_REASON_UNSPECIFIED = 0;

    // Emitted EOS / stop token.
    FINISH_REASON_EOS = 1;

    // Hit max_tokens.
    FINISH_REASON_LENGTH = 2;

    // Client cancelled.
    FINISH_REASON_CANCELLED = 3;

    // Matched a stop string.
    // NOTE(review): GenerateRequest in this file exposes stop_token_ids
    // but no stop-string field — confirm how this reason is produced.
    FINISH_REASON_STOP_STRING = 4;
  }

  // NOTE(review): the three counters below are not described in this
  // file. Presumably prompt_tokens counts prefilled positions,
  // completion_tokens counts decoded tokens, and total_tokens is their
  // sum — confirm.
  uint32 prompt_tokens = 1;
  uint32 completion_tokens = 2;
  uint32 total_tokens = 3;

  // Reason the stream terminated.
  FinishReason finish_reason = 4;
}

// ============================================================
//  Readout manifest
// ============================================================

// Request for GetReadoutManifest. Carries no fields.
message GetReadoutManifestRequest {}

// Readout manifest for the currently-loaded model.
message ReadoutManifest {
  // Concept names.
  // NOTE(review): presumably ordered to match the n_concepts axis of
  // Token.readout — confirm.
  repeated string concepts = 1;

  // Layer indices.
  // NOTE(review): presumably ordered to match the n_layers axis of
  // Token.readout — confirm.
  repeated uint32 layers = 2;

  // NOTE(review): not described in this file; presumably the model's
  // hidden dimension — confirm.
  uint32 hidden_size = 3;

  // Tensor dtype, as a string.
  // NOTE(review): Token.readout is always sent as `repeated float`;
  // presumably this names the dtype the readout was computed in —
  // confirm, and document the accepted spellings.
  string dtype = 4;
}