// agent/api/salience.rs — gRPC client bindings for salience.v1. // // Thin wrapper around the tonic-generated types. Every RPC except // Generate is unary; Generate is server-streaming. Free functions // (open/close session) wrap the lifecycle RPCs; `SessionHandle` just // carries the id + connection params so later RPCs can reuse them. // // The old bidi Session() API is gone — see git history for its shape. #![allow(clippy::enum_variant_names)] use anyhow::{Context, Result}; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint}; /// Generated prost + tonic types for salience.v1. Call sites use /// `pb::OpenSessionRequest`, `pb::Token`, etc. pub mod pb { tonic::include_proto!("salience.v1"); } pub type SalienceClient = pb::salience_client::SalienceClient; /// Open a TLS-aware gRPC channel to the salience server. `base_url` /// looks like `https://host:8443`. User-provided CA certs under /// `~/.consciousness/certs/` are trusted in addition to the system /// roots (for self-signed server certs). /// /// Returns the raw `Channel` so callers (`ApiClient::salience_client`) /// can cache it and clone a `SalienceClient` per request without /// reopening the TCP/TLS connection. tonic multiplexes RPCs over the /// shared channel automatically. pub async fn connect_channel(base_url: &str) -> Result { let mut endpoint = Endpoint::from_shared(base_url.to_string()) .with_context(|| format!("invalid salience endpoint: {}", base_url))? .connect_timeout(std::time::Duration::from_secs(30)) .timeout(std::time::Duration::from_secs(600)); if base_url.starts_with("https://") { let user_certs = super::http::load_user_certs_pem_bytes(); let mut tls = ClientTlsConfig::new().with_native_roots(); if !user_certs.is_empty() { tls = tls.ca_certificate(Certificate::from_pem(user_certs)); } endpoint = endpoint .tls_config(tls) .with_context(|| "configuring tonic TLS")?; } endpoint .connect() .await .with_context(|| format!("failed to connect to salience server at {}", base_url)) } /// Derive the gRPC base URL from the HTTP completions base URL. /// /// vLLM's salience gRPC server listens on a different port (8443) from /// the HTTP endpoint (8000) and accepts no path component. Given an /// HTTP base like `https://host:8000/v1`, produce `https://host:8443`. /// No-op when the path is empty and the port isn't 8000. pub fn derive_grpc_url(http_base: &str) -> String { let mut url = http_base.trim_end_matches('/').to_string(); if let Some(proto_end) = url.find("://") { let rest_start = proto_end + 3; if let Some(path_slash) = url[rest_start..].find('/') { url.truncate(rest_start + path_slash); } } url.replace(":8000", ":8443") } /// Attach a bearer token to a tonic request as gRPC metadata. pub fn with_auth(req: &mut tonic::Request, api_key: &str) { if api_key.is_empty() { return; } let bearer = format!("Bearer {}", api_key); if let Ok(val) = bearer.parse() { req.metadata_mut().insert("authorization", val); } } /// Handle to a server-side session. Carries the id + an `ApiClient` /// clone (which holds the shared tonic Channel) so subsequent /// per-session RPCs go over the process-global connection. /// `committed_len` tracks the server's current session.tokens length /// so the client can submit deltas with the right `offset`. pub struct SessionHandle { pub session_id: String, pub max_model_len: u32, pub committed_len: u32, client: super::ApiClient, } impl SessionHandle { pub async fn open(client: &super::ApiClient) -> Result { let mut c = client.salience_client().await?; let mut req = tonic::Request::new(pb::OpenSessionRequest { model: client.model.clone(), }); with_auth(&mut req, client.api_key()); let resp = c .open_session(req) .await .with_context(|| "OpenSession RPC failed")? .into_inner(); log::debug!(target: "grpc", "SessionHandle::open session_id={} max_model_len={}", resp.session_id, resp.max_model_len); Ok(Self { session_id: resp.session_id, max_model_len: resp.max_model_len, committed_len: 0, client: client.clone(), }) } pub fn client(&self) -> &super::ApiClient { &self.client } /// Append an image via the server-side vision block. Updates /// `committed_len` from the server's response on success. pub async fn append_image( &mut self, data: Vec, mime: String, truncating: bool, ) -> Result { let mut c = self.client.salience_client().await?; let mut req = tonic::Request::new(pb::AppendImageRequest { session_id: self.session_id.clone(), data, mime, offset: self.committed_len, truncating, }); with_auth(&mut req, self.client.api_key()); let resp = c .append_image(req) .await .with_context(|| "AppendImage RPC failed")? .into_inner(); self.committed_len = resp.total_length; Ok(resp) } /// Open a gRPC Generate stream with the given request. Caller /// iterates the returned stream of GenerateEvents; the handle's /// `committed_len` should be advanced by the caller on Done based /// on the Done event's `total_tokens` field. pub async fn generate( &self, req: pb::GenerateRequest, ) -> Result> { let mut c = self.client.salience_client().await?; let mut req = tonic::Request::new(req); with_auth(&mut req, self.client.api_key()); let resp = c .generate(req) .await .with_context(|| "Generate RPC failed")?; Ok(resp.into_inner()) } /// Run a prefill-only Generate (max_tokens=0) that appends the /// given tokens to the session. No decode, no Token events — the /// server just extends session.tokens and runs prefill to warm /// the KV cache. Used to interleave text runs between AppendImage /// calls, and by score paths that want prompt_logprobs without a /// decode step. pub async fn prefill_only(&mut self, tokens: Vec) -> Result<()> { use futures::StreamExt; let req = pb::GenerateRequest { session_id: self.session_id.clone(), append_tokens: tokens, offset: self.committed_len, truncating: false, max_tokens: 0, logprobs_ranges: Vec::new(), logprob_top_k: 0, readout_ranges: Vec::new(), temperature: 0.0, top_p: 0.0, top_k: 0, stop_token_ids: Vec::new(), priority: 0, }; let mut stream = self.generate(req).await?; while let Some(event) = stream.next().await { let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?; if let Some(pb::generate_event::Event::Done(d)) = event.event { self.committed_len = d.total_tokens; } } Ok(()) } } /// Drop → fire CloseSession in a detached task so servers don't leak /// sessions until TTL eviction. Best-effort: if no tokio runtime is /// available we skip; the server's 30min TTL will reap it eventually. impl Drop for SessionHandle { fn drop(&mut self) { if self.session_id.is_empty() { return; } let session_id = std::mem::take(&mut self.session_id); let client = self.client.clone(); let Ok(rt) = tokio::runtime::Handle::try_current() else { log::debug!(target: "grpc", "SessionHandle drop outside tokio runtime, session {} leaks to TTL", session_id); return; }; rt.spawn(async move { let Ok(mut c) = client.salience_client().await else { return }; let mut req = tonic::Request::new(pb::CloseSessionRequest { session_id: session_id.clone(), }); with_auth(&mut req, client.api_key()); if let Err(e) = c.close_session(req).await { log::debug!(target: "grpc", "CloseSession on drop failed for {}: {:#}", session_id, e); } }); } } #[cfg(test)] mod tests { use super::*; #[test] fn generated_types_compile() { // Exercise the shape of the new proto types — if build.rs // stops regenerating against the proto, this stops compiling. let _open = pb::OpenSessionRequest { model: "qwen3-vl".into(), }; let _tok = pb::Token { id: 42, position: 0, is_prefill: false, readout: vec![0.1, 0.2, 0.3], logprobs: vec![pb::TokenLogprob { id: 1, logprob: -0.5, }], sampled_logprob: -0.1, has_sampled_logprob: true, }; let _done = pb::GenerateDone { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30, finish_reason: pb::generate_done::FinishReason::Eos as i32, }; let _evt = pb::GenerateEvent { event: Some(pb::generate_event::Event::Done(_done)), }; } #[test] fn derive_grpc_url_cases() { assert_eq!( derive_grpc_url("https://host:8000/v1"), "https://host:8443", ); assert_eq!( derive_grpc_url("https://host:8000/"), "https://host:8443", ); assert_eq!( derive_grpc_url("https://host:9000/v1"), "https://host:9000", ); } }