// agent/api/salience.rs — gRPC client bindings for salience.v1. // // Thin wrapper around the tonic-generated types. Every RPC except // Generate is unary; Generate is server-streaming. Free functions // (open/close session) wrap the lifecycle RPCs; `SessionHandle` just // carries the id + connection params so later RPCs can reuse them. // // The old bidi Session() API is gone — see git history for its shape. #![allow(clippy::enum_variant_names)] use anyhow::{Context, Result}; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint}; /// Generated prost + tonic types for salience.v1. Call sites use /// `pb::OpenSessionRequest`, `pb::Token`, etc. pub mod pb { tonic::include_proto!("salience.v1"); } pub type SalienceClient = pb::salience_client::SalienceClient; /// Open a TLS-aware gRPC channel to the salience server. `base_url` /// looks like `https://host:8443`. User-provided CA certs under /// `~/.consciousness/certs/` are trusted in addition to the system /// roots (for self-signed server certs). pub async fn connect(base_url: &str) -> Result { let mut endpoint = Endpoint::from_shared(base_url.to_string()) .with_context(|| format!("invalid salience endpoint: {}", base_url))? .connect_timeout(std::time::Duration::from_secs(30)) .timeout(std::time::Duration::from_secs(600)); if base_url.starts_with("https://") { let user_certs = super::http::load_user_certs_pem_bytes(); let mut tls = ClientTlsConfig::new().with_native_roots(); if !user_certs.is_empty() { tls = tls.ca_certificate(Certificate::from_pem(user_certs)); } endpoint = endpoint .tls_config(tls) .with_context(|| "configuring tonic TLS")?; } let channel = endpoint .connect() .await .with_context(|| format!("failed to connect to salience server at {}", base_url))?; Ok(pb::salience_client::SalienceClient::new(channel)) } /// Derive the gRPC base URL from the HTTP completions base URL. 
///
/// vLLM's salience gRPC server listens on a different port (8443) from
/// the HTTP endpoint (8000) and accepts no path component. Given an
/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
/// No-op when the path is empty and the port isn't 8000.
///
/// Only an exact trailing `:8000` on the authority is rewritten, so a
/// port such as `:80001`, or a `:8000` that appears inside userinfo or
/// an IPv6 literal, is left untouched. For scheme-qualified input
/// everything from the first `/`, `?` or `#` after the authority is
/// dropped. Input without a `scheme://` prefix keeps its tail and only
/// has the port rewritten.
pub fn derive_grpc_url(http_base: &str) -> String {
    const HTTP_PORT_SUFFIX: &str = ":8000";
    const GRPC_PORT_SUFFIX: &str = ":8443";

    let trimmed = http_base.trim_end_matches('/');
    let is_delim = |c: char| matches!(c, '/' | '?' | '#');

    // Split into the authority (scheme + host + port) and whatever follows.
    let (authority, tail) = match trimmed.find("://") {
        Some(scheme_end) => {
            let rest_start = scheme_end + 3;
            let end = trimmed[rest_start..]
                .find(is_delim)
                .map_or(trimmed.len(), |i| rest_start + i);
            // The gRPC endpoint takes no path, so the tail is discarded.
            (&trimmed[..end], "")
        }
        None => {
            let end = trimmed.find(is_delim).unwrap_or(trimmed.len());
            trimmed.split_at(end)
        }
    };

    match authority.strip_suffix(HTTP_PORT_SUFFIX) {
        Some(host) => format!("{}{}{}", host, GRPC_PORT_SUFFIX, tail),
        None => format!("{}{}", authority, tail),
    }
}

/// Attach a bearer token to a tonic request as gRPC metadata.
///
/// Does nothing when `api_key` is empty. If the resulting
/// `Bearer <key>` string cannot be encoded as a metadata value, the
/// request is left without an `authorization` entry and a warning is
/// logged (the key itself is never logged).
pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
    if api_key.is_empty() {
        return;
    }
    match format!("Bearer {}", api_key).parse() {
        Ok(val) => {
            req.metadata_mut().insert("authorization", val);
        }
        Err(_) => {
            log::warn!(target: "grpc",
                "api key is not a valid metadata value; sending request without authorization");
        }
    }
}

/// Call the server's `OpenSession` RPC and return the response.
///
/// Opens a fresh connection to `base_url` for this call.
///
/// # Errors
///
/// Fails if the connection cannot be established or the RPC returns an
/// error status.
// NOTE(review): response type name assumed from tonic codegen
// conventions — confirm against the salience.v1 proto.
pub async fn open_session(
    base_url: &str,
    api_key: &str,
    model: &str,
) -> Result<pb::OpenSessionResponse> {
    let mut client = connect(base_url).await?;
    let mut req = tonic::Request::new(pb::OpenSessionRequest {
        model: model.to_string(),
    });
    with_auth(&mut req, api_key);
    let resp = client
        .open_session(req)
        .await
        .context("OpenSession RPC failed")?;
    Ok(resp.into_inner())
}

/// Call the server's `CloseSession` RPC. Idempotent on the server.
///
/// Opens a fresh connection to `base_url` for this call; the response
/// body is discarded.
///
/// # Errors
///
/// Fails if the connection cannot be established or the RPC returns an
/// error status.
pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
    let mut client = connect(base_url).await?;
    let mut req = tonic::Request::new(pb::CloseSessionRequest {
        session_id: session_id.to_string(),
    });
    with_auth(&mut req, api_key);
    client
        .close_session(req)
        .await
        .context("CloseSession RPC failed")?;
    Ok(())
}

/// Append an image to a session. Server decodes the image, computes N
/// via vLLM's own multimodal pipeline, writes the full vision block
/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into
/// session.tokens, and returns (N, new total length).
/// /// `offset` is the client's view of the session's current token count; /// the server rejects if it diverges from its own (unless /// `truncating=true`, in which case the server slices to `offset` /// first — but never through a vision block). pub async fn append_image( base_url: &str, api_key: &str, session_id: &str, data: Vec, mime: String, offset: u32, truncating: bool, ) -> Result { let mut client = connect(base_url).await?; let mut req = tonic::Request::new(pb::AppendImageRequest { session_id: session_id.to_string(), data, mime, offset, truncating, }); with_auth(&mut req, api_key); let resp = client .append_image(req) .await .with_context(|| "AppendImage RPC failed")?; Ok(resp.into_inner()) } /// Handle to a server-side session. Carries the id + connection params /// so subsequent per-session RPCs (AppendImage, Generate, ForkSession) /// can be issued without the caller juggling base_url / api_key each /// time. `committed_len` tracks the server's current session.tokens /// length so the client can submit deltas with the right `offset`. pub struct SessionHandle { pub session_id: String, pub max_model_len: u32, pub base_url: String, pub api_key: String, pub committed_len: u32, } impl SessionHandle { pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result { let grpc_url = derive_grpc_url(base_url); log::debug!(target: "grpc", "SessionHandle::open http_base={} -> grpc_url={}", base_url, grpc_url); let resp = open_session(&grpc_url, api_key, model).await?; log::debug!(target: "grpc", "SessionHandle::open session_id={} max_model_len={}", resp.session_id, resp.max_model_len); Ok(Self { session_id: resp.session_id, max_model_len: resp.max_model_len, base_url: grpc_url, api_key: api_key.to_string(), committed_len: 0, }) } pub async fn close(self) -> Result<()> { close_session(&self.base_url, &self.api_key, &self.session_id).await } /// Append an image via the server-side vision block. 
Updates /// `committed_len` from the server's response on success. pub async fn append_image( &mut self, data: Vec, mime: String, truncating: bool, ) -> Result { let resp = append_image( &self.base_url, &self.api_key, &self.session_id, data, mime, self.committed_len, truncating, ) .await?; self.committed_len = resp.total_length; Ok(resp) } /// Open a gRPC Generate stream with the given request. Caller /// iterates the returned stream of GenerateEvents; the handle's /// `committed_len` is advanced on Done based on the Done event's /// `total_tokens` field. pub async fn generate( &self, req: pb::GenerateRequest, ) -> Result> { let mut client = connect(&self.base_url).await?; let mut req = tonic::Request::new(req); with_auth(&mut req, &self.api_key); let resp = client .generate(req) .await .with_context(|| "Generate RPC failed")?; Ok(resp.into_inner()) } } #[cfg(test)] mod tests { use super::*; #[test] fn generated_types_compile() { // Exercise the shape of the new proto types — if build.rs // stops regenerating against the proto, this stops compiling. let _open = pb::OpenSessionRequest { model: "qwen3-vl".into(), }; let _tok = pb::Token { id: 42, position: 0, is_prefill: false, readout: vec![0.1, 0.2, 0.3], logprobs: vec![pb::TokenLogprob { id: 1, logprob: -0.5, }], sampled_logprob: -0.1, has_sampled_logprob: true, }; let _done = pb::GenerateDone { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30, finish_reason: pb::generate_done::FinishReason::Eos as i32, }; let _evt = pb::GenerateEvent { event: Some(pb::generate_event::Event::Done(_done)), }; } #[test] fn derive_grpc_url_cases() { assert_eq!( derive_grpc_url("https://host:8000/v1"), "https://host:8443", ); assert_eq!( derive_grpc_url("https://host:8000/"), "https://host:8443", ); assert_eq!( derive_grpc_url("https://host:9000/v1"), "https://host:9000", ); } }