forked from kent/consciousness
249 lines
8.1 KiB
Rust
249 lines
8.1 KiB
Rust
|
|
// agent/api/salience.rs — gRPC client bindings for salience.v1.
|
|||
|
|
//
|
|||
|
|
// Thin wrapper around the tonic-generated types. Every RPC except
|
|||
|
|
// Generate is unary; Generate is server-streaming. Free functions
|
|||
|
|
// (open/close session) wrap the lifecycle RPCs; `SessionHandle` just
|
|||
|
|
// carries the id + connection params so later RPCs can reuse them.
|
|||
|
|
//
|
|||
|
|
// The old bidi Session() API is gone — see git history for its shape.
|
|||
|
|
|
|||
|
|
#![allow(clippy::enum_variant_names)]
|
|||
|
|
|
|||
|
|
use anyhow::{Context, Result};
|
|||
|
|
use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint};
|
|||
|
|
|
|||
|
|
/// Generated prost + tonic types for salience.v1. Call sites use
|
|||
|
|
/// `pb::OpenSessionRequest`, `pb::Token`, etc.
|
|||
|
|
pub mod pb {
|
|||
|
|
tonic::include_proto!("salience.v1");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// The generated `salience.v1` client stub, specialised to a tonic `Channel`.
pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;
|
|||
|
|
|
|||
|
|
/// Open a TLS-aware gRPC channel to the salience server. `base_url`
|
|||
|
|
/// looks like `https://host:8443`. User-provided CA certs under
|
|||
|
|
/// `~/.consciousness/certs/` are trusted in addition to the system
|
|||
|
|
/// roots (for self-signed server certs).
|
|||
|
|
pub async fn connect(base_url: &str) -> Result<SalienceClient> {
|
|||
|
|
let mut endpoint = Endpoint::from_shared(base_url.to_string())
|
|||
|
|
.with_context(|| format!("invalid salience endpoint: {}", base_url))?
|
|||
|
|
.connect_timeout(std::time::Duration::from_secs(30))
|
|||
|
|
.timeout(std::time::Duration::from_secs(600));
|
|||
|
|
|
|||
|
|
if base_url.starts_with("https://") {
|
|||
|
|
let user_certs = super::http::load_user_certs_pem_bytes();
|
|||
|
|
let mut tls = ClientTlsConfig::new().with_native_roots();
|
|||
|
|
if !user_certs.is_empty() {
|
|||
|
|
tls = tls.ca_certificate(Certificate::from_pem(user_certs));
|
|||
|
|
}
|
|||
|
|
endpoint = endpoint
|
|||
|
|
.tls_config(tls)
|
|||
|
|
.with_context(|| "configuring tonic TLS")?;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
let channel = endpoint
|
|||
|
|
.connect()
|
|||
|
|
.await
|
|||
|
|
.with_context(|| format!("failed to connect to salience server at {}", base_url))?;
|
|||
|
|
Ok(pb::salience_client::SalienceClient::new(channel))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Derive the gRPC base URL from the HTTP completions base URL.
///
/// vLLM's salience gRPC server listens on a different port (8443) from
/// the HTTP endpoint (8000) and accepts no path component. Given an
/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
///
/// Rules:
/// - Trailing `/` characters are trimmed first.
/// - When the input has a `scheme://` prefix, everything after the
///   authority (path, query, fragment) is dropped.
/// - The port is rewritten only when the authority *ends* with `:8000`.
///   Other occurrences of `:8000` (inside an IPv6 literal such as
///   `[fe80::8000:1]`, or in userinfo) are left untouched.
/// - Inputs without `://` keep whatever follows the authority; only the
///   port is swapped.
///
/// Any other port is returned unchanged.
pub fn derive_grpc_url(http_base: &str) -> String {
    const HTTP_PORT: &str = ":8000";
    const GRPC_PORT: &str = ":8443";

    let trimmed = http_base.trim_end_matches('/');

    // The authority starts right after "scheme://", or at 0 when no
    // scheme is present.
    let (authority_start, has_scheme) = match trimmed.find("://") {
        Some(scheme_end) => (scheme_end + 3, true),
        None => (0, false),
    };

    // The authority runs up to the first path, query or fragment delimiter.
    let authority_end = trimmed[authority_start..]
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .map_or(trimmed.len(), |offset| authority_start + offset);

    let (origin, tail) = trimmed.split_at(authority_end);
    // The gRPC server accepts no path, so a proper URL loses its tail.
    let tail = if has_scheme { "" } else { tail };

    // Anchor the rewrite to the end of the authority so that only the
    // port itself can match.
    match origin.strip_suffix(HTTP_PORT) {
        Some(stem) => format!("{}{}{}", stem, GRPC_PORT, tail),
        None => format!("{}{}", origin, tail),
    }
}
|
|||
|
|
|
|||
|
|
/// Attach a bearer token to a tonic request as gRPC metadata.
|
|||
|
|
pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
|
|||
|
|
if api_key.is_empty() {
|
|||
|
|
return;
|
|||
|
|
}
|
|||
|
|
let bearer = format!("Bearer {}", api_key);
|
|||
|
|
if let Ok(val) = bearer.parse() {
|
|||
|
|
req.metadata_mut().insert("authorization", val);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Call the server's `OpenSession` RPC and return the response.
|
|||
|
|
pub async fn open_session(
|
|||
|
|
base_url: &str,
|
|||
|
|
api_key: &str,
|
|||
|
|
model: &str,
|
|||
|
|
) -> Result<pb::OpenSessionResponse> {
|
|||
|
|
let mut client = connect(base_url).await?;
|
|||
|
|
let mut req = tonic::Request::new(pb::OpenSessionRequest {
|
|||
|
|
model: model.to_string(),
|
|||
|
|
});
|
|||
|
|
with_auth(&mut req, api_key);
|
|||
|
|
let resp = client
|
|||
|
|
.open_session(req)
|
|||
|
|
.await
|
|||
|
|
.with_context(|| "OpenSession RPC failed")?;
|
|||
|
|
Ok(resp.into_inner())
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Call the server's `CloseSession` RPC. Idempotent on the server.
|
|||
|
|
pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
|
|||
|
|
let mut client = connect(base_url).await?;
|
|||
|
|
let mut req = tonic::Request::new(pb::CloseSessionRequest {
|
|||
|
|
session_id: session_id.to_string(),
|
|||
|
|
});
|
|||
|
|
with_auth(&mut req, api_key);
|
|||
|
|
client
|
|||
|
|
.close_session(req)
|
|||
|
|
.await
|
|||
|
|
.with_context(|| "CloseSession RPC failed")?;
|
|||
|
|
Ok(())
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Append an image to a session. Server decodes the image, computes N
|
|||
|
|
/// via vLLM's own multimodal pipeline, writes the full vision block
|
|||
|
|
/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into
|
|||
|
|
/// session.tokens, and returns (N, new total length).
|
|||
|
|
///
|
|||
|
|
/// `offset` is the client's view of the session's current token count;
|
|||
|
|
/// the server rejects if it diverges from its own (unless
|
|||
|
|
/// `truncating=true`, in which case the server slices to `offset`
|
|||
|
|
/// first — but never through a vision block).
|
|||
|
|
pub async fn append_image(
|
|||
|
|
base_url: &str,
|
|||
|
|
api_key: &str,
|
|||
|
|
session_id: &str,
|
|||
|
|
data: Vec<u8>,
|
|||
|
|
mime: String,
|
|||
|
|
offset: u32,
|
|||
|
|
truncating: bool,
|
|||
|
|
) -> Result<pb::AppendImageResponse> {
|
|||
|
|
let mut client = connect(base_url).await?;
|
|||
|
|
let mut req = tonic::Request::new(pb::AppendImageRequest {
|
|||
|
|
session_id: session_id.to_string(),
|
|||
|
|
data,
|
|||
|
|
mime,
|
|||
|
|
offset,
|
|||
|
|
truncating,
|
|||
|
|
});
|
|||
|
|
with_auth(&mut req, api_key);
|
|||
|
|
let resp = client
|
|||
|
|
.append_image(req)
|
|||
|
|
.await
|
|||
|
|
.with_context(|| "AppendImage RPC failed")?;
|
|||
|
|
Ok(resp.into_inner())
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Client-side handle for one server-side session.
///
/// Bundles the session id with the connection parameters so that
/// per-session RPCs (AppendImage, Generate, ForkSession) can be issued
/// without the caller passing base_url / api_key every time.
pub struct SessionHandle {
    /// Session id taken from the `OpenSession` response.
    pub session_id: String,
    /// `max_model_len` value reported in the `OpenSession` response.
    pub max_model_len: u32,
    /// gRPC base URL used for every RPC on this session.
    pub base_url: String,
    /// Bearer token attached to each RPC; an empty string sends none.
    pub api_key: String,
}
|
|||
|
|
|
|||
|
|
impl SessionHandle {
|
|||
|
|
pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result<Self> {
|
|||
|
|
let grpc_url = derive_grpc_url(base_url);
|
|||
|
|
log::debug!(target: "grpc",
|
|||
|
|
"SessionHandle::open http_base={} -> grpc_url={}",
|
|||
|
|
base_url, grpc_url);
|
|||
|
|
let resp = open_session(&grpc_url, api_key, model).await?;
|
|||
|
|
log::debug!(target: "grpc",
|
|||
|
|
"SessionHandle::open session_id={} max_model_len={}",
|
|||
|
|
resp.session_id, resp.max_model_len);
|
|||
|
|
Ok(Self {
|
|||
|
|
session_id: resp.session_id,
|
|||
|
|
max_model_len: resp.max_model_len,
|
|||
|
|
base_url: grpc_url,
|
|||
|
|
api_key: api_key.to_string(),
|
|||
|
|
})
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
pub async fn close(self) -> Result<()> {
|
|||
|
|
close_session(&self.base_url, &self.api_key, &self.session_id).await
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/// Append an image via the server-side vision block. See
|
|||
|
|
/// `append_image` free function for full semantics.
|
|||
|
|
pub async fn append_image(
|
|||
|
|
&self,
|
|||
|
|
data: Vec<u8>,
|
|||
|
|
mime: String,
|
|||
|
|
offset: u32,
|
|||
|
|
truncating: bool,
|
|||
|
|
) -> Result<pb::AppendImageResponse> {
|
|||
|
|
append_image(
|
|||
|
|
&self.base_url,
|
|||
|
|
&self.api_key,
|
|||
|
|
&self.session_id,
|
|||
|
|
data,
|
|||
|
|
mime,
|
|||
|
|
offset,
|
|||
|
|
truncating,
|
|||
|
|
)
|
|||
|
|
.await
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time guard: builds one value of each generated message
    /// type so that a drift between build.rs output and the proto
    /// breaks compilation of this test.
    #[test]
    fn generated_types_compile() {
        let _open = pb::OpenSessionRequest {
            model: "qwen3-vl".into(),
        };

        let sample_logprob = pb::TokenLogprob {
            id: 1,
            logprob: -0.5,
        };
        let _tok = pb::Token {
            id: 42,
            position: 0,
            is_prefill: false,
            readout: vec![0.1, 0.2, 0.3],
            logprobs: vec![sample_logprob],
            sampled_logprob: -0.1,
            has_sampled_logprob: true,
        };

        let done = pb::GenerateDone {
            prompt_tokens: 10,
            completion_tokens: 20,
            total_tokens: 30,
            finish_reason: pb::generate_done::FinishReason::Eos as i32,
        };
        let _evt = pb::GenerateEvent {
            event: Some(pb::generate_event::Event::Done(done)),
        };
    }

    /// Table of (HTTP base, expected gRPC URL) pairs.
    #[test]
    fn derive_grpc_url_cases() {
        let cases = [
            ("https://host:8000/v1", "https://host:8443"),
            ("https://host:8000/", "https://host:8443"),
            ("https://host:9000/v1", "https://host:9000"),
        ];
        for &(http_base, expected) in cases.iter() {
            assert_eq!(derive_grpc_url(http_base), expected);
        }
    }
}
|