forked from kent/consciousness
Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug aid: lets us see what the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
279 lines
10 KiB
Rust
279 lines
10 KiB
Rust
// agent/api/salience.rs — gRPC client bindings for salience.v1.
//
// Thin wrapper around the tonic-generated types. Every RPC except
// Generate is unary; Generate is server-streaming. The session
// lifecycle RPCs are wrapped by `SessionHandle`: `SessionHandle::open`
// issues OpenSession and its `Drop` impl fires CloseSession. The handle
// carries the id + connection params so later RPCs can reuse them.
//
// The old bidi Session() API is gone — see git history for its shape.
|
|
|
|
#![allow(clippy::enum_variant_names)]
|
|
|
|
use anyhow::{Context, Result};
|
|
use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint};
|
|
|
|
/// Generated prost + tonic types for salience.v1. Call sites use
|
|
/// `pb::OpenSessionRequest`, `pb::Token`, etc.
|
|
pub mod pb {
|
|
tonic::include_proto!("salience.v1");
|
|
}
|
|
|
|
/// The tonic-generated salience client, specialised to a
/// `tonic::transport::Channel` transport.
pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;
|
|
|
|
/// Open a TLS-aware gRPC channel to the salience server. `base_url`
|
|
/// looks like `https://host:8443`. User-provided CA certs under
|
|
/// `~/.consciousness/certs/` are trusted in addition to the system
|
|
/// roots (for self-signed server certs).
|
|
///
|
|
/// Returns the raw `Channel` so callers (`ApiClient::salience_client`)
|
|
/// can cache it and clone a `SalienceClient` per request without
|
|
/// reopening the TCP/TLS connection. tonic multiplexes RPCs over the
|
|
/// shared channel automatically.
|
|
pub async fn connect_channel(base_url: &str) -> Result<Channel> {
|
|
let mut endpoint = Endpoint::from_shared(base_url.to_string())
|
|
.with_context(|| format!("invalid salience endpoint: {}", base_url))?
|
|
.connect_timeout(std::time::Duration::from_secs(30))
|
|
.timeout(std::time::Duration::from_secs(600));
|
|
|
|
if base_url.starts_with("https://") {
|
|
let user_certs = super::http::load_user_certs_pem_bytes();
|
|
let mut tls = ClientTlsConfig::new().with_native_roots();
|
|
if !user_certs.is_empty() {
|
|
tls = tls.ca_certificate(Certificate::from_pem(user_certs));
|
|
}
|
|
endpoint = endpoint
|
|
.tls_config(tls)
|
|
.with_context(|| "configuring tonic TLS")?;
|
|
}
|
|
|
|
endpoint
|
|
.connect()
|
|
.await
|
|
.with_context(|| format!("failed to connect to salience server at {}", base_url))
|
|
}
|
|
|
|
/// Derive the gRPC base URL from the HTTP completions base URL.
///
/// vLLM's salience gRPC server listens on a different port (8443) from
/// the HTTP endpoint (8000) and accepts no path component. Given an
/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
///
/// Only the port at the very end of the authority is rewritten, so a
/// `:8000` that happens to appear elsewhere — inside an IPv6 literal
/// (`[2001:8000::1]`) or in userinfo (`user:8000@host`) — is left alone.
/// When a scheme is present, everything after the authority (path, query,
/// fragment) is dropped. Without a scheme nothing is dropped and only the
/// port in front of the first `/`, `?` or `#` is considered.
///
/// The port is left unchanged when it isn't 8000.
pub fn derive_grpc_url(http_base: &str) -> String {
    const HTTP_PORT_SUFFIX: &str = ":8000";
    const GRPC_PORT_SUFFIX: &str = ":8443";

    let trimmed = http_base.trim_end_matches('/');

    // Locate the authority: it starts right after "scheme://" (or at 0 when
    // there is no scheme) and runs up to the first path/query/fragment
    // delimiter.
    let scheme_end = trimmed.find("://").map(|i| i + 3);
    let authority_start = scheme_end.unwrap_or(0);
    let authority_end = trimmed[authority_start..]
        .find(|c: char| matches!(c, '/' | '?' | '#'))
        .map_or(trimmed.len(), |rel| authority_start + rel);

    let (origin, tail) = trimmed.split_at(authority_end);
    // The gRPC server takes no path, so the tail is only kept for
    // scheme-less input, where the original code never stripped it either.
    let tail = if scheme_end.is_some() { "" } else { tail };

    match origin.strip_suffix(HTTP_PORT_SUFFIX) {
        Some(host) => format!("{}{}{}", host, GRPC_PORT_SUFFIX, tail),
        None => format!("{}{}", origin, tail),
    }
}
|
|
|
|
/// Attach a bearer token to a tonic request as gRPC metadata.
|
|
pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
|
|
if api_key.is_empty() {
|
|
return;
|
|
}
|
|
let bearer = format!("Bearer {}", api_key);
|
|
if let Ok(val) = bearer.parse() {
|
|
req.metadata_mut().insert("authorization", val);
|
|
}
|
|
}
|
|
|
|
/// Handle to a server-side session. Carries the id + an `ApiClient`
|
|
/// clone (which holds the shared tonic Channel) so subsequent
|
|
/// per-session RPCs go over the process-global connection.
|
|
/// `committed_len` tracks the server's current session.tokens length
|
|
/// so the client can submit deltas with the right `offset`.
|
|
pub struct SessionHandle {
|
|
pub session_id: String,
|
|
pub max_model_len: u32,
|
|
pub committed_len: u32,
|
|
client: super::ApiClient,
|
|
}
|
|
|
|
impl SessionHandle {
|
|
pub async fn open(client: &super::ApiClient) -> Result<Self> {
|
|
let t0 = std::time::Instant::now();
|
|
log::debug!(target: "grpc", "OpenSession rpc: start");
|
|
let mut c = client.salience_client().await?;
|
|
let mut req = tonic::Request::new(pb::OpenSessionRequest {
|
|
model: client.model.clone(),
|
|
});
|
|
with_auth(&mut req, client.api_key());
|
|
let resp = c
|
|
.open_session(req)
|
|
.await
|
|
.with_context(|| "OpenSession RPC failed")?
|
|
.into_inner();
|
|
log::debug!(target: "grpc",
|
|
"OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}",
|
|
resp.session_id, resp.max_model_len, t0.elapsed());
|
|
Ok(Self {
|
|
session_id: resp.session_id,
|
|
max_model_len: resp.max_model_len,
|
|
committed_len: 0,
|
|
client: client.clone(),
|
|
})
|
|
}
|
|
|
|
pub fn client(&self) -> &super::ApiClient { &self.client }
|
|
|
|
/// Debug-only: fetch the server's full session.tokens. Used to
|
|
/// verify client-side accounting byte-for-byte when divergence
|
|
/// is suspected. Not cheap on large sessions.
|
|
pub async fn dump_tokens(&self) -> Result<Vec<u32>> {
|
|
let mut c = self.client.salience_client().await?;
|
|
let mut req = tonic::Request::new(pb::DumpSessionRequest {
|
|
session_id: self.session_id.clone(),
|
|
});
|
|
with_auth(&mut req, self.client.api_key());
|
|
let resp = c
|
|
.dump_session(req)
|
|
.await
|
|
.with_context(|| "DumpSession RPC failed")?
|
|
.into_inner();
|
|
Ok(resp.tokens)
|
|
}
|
|
|
|
/// Open a gRPC Generate stream with the given request. Caller
|
|
/// iterates the returned stream of GenerateEvents; the handle's
|
|
/// `committed_len` should be advanced by the caller on Done based
|
|
/// on the Done event's `total_tokens` field.
|
|
pub async fn generate(
|
|
&self,
|
|
req: pb::GenerateRequest,
|
|
) -> Result<tonic::Streaming<pb::GenerateEvent>> {
|
|
let t0 = std::time::Instant::now();
|
|
log::debug!(target: "grpc",
|
|
"Generate rpc: open-stream session={} offset={} append={} max_tokens={}",
|
|
self.session_id, req.offset, req.append_tokens.len(), req.max_tokens);
|
|
let mut c = self.client.salience_client().await?;
|
|
let mut req = tonic::Request::new(req);
|
|
with_auth(&mut req, self.client.api_key());
|
|
let resp = c
|
|
.generate(req)
|
|
.await
|
|
.with_context(|| "Generate RPC failed")?;
|
|
log::debug!(target: "grpc",
|
|
"Generate rpc: stream opened session={} open-latency={:?}",
|
|
self.session_id, t0.elapsed());
|
|
Ok(resp.into_inner())
|
|
}
|
|
|
|
/// Run a prefill-only Generate (max_tokens=0) that appends the
|
|
/// given tokens to the session. No decode, no Token events — the
|
|
/// server just extends session.tokens and runs prefill to warm
|
|
/// the KV cache. Used to interleave text runs between AppendImage
|
|
/// calls, and by score paths that want prompt_logprobs without a
|
|
/// decode step.
|
|
pub async fn prefill_only(&mut self, tokens: Vec<u32>) -> Result<()> {
|
|
use futures::StreamExt;
|
|
let req = pb::GenerateRequest {
|
|
session_id: self.session_id.clone(),
|
|
append_tokens: tokens,
|
|
offset: self.committed_len,
|
|
truncating: false,
|
|
max_tokens: 0,
|
|
logprobs_ranges: Vec::new(),
|
|
logprob_top_k: 0,
|
|
readout_ranges: Vec::new(),
|
|
temperature: 0.0,
|
|
top_p: 0.0,
|
|
top_k: 0,
|
|
stop_token_ids: Vec::new(),
|
|
priority: 0,
|
|
images: Vec::new(),
|
|
};
|
|
let mut stream = self.generate(req).await?;
|
|
while let Some(event) = stream.next().await {
|
|
let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?;
|
|
if let Some(pb::generate_event::Event::Done(d)) = event.event {
|
|
self.committed_len = d.total_tokens;
|
|
}
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Drop → fire CloseSession in a detached task so servers don't leak
|
|
/// sessions until TTL eviction. Best-effort: if no tokio runtime is
|
|
/// available we skip; the server's 30min TTL will reap it eventually.
|
|
impl Drop for SessionHandle {
|
|
fn drop(&mut self) {
|
|
if self.session_id.is_empty() {
|
|
return;
|
|
}
|
|
let session_id = std::mem::take(&mut self.session_id);
|
|
let client = self.client.clone();
|
|
let Ok(rt) = tokio::runtime::Handle::try_current() else {
|
|
log::debug!(target: "grpc",
|
|
"SessionHandle drop outside tokio runtime, session {} leaks to TTL",
|
|
session_id);
|
|
return;
|
|
};
|
|
rt.spawn(async move {
|
|
let Ok(mut c) = client.salience_client().await else { return };
|
|
let mut req = tonic::Request::new(pb::CloseSessionRequest {
|
|
session_id: session_id.clone(),
|
|
});
|
|
with_auth(&mut req, client.api_key());
|
|
if let Err(e) = c.close_session(req).await {
|
|
log::debug!(target: "grpc",
|
|
"CloseSession on drop failed for {}: {:#}",
|
|
session_id, e);
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time canary for the generated proto types: if build.rs
    /// stops regenerating against the proto, the struct literals below
    /// stop compiling.
    #[test]
    fn generated_types_compile() {
        let _open = pb::OpenSessionRequest {
            model: "qwen3-vl".into(),
        };

        let top_logprob = pb::TokenLogprob {
            id: 1,
            logprob: -0.5,
        };
        let _tok = pb::Token {
            id: 42,
            position: 0,
            is_prefill: false,
            readout: vec![0.1, 0.2, 0.3],
            logprobs: vec![top_logprob],
            sampled_logprob: -0.1,
            has_sampled_logprob: true,
        };

        let done = pb::GenerateDone {
            prompt_tokens: 10,
            completion_tokens: 20,
            total_tokens: 30,
            finish_reason: pb::generate_done::FinishReason::Eos as i32,
        };
        let _evt = pb::GenerateEvent {
            event: Some(pb::generate_event::Event::Done(done)),
        };
    }

    /// Table of (HTTP base, expected gRPC base) pairs.
    #[test]
    fn derive_grpc_url_cases() {
        let cases = [
            ("https://host:8000/v1", "https://host:8443"),
            ("https://host:8000/", "https://host:8443"),
            ("https://host:9000/v1", "https://host:9000"),
        ];
        for &(http_base, expected) in cases.iter() {
            assert_eq!(derive_grpc_url(http_base), expected);
        }
    }
}
|