consciousness/src/agent/api/salience.rs

// agent/api/salience.rs — gRPC client bindings for salience.v1.
//
// Thin wrapper around the tonic-generated types. Every RPC except
// Generate is unary; Generate is server-streaming. Free functions
// (open/close session) wrap the lifecycle RPCs; `SessionHandle` just
// carries the id + connection params so later RPCs can reuse them.
//
// The old bidi Session() API is gone — see git history for its shape.

// NOTE(review): no rationale is recorded for this allow. Presumably it
// silences clippy on the prost-generated enums pulled in by `pb` below —
// confirm, and narrow it to that module if so.
#![allow(clippy::enum_variant_names)]

use anyhow::{Context, Result};
use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint};

/// Generated prost + tonic types for salience.v1. Call sites use
/// `pb::OpenSessionRequest`, `pb::Token`, etc.
///
/// Nothing in this module is hand-written: its entire contents come from
/// the `tonic::include_proto!` expansion below.
pub mod pb {
    // Pulls in the code generated for the `salience.v1` proto package.
    tonic::include_proto!("salience.v1");
}

/// The generated salience service client, fixed to tonic's `Channel`
/// transport. This is the client type that `connect` hands back.
pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;

/// Build a gRPC channel to the salience server and wrap it in a client.
///
/// `base_url` is expected to look like `https://host:8443`. TLS is only
/// configured when the URL starts with `https://`; in that case the
/// native (system) roots are trusted, plus whatever PEM bytes
/// `super::http::load_user_certs_pem_bytes()` returns (user-provided CA
/// certs under `~/.consciousness/certs/`, for self-signed servers).
///
/// The endpoint uses a 30 s connect timeout and a 600 s request timeout.
/// Every call establishes a fresh channel; nothing is cached here.
///
/// # Errors
///
/// Fails if `base_url` is not a valid endpoint, if the TLS configuration
/// is rejected, or if the connection cannot be established.
pub async fn connect(base_url: &str) -> Result<SalienceClient> {
    use std::time::Duration;

    let plain = Endpoint::from_shared(base_url.to_string())
        .with_context(|| format!("invalid salience endpoint: {}", base_url))?
        .connect_timeout(Duration::from_secs(30))
        .timeout(Duration::from_secs(600));

    let endpoint = if base_url.starts_with("https://") {
        let extra_roots = super::http::load_user_certs_pem_bytes();
        let native_only = ClientTlsConfig::new().with_native_roots();
        // Only add a CA bundle when the user actually supplied one.
        let tls = if extra_roots.is_empty() {
            native_only
        } else {
            native_only.ca_certificate(Certificate::from_pem(extra_roots))
        };
        plain.tls_config(tls).context("configuring tonic TLS")?
    } else {
        plain
    };

    let channel = endpoint
        .connect()
        .await
        .with_context(|| format!("failed to connect to salience server at {}", base_url))?;

    Ok(pb::salience_client::SalienceClient::new(channel))
}

/// Derive the gRPC base URL from the HTTP completions base URL.
///
/// vLLM's salience gRPC server listens on a different port (8443) from
/// the HTTP endpoint (8000) and accepts no path component. Given an
/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
///
/// Rules, applied in order:
///
/// 1. Trailing `/` characters are stripped.
/// 2. If the input has a `scheme://` prefix, everything after the
///    authority (path, query, fragment) is dropped.
/// 3. If the authority ends in exactly `:8000`, that port becomes `:8443`.
///
/// Only the trailing port is rewritten. A `:8000` that merely appears
/// inside the authority (a longer port such as `:80000`, userinfo such as
/// `user:8000@host`, or an IPv6 literal such as `[fe80::8000]`) is left
/// alone. Inputs without a `scheme://` prefix keep their path, and only
/// the port in front of the first `/` is considered.
///
/// No-op when the path is empty and the port isn't 8000.
pub fn derive_grpc_url(http_base: &str) -> String {
    const HTTP_PORT: &str = ":8000";
    const GRPC_PORT: &str = ":8443";

    let trimmed = http_base.trim_end_matches('/');

    // Split into the part that may carry the port (`authority`) and
    // whatever has to be re-attached afterwards (`tail`).
    let (authority, tail) = match trimmed.find("://") {
        Some(scheme_end) => {
            let host_start = scheme_end + 3;
            let end = trimmed[host_start..]
                .find(|c: char| matches!(c, '/' | '?' | '#'))
                .map_or(trimmed.len(), |i| host_start + i);
            // The gRPC endpoint takes no path, so the remainder is dropped.
            (&trimmed[..end], "")
        }
        // No scheme: keep the path as-is, only look at the leading host:port.
        None => trimmed.split_at(trimmed.find('/').unwrap_or(trimmed.len())),
    };

    match authority.strip_suffix(HTTP_PORT) {
        Some(host) => format!("{}{}{}", host, GRPC_PORT, tail),
        None => format!("{}{}", authority, tail),
    }
}

/// Attach a bearer token to a tonic request as gRPC metadata.
pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
    if api_key.is_empty() {
        return;
    }
    let bearer = format!("Bearer {}", api_key);
    if let Ok(val) = bearer.parse() {
        req.metadata_mut().insert("authorization", val);
    }
}

/// Issue the `OpenSession` RPC for `model` and hand back the raw response.
///
/// A new channel is opened to `base_url` for this single call, and the
/// request carries `api_key` as a bearer token when it is non-empty.
///
/// # Errors
///
/// Fails if the connection cannot be established or the RPC returns an
/// error status.
pub async fn open_session(
    base_url: &str,
    api_key: &str,
    model: &str,
) -> Result<pb::OpenSessionResponse> {
    let mut client = connect(base_url).await?;

    let payload = pb::OpenSessionRequest {
        model: model.to_string(),
    };
    let mut request = tonic::Request::new(payload);
    with_auth(&mut request, api_key);

    client
        .open_session(request)
        .await
        .map(tonic::Response::into_inner)
        .context("OpenSession RPC failed")
}

/// Issue the `CloseSession` RPC for `session_id`, discarding the response
/// body. The server treats this call as idempotent.
///
/// A new channel is opened to `base_url` for this single call, and the
/// request carries `api_key` as a bearer token when it is non-empty.
///
/// # Errors
///
/// Fails if the connection cannot be established or the RPC returns an
/// error status.
pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
    let mut client = connect(base_url).await?;

    let payload = pb::CloseSessionRequest {
        session_id: session_id.to_string(),
    };
    let mut request = tonic::Request::new(payload);
    with_auth(&mut request, api_key);

    client
        .close_session(request)
        .await
        .map(|_| ())
        .context("CloseSession RPC failed")
}

/// Issue the `AppendImage` RPC, adding one image to an existing session.
///
/// Per the server contract, the image is decoded server-side, N is
/// computed through vLLM's own multimodal pipeline, and the complete
/// vision block (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) is
/// written into session.tokens. The response reports (N, new total
/// length).
///
/// * `data` / `mime` — the encoded image bytes and their MIME type.
/// * `offset` — the client's view of the session's current token count.
///   The server rejects the call when this disagrees with its own count.
/// * `truncating` — when `true`, the server first slices the session to
///   `offset` instead of rejecting, but never through a vision block.
///
/// A new channel is opened to `base_url` for this single call.
///
/// # Errors
///
/// Fails if the connection cannot be established or the RPC returns an
/// error status.
pub async fn append_image(
    base_url: &str,
    api_key: &str,
    session_id: &str,
    data: Vec<u8>,
    mime: String,
    offset: u32,
    truncating: bool,
) -> Result<pb::AppendImageResponse> {
    let mut client = connect(base_url).await?;

    let payload = pb::AppendImageRequest {
        session_id: session_id.to_string(),
        data,
        mime,
        offset,
        truncating,
    };
    let mut request = tonic::Request::new(payload);
    with_auth(&mut request, api_key);

    client
        .append_image(request)
        .await
        .map(tonic::Response::into_inner)
        .context("AppendImage RPC failed")
}

/// Handle to a server-side session. Carries the id + connection params
/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession)
/// can be issued without the caller juggling base_url / api_key each
/// time.
///
/// The handle holds no live connection; it is plain data. Dropping it
/// does not close the session on the server — call `close` for that.
pub struct SessionHandle {
    /// Session identifier, as returned by the `OpenSession` RPC.
    pub session_id: String,
    /// `max_model_len` as reported in the `OpenSession` response.
    // NOTE(review): presumably the model's context limit in tokens — confirm
    // against the proto.
    pub max_model_len: u32,
    /// gRPC base URL used for every RPC on this session. When the handle
    /// comes from `open`, this is the already-derived gRPC URL, not the
    /// HTTP base the caller passed in.
    pub base_url: String,
    /// API key sent as a bearer token on each RPC (skipped when empty).
    pub api_key: String,
}

impl SessionHandle {
    /// Open a new server-side session for `model`.
    ///
    /// `base_url` is the *HTTP* completions base; the gRPC URL is derived
    /// from it with `derive_grpc_url` and stored in the returned handle.
    /// Both the URL mapping and the resulting session are logged at debug
    /// level on the `grpc` target.
    ///
    /// # Errors
    ///
    /// Propagates any failure from the `OpenSession` RPC.
    pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result<Self> {
        let grpc_url = derive_grpc_url(base_url);
        log::debug!(target: "grpc",
            "SessionHandle::open http_base={} -> grpc_url={}",
            base_url, grpc_url);

        let pb::OpenSessionResponse {
            session_id,
            max_model_len,
            ..
        } = open_session(&grpc_url, api_key, model).await?;
        log::debug!(target: "grpc",
            "SessionHandle::open session_id={} max_model_len={}",
            session_id, max_model_len);

        Ok(Self {
            session_id,
            max_model_len,
            base_url: grpc_url,
            api_key: api_key.to_string(),
        })
    }

    /// Close the session on the server, consuming the handle.
    ///
    /// # Errors
    ///
    /// Propagates any failure from the `CloseSession` RPC.
    pub async fn close(self) -> Result<()> {
        let Self {
            session_id,
            base_url,
            api_key,
            ..
        } = self;
        close_session(&base_url, &api_key, &session_id).await
    }

    /// Append an image to this session. Thin delegate to the free
    /// `append_image` function, which documents `offset` / `truncating`.
    pub async fn append_image(
        &self,
        data: Vec<u8>,
        mime: String,
        offset: u32,
        truncating: bool,
    ) -> Result<pb::AppendImageResponse> {
        let Self {
            session_id,
            base_url,
            api_key,
            ..
        } = self;
        append_image(base_url, api_key, session_id, data, mime, offset, truncating).await
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time canary for the generated proto types: the values are
    /// never inspected, the test only has to build. If build.rs stops
    /// regenerating against the proto, this stops compiling.
    #[test]
    fn generated_types_compile() {
        let _open = pb::OpenSessionRequest {
            model: String::from("qwen3-vl"),
        };

        let alternative = pb::TokenLogprob {
            id: 1,
            logprob: -0.5,
        };
        let _tok = pb::Token {
            id: 42,
            position: 0,
            is_prefill: false,
            readout: vec![0.1, 0.2, 0.3],
            logprobs: vec![alternative],
            sampled_logprob: -0.1,
            has_sampled_logprob: true,
        };

        let done = pb::GenerateDone {
            prompt_tokens: 10,
            completion_tokens: 20,
            total_tokens: 30,
            finish_reason: pb::generate_done::FinishReason::Eos as i32,
        };
        let _evt = pb::GenerateEvent {
            event: Some(pb::generate_event::Event::Done(done)),
        };
    }

    /// HTTP base -> gRPC base, one row per documented shape.
    #[test]
    fn derive_grpc_url_cases() {
        let cases = [
            ("https://host:8000/v1", "https://host:8443"),
            ("https://host:8000/", "https://host:8443"),
            ("https://host:9000/v1", "https://host:9000"),
        ];
        for &(http_base, expected) in &cases {
            assert_eq!(derive_grpc_url(http_base), expected);
        }
    }
}
-												salience: add gRPC client + TLS plumbing for stateful vllm sessions

Adds the client-side of a stateful gRPC protocol against vllm, plus
the TLS trust machinery so we can talk to self-signed vllm servers.

Protocol (proto/salience.proto):
  Bidi-streaming Session RPC carries OpenSession / AppendTokens /
  Generate / Cancel from client and SessionReady / PrefillProgress /
  Token / GenerateDone / Error from server. Separate Fork unary RPC
  for cheap branching (prefix cache shares KV automatically). Plus
  ListSessions, CloseSession, GetReadoutManifest admin RPCs.

  Per-token readouts ship as packed f32 ([n_layers * n_concepts] per
  token, flat). Logprobs use range-selected positions plus a top-k
  parameter: an empty range list means no logprobs; any range means
  the sampled-token logprob is emitted at those positions; top_k > 0
  adds alternatives.

Client (src/agent/api/salience.rs):
  Tonic-generated types under pb::, a connect() helper, with_auth()
  for bearer metadata, and a Session handle wrapping the bidi stream:
  open() handshakes SessionReady; append() is fire-and-forget;
  generate() returns impl Stream<Item = Event> that drains inbound
  until Done or terminating Error. One generate at a time per session.

Peak picker (src/agent/salience.rs):
  Pure function over ReadoutEntry traces. Per-concept z-score against
  trace global stats; contiguous above-threshold regions emit one
  peak at the local max. Configurable sigma threshold and min-std
  safety floor. Deterministic tie-break on offset then concept name.
  12 unit tests covering empty traces, flat channels, single/multi
  spikes, contiguous humps, multi-concept independence, trailing
  runs, sub-threshold noise, layer-out-of-range, manifest shape
  mismatch, and threshold tunability.

TLS (src/agent/api/http.rs):
  HttpClient::build now also loads every .pem file under
  ~/.consciousness/certs/ into the rustls root store — so dropping
  a <host>.pem in that directory is enough to trust a new self-
  signed server; no code changes per new host. Also installs the
  rustls default crypto provider explicitly via OnceLock: tonic's
  tls features pulled in both ring and aws-lc-rs on the resolver
  path, and rustls 0.23 refuses to auto-pick when either could win.

Build (build.rs, Cargo.toml):
  tonic-build generates Rust types from proto/salience.proto at
  cargo-build time, using a vendored protoc binary
  (protoc-bin-vendored) so no system install is required. New
  runtime deps: tonic, prost, async-stream, tokio-stream,
  rustls-pemfile.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-23 02:21:07 -04:00
+								// agent/api/salience.rs — gRPC client bindings for salience.v1.
 								//
 								// Thin wrapper around the tonic-generated types. Every RPC except
 								// Generate is unary; Generate is server-streaming. Free functions
 								// (open/close session) wrap the lifecycle RPCs; `SessionHandle` just
 								// carries the id + connection params so later RPCs can reuse them.
 								//
 								// The old bidi Session() API is gone — see git history for its shape.
 								#![allow(clippy::enum_variant_names)]
 								use anyhow::{Context, Result};
 								use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint};
 								/// Generated prost + tonic types for salience.v1. Call sites use
 								/// `pb::OpenSessionRequest`, `pb::Token`, etc.
 								pub mod pb {
 								    tonic::include_proto!("salience.v1");
 								}
 								pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;
 								/// Open a TLS-aware gRPC channel to the salience server. `base_url`
 								/// looks like `https://host:8443`. User-provided CA certs under
 								/// `~/.consciousness/certs/` are trusted in addition to the system
 								/// roots (for self-signed server certs).
 								pub async fn connect(base_url: &str) -> Result<SalienceClient> {
 								    let mut endpoint = Endpoint::from_shared(base_url.to_string())
 								        .with_context(|| format!("invalid salience endpoint: {}", base_url))?
 								        .connect_timeout(std::time::Duration::from_secs(30))
 								        .timeout(std::time::Duration::from_secs(600));
 								    if base_url.starts_with("https://") {
 								        let user_certs = super::http::load_user_certs_pem_bytes();
 								        let mut tls = ClientTlsConfig::new().with_native_roots();
 								        if !user_certs.is_empty() {
 								            tls = tls.ca_certificate(Certificate::from_pem(user_certs));
 								        }
 								        endpoint = endpoint
 								            .tls_config(tls)
 								            .with_context(|| "configuring tonic TLS")?;
 								    }
 								    let channel = endpoint
 								        .connect()
 								        .await
 								        .with_context(|| format!("failed to connect to salience server at {}", base_url))?;
 								    Ok(pb::salience_client::SalienceClient::new(channel))
 								}
 								/// Derive the gRPC base URL from the HTTP completions base URL.
 								///
 								/// vLLM's salience gRPC server listens on a different port (8443) from
 								/// the HTTP endpoint (8000) and accepts no path component. Given an
 								/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
 								/// No-op when the path is empty and the port isn't 8000.
 								pub fn derive_grpc_url(http_base: &str) -> String {
 								    let mut url = http_base.trim_end_matches('/').to_string();
 								    if let Some(proto_end) = url.find("://") {
 								        let rest_start = proto_end + 3;
 								        if let Some(path_slash) = url[rest_start..].find('/') {
 								            url.truncate(rest_start + path_slash);
 								        }
 								    }
 								    url.replace(":8000", ":8443")
 								}
 								/// Attach a bearer token to a tonic request as gRPC metadata.
 								pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
 								    if api_key.is_empty() {
 								        return;
 								    }
 								    let bearer = format!("Bearer {}", api_key);
 								    if let Ok(val) = bearer.parse() {
 								        req.metadata_mut().insert("authorization", val);
 								    }
 								}
 								/// Call the server's `OpenSession` RPC and return the response.
 								pub async fn open_session(
 								    base_url: &str,
 								    api_key: &str,
 								    model: &str,
 								) -> Result<pb::OpenSessionResponse> {
 								    let mut client = connect(base_url).await?;
 								    let mut req = tonic::Request::new(pb::OpenSessionRequest {
 								        model: model.to_string(),
 								    });
 								    with_auth(&mut req, api_key);
 								    let resp = client
 								        .open_session(req)
 								        .await
 								        .with_context(|| "OpenSession RPC failed")?;
 								    Ok(resp.into_inner())
 								}
 								/// Call the server's `CloseSession` RPC. Idempotent on the server.
 								pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
 								    let mut client = connect(base_url).await?;
 								    let mut req = tonic::Request::new(pb::CloseSessionRequest {
 								        session_id: session_id.to_string(),
 								    });
 								    with_auth(&mut req, api_key);
 								    client
 								        .close_session(req)
 								        .await
 								        .with_context(|| "CloseSession RPC failed")?;
 								    Ok(())
 								}
 								/// Append an image to a session. Server decodes the image, computes N
 								/// via vLLM's own multimodal pipeline, writes the full vision block
 								/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into
 								/// session.tokens, and returns (N, new total length).
 								///
 								/// `offset` is the client's view of the session's current token count;
 								/// the server rejects if it diverges from its own (unless
 								/// `truncating=true`, in which case the server slices to `offset`
 								/// first — but never through a vision block).
 								pub async fn append_image(
 								    base_url: &str,
 								    api_key: &str,
 								    session_id: &str,
 								    data: Vec<u8>,
 								    mime: String,
 								    offset: u32,
 								    truncating: bool,
 								) -> Result<pb::AppendImageResponse> {
 								    let mut client = connect(base_url).await?;
 								    let mut req = tonic::Request::new(pb::AppendImageRequest {
 								        session_id: session_id.to_string(),
 								        data,
 								        mime,
 								        offset,
 								        truncating,
 								    });
 								    with_auth(&mut req, api_key);
 								    let resp = client
 								        .append_image(req)
 								        .await
 								        .with_context(|| "AppendImage RPC failed")?;
 								    Ok(resp.into_inner())
 								}
 								/// Handle to a server-side session. Carries the id + connection params
 								/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession)
 								/// can be issued without the caller juggling base_url / api_key each
 								/// time.
 								pub struct SessionHandle {
 								    pub session_id: String,
 								    pub max_model_len: u32,
 								    pub base_url: String,
 								    pub api_key: String,
 								}
 								impl SessionHandle {
 								    pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result<Self> {
 								        let grpc_url = derive_grpc_url(base_url);
 								        log::debug!(target: "grpc",
 								            "SessionHandle::open http_base={} -> grpc_url={}",
 								            base_url, grpc_url);
 								        let resp = open_session(&grpc_url, api_key, model).await?;
 								        log::debug!(target: "grpc",
 								            "SessionHandle::open session_id={} max_model_len={}",
 								            resp.session_id, resp.max_model_len);
 								        Ok(Self {
 								            session_id: resp.session_id,
 								            max_model_len: resp.max_model_len,
 								            base_url: grpc_url,
 								            api_key: api_key.to_string(),
 								        })
 								    }
 								    pub async fn close(self) -> Result<()> {
 								        close_session(&self.base_url, &self.api_key, &self.session_id).await
 								    }
 								    /// Append an image via the server-side vision block. See
 								    /// `append_image` free function for full semantics.
 								    pub async fn append_image(
 								        &self,
 								        data: Vec<u8>,
 								        mime: String,
 								        offset: u32,
 								        truncating: bool,
 								    ) -> Result<pb::AppendImageResponse> {
 								        append_image(
 								            &self.base_url,
 								            &self.api_key,
 								            &self.session_id,
 								            data,
 								            mime,
 								            offset,
 								            truncating,
 								        )
 								        .await
 								    }
 								}
 								#[cfg(test)]
 								mod tests {
 								    use super::*;
 								    #[test]
 								    fn generated_types_compile() {
 								        // Exercise the shape of the new proto types — if build.rs
 								        // stops regenerating against the proto, this stops compiling.
 								        let _open = pb::OpenSessionRequest {
 								            model: "qwen3-vl".into(),
 								        };
 								        let _tok = pb::Token {
 								            id: 42,
 								            position: 0,
 								            is_prefill: false,
 								            readout: vec![0.1, 0.2, 0.3],
 								            logprobs: vec![pb::TokenLogprob {
 								                id: 1,
 								                logprob: -0.5,
 								            }],
 								            sampled_logprob: -0.1,
 								            has_sampled_logprob: true,
 								        };
 								        let _done = pb::GenerateDone {
 								            prompt_tokens: 10,
 								            completion_tokens: 20,
 								            total_tokens: 30,
 								            finish_reason: pb::generate_done::FinishReason::Eos as i32,
 								        };
 								        let _evt = pb::GenerateEvent {
 								            event: Some(pb::generate_event::Event::Done(_done)),
 								        };
 								    }
 								    #[test]
 								    fn derive_grpc_url_cases() {
 								        assert_eq!(
 								            derive_grpc_url("https://host:8000/v1"),
 								            "https://host:8443",
 								        );
 								        assert_eq!(
 								            derive_grpc_url("https://host:8000/"),
 								            "https://host:8443",
 								        );
 								        assert_eq!(
 								            derive_grpc_url("https://host:9000/v1"),
 								            "https://host:9000",
 								        );
 								    }
 								}