consciousness/src/agent/api/salience.rs

// agent/api/salience.rs — gRPC client bindings for salience.v1.
//
// Thin wrapper around the tonic-generated types. Every RPC except
// Generate is unary; Generate is server-streaming. `SessionHandle`
// wraps the session lifecycle (`SessionHandle::open` issues OpenSession,
// dropping the handle fires CloseSession) and carries the session id
// plus an `ApiClient` clone so later RPCs can reuse the connection.
//
// The old bidi Session() API is gone — see git history for its shape.

#![allow(clippy::enum_variant_names)]

use anyhow::{Context, Result};
use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint};

/// Generated prost + tonic types for salience.v1. Call sites use
/// `pb::OpenSessionRequest`, `pb::Token`, etc.
///
/// Nothing in this module is hand-written: the body is whatever
/// `tonic::include_proto!` expands to for the `salience.v1` package.
pub mod pb {
    // Splice in the generated code for the `salience.v1` proto package.
    tonic::include_proto!("salience.v1");
}

/// The generated salience client, specialised to a tonic transport
/// [`Channel`] so call sites don't have to spell out the generic.
pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;

/// Build and connect a gRPC [`Channel`] to the salience server at
/// `base_url` (e.g. `https://host:8443`).
///
/// TLS is configured only for `https://` URLs: the native root store
/// is trusted, plus any user-provided CA certs returned by
/// `load_user_certs_pem_bytes` (certs under `~/.consciousness/certs/`,
/// for self-signed server certs). Other schemes connect without TLS
/// configuration.
///
/// The raw `Channel` is returned so callers
/// (`ApiClient::salience_client`) can cache it and clone a
/// `SalienceClient` per request instead of reopening the TCP/TLS
/// connection; tonic multiplexes RPCs over the shared channel.
///
/// # Errors
/// Fails when `base_url` is not a valid endpoint, when the TLS
/// configuration is rejected, or when the connection attempt fails.
pub async fn connect_channel(base_url: &str) -> Result<Channel> {
    use std::time::Duration;

    // 30s to establish the connection, 600s per-request timeout.
    let plain = Endpoint::from_shared(base_url.to_string())
        .with_context(|| format!("invalid salience endpoint: {}", base_url))?
        .connect_timeout(Duration::from_secs(30))
        .timeout(Duration::from_secs(600));

    let endpoint = if base_url.starts_with("https://") {
        let extra_roots = super::http::load_user_certs_pem_bytes();
        let native = ClientTlsConfig::new().with_native_roots();
        // Only add a CA bundle when the user actually supplied one.
        let tls = if extra_roots.is_empty() {
            native
        } else {
            native.ca_certificate(Certificate::from_pem(extra_roots))
        };
        plain.tls_config(tls).context("configuring tonic TLS")?
    } else {
        plain
    };

    let channel = endpoint.connect().await;
    channel.with_context(|| format!("failed to connect to salience server at {}", base_url))
}

/// Derive the gRPC base URL from the HTTP completions base URL.
///
/// vLLM's salience gRPC server listens on a different port (8443) from
/// the HTTP endpoint (8000) and accepts no path component. Given an
/// HTTP base like `https://host:8000/v1`, produce `https://host:8443`.
/// No-op when the path is empty and the port isn't 8000.
///
/// Only an authority whose port is *exactly* `8000` is rewritten:
/// `:80001`, an IPv6 literal such as `[fe80::8000]`, or `:8000`
/// appearing inside userinfo are left untouched.
pub fn derive_grpc_url(http_base: &str) -> String {
    let trimmed = http_base.trim_end_matches('/');

    // The authority starts right after "scheme://" (or at 0 when the
    // input carries no scheme) and runs up to the first '/'.
    let authority_start = trimmed.find("://").map_or(0, |scheme_end| scheme_end + 3);
    let origin = match trimmed[authority_start..].find('/') {
        Some(path_slash) => &trimmed[..authority_start + path_slash],
        None => trimmed,
    };

    // Swap the port only when it is the trailing component of the
    // authority; a blanket string replace would also hit ":80001" etc.
    match origin.strip_suffix(":8000") {
        Some(host_part) => format!("{host_part}:8443"),
        None => origin.to_string(),
    }
}

/// Attach a bearer token to a tonic request as gRPC metadata.
pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
    if api_key.is_empty() {
        return;
    }
    let bearer = format!("Bearer {}", api_key);
    if let Ok(val) = bearer.parse() {
        req.metadata_mut().insert("authorization", val);
    }
}

/// Handle to a server-side session. Carries the id + an `ApiClient`
/// clone (which holds the shared tonic Channel) so subsequent
/// per-session RPCs go over the process-global connection.
/// `committed_len` tracks the server's current session.tokens length
/// so the client can submit deltas with the right `offset`.
///
/// Dropping the handle fires a best-effort CloseSession (see the
/// `Drop` impl).
pub struct SessionHandle {
    /// Session id returned by OpenSession. `Drop` takes it (leaving an
    /// empty string) before issuing CloseSession.
    pub session_id: String,
    /// `max_model_len` as reported in the OpenSession response.
    pub max_model_len: u32,
    /// Client-side view of the server's token count. Starts at 0 and is
    /// sent as `offset` by `prefill_only`, which updates it from the
    /// Done event's `total_tokens`.
    pub committed_len: u32,
    /// Client used to obtain a `SalienceClient` and API key for every
    /// RPC issued through this handle.
    client: super::ApiClient,
}

impl SessionHandle {
    /// Open a new server-side session for `client`'s configured model.
    ///
    /// Sends the OpenSession RPC (bearer token attached via
    /// [`with_auth`]) and returns a handle seeded with the
    /// server-reported `session_id` and `max_model_len`;
    /// `committed_len` starts at 0. The handle keeps a clone of
    /// `client` for later RPCs.
    ///
    /// # Errors
    /// Fails if a gRPC client cannot be obtained or the RPC returns an
    /// error status.
    pub async fn open(client: &super::ApiClient) -> Result<Self> {
        let t0 = std::time::Instant::now();
        log::debug!(target: "grpc", "OpenSession rpc: start");
        let mut c = client.salience_client().await?;
        let mut req = tonic::Request::new(pb::OpenSessionRequest {
            model: client.model.clone(),
        });
        with_auth(&mut req, client.api_key());
        let resp = c
            .open_session(req)
            .await
            .context("OpenSession RPC failed")?
            .into_inner();
        log::debug!(target: "grpc",
            "OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}",
            resp.session_id, resp.max_model_len, t0.elapsed());
        Ok(Self {
            session_id: resp.session_id,
            max_model_len: resp.max_model_len,
            committed_len: 0,
            client: client.clone(),
        })
    }

    /// The `ApiClient` this handle issues its RPCs through.
    pub fn client(&self) -> &super::ApiClient { &self.client }

    /// Debug-only: fetch the server's full session.tokens. Used to
    /// verify client-side accounting byte-for-byte when divergence
    /// is suspected. Not cheap on large sessions.
    ///
    /// # Errors
    /// Fails if a gRPC client cannot be obtained or the DumpSession
    /// RPC returns an error status.
    pub async fn dump_tokens(&self) -> Result<Vec<u32>> {
        let mut c = self.client.salience_client().await?;
        let mut req = tonic::Request::new(pb::DumpSessionRequest {
            session_id: self.session_id.clone(),
        });
        with_auth(&mut req, self.client.api_key());
        let resp = c
            .dump_session(req)
            .await
            .context("DumpSession RPC failed")?
            .into_inner();
        Ok(resp.tokens)
    }

    /// Open a gRPC Generate stream with the given request. Caller
    /// iterates the returned stream of GenerateEvents; the handle's
    /// `committed_len` should be advanced by the caller on Done based
    /// on the Done event's `total_tokens` field.
    ///
    /// This method does not touch `committed_len` itself and does not
    /// fill in `req.session_id`; the request is sent as given.
    ///
    /// # Errors
    /// Fails if a gRPC client cannot be obtained or the stream cannot
    /// be opened. Errors on individual events surface from the
    /// returned stream, not from this call.
    pub async fn generate(
        &self,
        req: pb::GenerateRequest,
    ) -> Result<tonic::Streaming<pb::GenerateEvent>> {
        let t0 = std::time::Instant::now();
        log::debug!(target: "grpc",
            "Generate rpc: open-stream session={} offset={} append={} max_tokens={}",
            self.session_id, req.offset, req.append_tokens.len(), req.max_tokens);
        let mut c = self.client.salience_client().await?;
        let mut req = tonic::Request::new(req);
        with_auth(&mut req, self.client.api_key());
        let resp = c
            .generate(req)
            .await
            .context("Generate RPC failed")?;
        log::debug!(target: "grpc",
            "Generate rpc: stream opened session={} open-latency={:?}",
            self.session_id, t0.elapsed());
        Ok(resp.into_inner())
    }

    /// Run a prefill-only Generate (max_tokens=0) that appends the
    /// given tokens to the session. No decode, no Token events — the
    /// server just extends session.tokens and runs prefill to warm
    /// the KV cache. Used to interleave text runs between AppendImage
    /// calls, and by score paths that want prompt_logprobs without a
    /// decode step.
    ///
    /// The request is sent with `offset = committed_len`; on each Done
    /// event `committed_len` is replaced by the event's `total_tokens`.
    /// If the stream finishes without any Done event the call still
    /// returns `Ok(())`, but a warning is logged because
    /// `committed_len` was not refreshed and may no longer match the
    /// server.
    ///
    /// # Errors
    /// Fails if the stream cannot be opened or any streamed event is
    /// an error status.
    pub async fn prefill_only(&mut self, tokens: Vec<u32>) -> Result<()> {
        use futures::StreamExt;
        // Every field is spelled out so a new proto field becomes a
        // compile error here instead of silently defaulting.
        let req = pb::GenerateRequest {
            session_id: self.session_id.clone(),
            append_tokens: tokens,
            offset: self.committed_len,
            truncating: false,
            max_tokens: 0,
            logprobs_ranges: Vec::new(),
            logprob_top_k: 0,
            readout_ranges: Vec::new(),
            temperature: 0.0,
            top_p: 0.0,
            top_k: 0,
            stop_token_ids: Vec::new(),
            priority: 0,
            images: Vec::new(),
        };
        let mut stream = self.generate(req).await?;
        let mut saw_done = false;
        while let Some(event) = stream.next().await {
            let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?;
            if let Some(pb::generate_event::Event::Done(d)) = event.event {
                self.committed_len = d.total_tokens;
                saw_done = true;
            }
        }
        if !saw_done {
            // Without Done we never learned the server's token count;
            // the next request's `offset` may be wrong.
            log::warn!(target: "grpc",
                "prefill Generate stream for session {} ended without a Done event; committed_len left at {}",
                self.session_id, self.committed_len);
        }
        Ok(())
    }
}

/// On drop, issue CloseSession from a detached tokio task so the
/// server doesn't hold the session until TTL eviction. Best-effort:
/// with no tokio runtime on the current thread the close is skipped
/// and the server's 30min TTL reaps the session eventually. Failures
/// are only logged at debug level.
impl Drop for SessionHandle {
    fn drop(&mut self) {
        // An empty id means there is nothing to close.
        if self.session_id.is_empty() {
            return;
        }
        // Move the id and a client clone out so the task can own them.
        let id = std::mem::take(&mut self.session_id);
        let api = self.client.clone();

        match tokio::runtime::Handle::try_current() {
            Err(_) => {
                log::debug!(target: "grpc",
                    "SessionHandle drop outside tokio runtime, session {} leaks to TTL",
                    id);
            }
            Ok(runtime) => {
                runtime.spawn(async move {
                    let mut grpc = match api.salience_client().await {
                        Ok(grpc) => grpc,
                        Err(_) => return,
                    };
                    let mut close_req = tonic::Request::new(pb::CloseSessionRequest {
                        session_id: id.clone(),
                    });
                    with_auth(&mut close_req, api.api_key());
                    if let Err(e) = grpc.close_session(close_req).await {
                        log::debug!(target: "grpc",
                            "CloseSession on drop failed for {}: {:#}",
                            id, e);
                    }
                });
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Compile-time canary for the generated proto types: building
    /// each message by hand breaks the build if build.rs stops
    /// regenerating against the proto.
    #[test]
    fn generated_types_compile() {
        let _open_req = pb::OpenSessionRequest {
            model: "qwen3-vl".into(),
        };

        let top_logprob = pb::TokenLogprob {
            id: 1,
            logprob: -0.5,
        };
        let _token = pb::Token {
            id: 42,
            position: 0,
            is_prefill: false,
            readout: vec![0.1, 0.2, 0.3],
            logprobs: vec![top_logprob],
            sampled_logprob: -0.1,
            has_sampled_logprob: true,
        };

        let done = pb::GenerateDone {
            prompt_tokens: 10,
            completion_tokens: 20,
            total_tokens: 30,
            finish_reason: pb::generate_done::FinishReason::Eos as i32,
        };
        let _event = pb::GenerateEvent {
            event: Some(pb::generate_event::Event::Done(done)),
        };
    }

    /// `derive_grpc_url` strips the path and maps port 8000 to 8443,
    /// leaving other ports alone.
    #[test]
    fn derive_grpc_url_cases() {
        let cases = [
            ("https://host:8000/v1", "https://host:8443"),
            ("https://host:8000/", "https://host:8443"),
            ("https://host:9000/v1", "https://host:9000"),
        ];
        for (http_base, expected) in cases {
            assert_eq!(derive_grpc_url(http_base), expected);
        }
    }
}