agent: share one tonic Channel + migrate scoring to gRPC Generate

Two changes that bolt together — the shared connection means the new scoring path actually costs one HTTP/2 handshake across the whole process instead of one-per-RPC.

ApiClient gains `salience_channel: Arc<OnceCell<Channel>>`. The first call to `ApiClient::salience_client()` opens the channel via `connect_channel()` and stores the Channel; subsequent calls clone it (cheap — tonic multiplexes concurrent RPCs over the single HTTP/2 connection). Every ApiClient clone shares the same OnceCell, so all agents spawned from Mind's client — plus every ephemeral scoring session — reuse one connection.

SessionHandle is refactored to hold an `ApiClient` clone instead of a bag of (base_url, api_key) strings. `open` / `append_image` / `generate` now go through `self.client.salience_client()`. The new `prefill_only(tokens)` method encapsulates the "Generate with max_tokens=0 to append text" pattern (previously a private free function in api/mod.rs called `flush_pending`). The Drop impl on SessionHandle stays — it still fires CloseSession on the shared channel in a detached task.

`run_session_generate` switched from `(base_url, api_key, model)` to `&ApiClient`; the agent-turn flow that uses it keeps the same shape, but `stream_session_mm` clones the ApiClient into the spawned worker.

learn.rs migrated from the HTTP `/v1/score` endpoint to a gRPC session-based score:

* `call_score` opens an ephemeral SessionHandle on the client, converts (prompt_tokens, images) → Vec<WireChunk> via the new `prompt_to_chunks` helper (splits on VISION_START/VISION_END), walks the chunks calling `prefill_only` + `append_image`, runs a final Generate with `max_tokens=0` + `logprobs_ranges` over the scored positions, and sums each Token event's `sampled_logprob` per range to produce `ScoreResult`s.
* SessionHandle drops at end of scope → CloseSession auto-fires, keeping the server's session map clean between calls.
* No more HTTP path, no more `http_client()` helper, no more `ScoreResponse` / serde plumbing for /v1/score.
* `send_to_train` still uses HTTP (it talks to /v1/train, which isn't on the gRPC protocol); its ad-hoc HTTP client now lives inline instead of reaching for the deleted `http_client()`.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-24 12:51:53 -04:00 · 2026-04-24 12:51:53 -04:00 · 4feebb7bc4
commit 4feebb7bc4
parent be6ba4e9a5
3 changed files with 268 additions and 213 deletions
--- a/src/agent/api/salience.rs
+++ b/src/agent/api/salience.rs
@ -24,7 +24,12 @@ pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;
 /// looks like `https://host:8443`. User-provided CA certs under
 /// `~/.consciousness/certs/` are trusted in addition to the system
 /// roots (for self-signed server certs).
-pub async fn connect(base_url: &str) -> Result<SalienceClient> {
+///
+/// Returns the raw `Channel` so callers (`ApiClient::salience_client`)
+/// can cache it and clone a `SalienceClient` per request without
+/// reopening the TCP/TLS connection. tonic multiplexes RPCs over the
+/// shared channel automatically.
+pub async fn connect_channel(base_url: &str) -> Result<Channel> {
    let mut endpoint = Endpoint::from_shared(base_url.to_string())
        .with_context(|| format!("invalid salience endpoint: {}", base_url))?
        .connect_timeout(std::time::Duration::from_secs(30))
@ -41,11 +46,10 @@ pub async fn connect(base_url: &str) -> Result<SalienceClient> {
            .with_context(|| "configuring tonic TLS")?;
    }

-    let channel = endpoint
+    endpoint
        .connect()
        .await
-        .with_context(|| format!("failed to connect to salience server at {}", base_url))?;
-    Ok(pb::salience_client::SalienceClient::new(channel))
+        .with_context(|| format!("failed to connect to salience server at {}", base_url))
 }

 /// Derive the gRPC base URL from the HTTP completions base URL.
@ -76,107 +80,42 @@ pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
    }
 }

-/// Call the server's `OpenSession` RPC and return the response.
-pub async fn open_session(
-    base_url: &str,
-    api_key: &str,
-    model: &str,
-) -> Result<pb::OpenSessionResponse> {
-    let mut client = connect(base_url).await?;
-    let mut req = tonic::Request::new(pb::OpenSessionRequest {
-        model: model.to_string(),
-    });
-    with_auth(&mut req, api_key);
-    let resp = client
-        .open_session(req)
-        .await
-        .with_context(|| "OpenSession RPC failed")?;
-    Ok(resp.into_inner())
-}
-
-/// Call the server's `CloseSession` RPC. Idempotent on the server.
-pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
-    let mut client = connect(base_url).await?;
-    let mut req = tonic::Request::new(pb::CloseSessionRequest {
-        session_id: session_id.to_string(),
-    });
-    with_auth(&mut req, api_key);
-    client
-        .close_session(req)
-        .await
-        .with_context(|| "CloseSession RPC failed")?;
-    Ok(())
-}
-
-/// Append an image to a session. Server decodes the image, computes N
-/// via vLLM's own multimodal pipeline, writes the full vision block
-/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into
-/// session.tokens, and returns (N, new total length).
-///
-/// `offset` is the client's view of the session's current token count;
-/// the server rejects if it diverges from its own (unless
-/// `truncating=true`, in which case the server slices to `offset`
-/// first — but never through a vision block).
-pub async fn append_image(
-    base_url: &str,
-    api_key: &str,
-    session_id: &str,
-    data: Vec<u8>,
-    mime: String,
-    offset: u32,
-    truncating: bool,
-) -> Result<pb::AppendImageResponse> {
-    let mut client = connect(base_url).await?;
-    let mut req = tonic::Request::new(pb::AppendImageRequest {
-        session_id: session_id.to_string(),
-        data,
-        mime,
-        offset,
-        truncating,
-    });
-    with_auth(&mut req, api_key);
-    let resp = client
-        .append_image(req)
-        .await
-        .with_context(|| "AppendImage RPC failed")?;
-    Ok(resp.into_inner())
-}
-
-/// Handle to a server-side session. Carries the id + connection params
-/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession)
-/// can be issued without the caller juggling base_url / api_key each
-/// time. `committed_len` tracks the server's current session.tokens
-/// length so the client can submit deltas with the right `offset`.
+/// Handle to a server-side session. Carries the id + an `ApiClient`
+/// clone (which holds the shared tonic Channel) so subsequent
+/// per-session RPCs go over the process-global connection.
+/// `committed_len` tracks the server's current session.tokens length
+/// so the client can submit deltas with the right `offset`.
 pub struct SessionHandle {
    pub session_id: String,
    pub max_model_len: u32,
-    pub base_url: String,
-    pub api_key: String,
    pub committed_len: u32,
+    client: super::ApiClient,
 }

 impl SessionHandle {
-    pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result<Self> {
-        let grpc_url = derive_grpc_url(base_url);
-        log::debug!(target: "grpc",
-            "SessionHandle::open http_base={} -> grpc_url={}",
-            base_url, grpc_url);
-        let resp = open_session(&grpc_url, api_key, model).await?;
+    pub async fn open(client: &super::ApiClient) -> Result<Self> {
+        let mut c = client.salience_client().await?;
+        let mut req = tonic::Request::new(pb::OpenSessionRequest {
+            model: client.model.clone(),
+        });
+        with_auth(&mut req, client.api_key());
+        let resp = c
+            .open_session(req)
+            .await
+            .with_context(|| "OpenSession RPC failed")?
+            .into_inner();
        log::debug!(target: "grpc",
            "SessionHandle::open session_id={} max_model_len={}",
            resp.session_id, resp.max_model_len);
        Ok(Self {
            session_id: resp.session_id,
            max_model_len: resp.max_model_len,
-            base_url: grpc_url,
-            api_key: api_key.to_string(),
            committed_len: 0,
+            client: client.clone(),
        })
    }

-    pub async fn close(self) -> Result<()> {
-        close_session(&self.base_url, &self.api_key, &self.session_id).await
-    }
+    pub fn client(&self) -> &super::ApiClient { &self.client }

    /// Append an image via the server-side vision block. Updates
    /// `committed_len` from the server's response on success.
@ -186,37 +125,105 @@ impl SessionHandle {
        mime: String,
        truncating: bool,
    ) -> Result<pb::AppendImageResponse> {
-        let resp = append_image(
-            &self.base_url,
-            &self.api_key,
-            &self.session_id,
+        let mut c = self.client.salience_client().await?;
+        let mut req = tonic::Request::new(pb::AppendImageRequest {
+            session_id: self.session_id.clone(),
            data,
            mime,
-            self.committed_len,
+            offset: self.committed_len,
            truncating,
-        )
-        .await?;
+        });
+        with_auth(&mut req, self.client.api_key());
+        let resp = c
+            .append_image(req)
+            .await
+            .with_context(|| "AppendImage RPC failed")?
+            .into_inner();
        self.committed_len = resp.total_length;
        Ok(resp)
    }

    /// Open a gRPC Generate stream with the given request. Caller
    /// iterates the returned stream of GenerateEvents; the handle's
-    /// `committed_len` is advanced on Done based on the Done event's
-    /// `total_tokens` field.
+    /// `committed_len` should be advanced by the caller on Done based
+    /// on the Done event's `total_tokens` field.
    pub async fn generate(
        &self,
        req: pb::GenerateRequest,
    ) -> Result<tonic::Streaming<pb::GenerateEvent>> {
-        let mut client = connect(&self.base_url).await?;
+        let mut c = self.client.salience_client().await?;
        let mut req = tonic::Request::new(req);
-        with_auth(&mut req, &self.api_key);
-        let resp = client
+        with_auth(&mut req, self.client.api_key());
+        let resp = c
            .generate(req)
            .await
            .with_context(|| "Generate RPC failed")?;
        Ok(resp.into_inner())
    }
+
+    /// Run a prefill-only Generate (max_tokens=0) that appends the
+    /// given tokens to the session. No decode, no Token events — the
+    /// server just extends session.tokens and runs prefill to warm
+    /// the KV cache. Used to interleave text runs between AppendImage
+    /// calls, and by score paths that want prompt_logprobs without a
+    /// decode step.
+    pub async fn prefill_only(&mut self, tokens: Vec<u32>) -> Result<()> {
+        use futures::StreamExt;
+        let req = pb::GenerateRequest {
+            session_id: self.session_id.clone(),
+            append_tokens: tokens,
+            offset: self.committed_len,
+            truncating: false,
+            max_tokens: 0,
+            logprobs_ranges: Vec::new(),
+            logprob_top_k: 0,
+            readout_ranges: Vec::new(),
+            temperature: 0.0,
+            top_p: 0.0,
+            top_k: 0,
+            stop_token_ids: Vec::new(),
+            priority: 0,
+        };
+        let mut stream = self.generate(req).await?;
+        while let Some(event) = stream.next().await {
+            let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?;
+            if let Some(pb::generate_event::Event::Done(d)) = event.event {
+                self.committed_len = d.total_tokens;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Drop → fire CloseSession in a detached task so servers don't leak
+/// sessions until TTL eviction. Best-effort: if no tokio runtime is
+/// available we skip; the server's 30min TTL will reap it eventually.
+impl Drop for SessionHandle {
+    fn drop(&mut self) {
+        if self.session_id.is_empty() {
+            return;
+        }
+        let session_id = std::mem::take(&mut self.session_id);
+        let client = self.client.clone();
+        let Ok(rt) = tokio::runtime::Handle::try_current() else {
+            log::debug!(target: "grpc",
+                "SessionHandle drop outside tokio runtime, session {} leaks to TTL",
+                session_id);
+            return;
+        };
+        rt.spawn(async move {
+            let Ok(mut c) = client.salience_client().await else { return };
+            let mut req = tonic::Request::new(pb::CloseSessionRequest {
+                session_id: session_id.clone(),
+            });
+            with_auth(&mut req, client.api_key());
+            if let Err(e) = c.close_session(req).await {
+                log::debug!(target: "grpc",
+                    "CloseSession on drop failed for {}: {:#}",
+                    session_id, e);
+            }
+        });
+    }
 }

 #[cfg(test)]