salience: add gRPC client + TLS plumbing for stateful vllm sessions

Adds the client-side of a stateful gRPC protocol against vllm, plus the TLS trust machinery so we can talk to self-signed vllm servers. Protocol (proto/salience.proto): Bidi-streaming Session RPC carries OpenSession / AppendTokens / Generate / Cancel from client and SessionReady / PrefillProgress / Token / GenerateDone / Error from server. Separate Fork unary RPC for cheap branching (prefix cache shares KV automatically). Plus ListSessions, CloseSession, GetReadoutManifest admin RPCs. Per-token readouts ship as packed f32 ([n_layers * n_concepts] per token, flat). Logprobs use range-selected positions plus a top-k parameter — empty ranges mean no logprobs, any range means emit sampled-token logprob at those positions, top_k > 0 adds alternatives. Client (src/agent/api/salience.rs): Tonic-generated types under pb::, a connect() helper, with_auth() for bearer metadata, and a Session handle wrapping the bidi stream: open() handshakes SessionReady; append() is fire-and-forget; generate() returns impl Stream<Item = Event> that drains inbound until Done or terminating Error. One generate at a time per session. Peak picker (src/agent/salience.rs): Pure function over ReadoutEntry traces. Per-concept z-score against trace global stats; contiguous above-threshold regions emit one peak at the local max. Configurable sigma threshold and min-std safety floor. Deterministic tie-break on offset then concept name. 12 unit tests covering empty traces, flat channels, single/multi spikes, contiguous humps, multi-concept independence, trailing runs, sub-threshold noise, layer-out-of-range, manifest shape mismatch, and threshold tunability. TLS (src/agent/api/http.rs): HttpClient::build now also loads every .pem file under ~/.consciousness/certs/ into the rustls root store — so dropping a <host>.pem in that directory is enough to trust a new self-signed server; no code changes per new host.
Also installs the rustls default crypto provider explicitly via OnceLock: tonic's tls features pulled in both ring and aws-lc-rs on the resolver path, and rustls 0.23 refuses to auto-pick when either could win. Build (build.rs, Cargo.toml): tonic-build generates Rust types from proto/salience.proto at cargo-build time, using a vendored protoc binary (protoc-bin-vendored) so no system install is required. New runtime deps: tonic, prost, async-stream, tokio-stream, rustls-pemfile. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-23 02:21:07 -04:00 · 2026-04-23 02:21:07 -04:00 · 08213f9093
commit 08213f9093
parent 0e459aae92
15 changed files with 1689 additions and 440 deletions
--- a/src/agent/context.rs
+++ b/src/agent/context.rs
@ -359,8 +359,8 @@ impl AstNode {
        mime: impl Into<String>,
        orig_height: u32,
        orig_width: u32,
+        token_count: u32,
    ) -> Self {
-        let token_count = qwen3_image_token_count(orig_height, orig_width);
        Self::Leaf(NodeLeaf::new(NodeBody::Image {
            bytes,
            mime: mime.into(),
@ -898,10 +898,12 @@ impl Ast for ContextState {
 }

 /// An image collected from the AST for a request body. The AST stores
-/// the pre-expanded token form (N image_pads) for accurate budget
-/// accounting; the wire form collapses each Image to a single
-/// `<|image_pad|>` between vision bookends and ships the bytes
-/// separately as multi_modal_data.
+/// the pre-expanded token form (`<|vision_start|> + <|image_pad|>×N +
+/// <|vision_end|>`), and the wire form mirrors that exactly so the
+/// server's `session.tokens` length matches what vLLM's engine will
+/// process. The authoritative N is obtained from the server via the
+/// CountImageTokens RPC before the Image leaf is constructed.
+#[derive(Clone)]
 pub struct WireImage {
    pub bytes: Vec<u8>,
    pub mime: String,
@ -911,9 +913,10 @@ fn wire_into(node: &AstNode, tokens: &mut Vec<u32>, images: &mut Vec<WireImage>)
    match node {
        AstNode::Leaf(leaf) => match leaf.body() {
            NodeBody::Image { bytes, mime, .. } => {
-                tokens.push(tokenizer::VISION_START);
-                tokens.push(tokenizer::IMAGE_PAD);
-                tokens.push(tokenizer::VISION_END);
+                // Send the pre-expanded token form (includes N
+                // <|image_pad|> tokens); engine's multi_modal
+                // pipeline pairs them with the binary data below.
+                tokens.extend_from_slice(leaf.token_ids());
                images.push(WireImage {
                    bytes: bytes.clone(),
                    mime: mime.clone(),
@ -1225,11 +1228,20 @@ impl ContextState {
 // to at request time. Constants come from Qwen3.5-27B's preprocessor_config.
 // ---------------------------------------------------------------------------

+// Test-only client-side estimate of image token expansion. Production
+// callers obtain the authoritative count from the server via
+// CountImageTokens; these constants and helpers stay around only to
+// keep the context-shape unit tests self-contained.
+#[cfg(test)]
 const QWEN3_PATCH_SIZE: u32 = 16;
+#[cfg(test)]
 const QWEN3_MERGE_SIZE: u32 = 2;
+#[cfg(test)]
 const QWEN3_MIN_PIXELS: u64 = 65_536;
+#[cfg(test)]
 const QWEN3_MAX_PIXELS: u64 = 16_777_216;

+#[cfg(test)]
 fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) {
    let max_s = h.max(w) as f64;
    let min_s = h.min(w) as f64;
@ -1258,10 +1270,10 @@ fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -
    }
 }

-/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of
-/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly:
-///   (grid_h * grid_w) / merge_size^2
-/// where (grid_h, grid_w) = resized dims / patch_size.
+/// Test-only: client-side estimate of how many `<|image_pad|>` tokens
+/// vLLM will emit for an image of the given dimensions. Production
+/// callers use `salience::count_image_tokens` (server-authoritative).
+#[cfg(test)]
 fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 {
    let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE;
    let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS);
@ -1697,7 +1709,7 @@ mod tests {

    #[test]
    fn test_image_render_and_token_ids() {
-        let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512);
+        let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512, qwen3_image_token_count(512, 512));
        let leaf = node.leaf().unwrap();
        // 2 bookend tokens (vision_start + vision_end) + 256 image_pad tokens
        assert_eq!(leaf.token_ids().len(), 258);
@ -1713,36 +1725,41 @@ mod tests {
    }

    #[test]
-    fn test_wire_prompt_collapses_image_pads() {
+    fn test_wire_prompt_preserves_expanded_image_pads() {
        let mut ctx = ContextState::new();
        ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![
            AstNode::content("look:"),
-            AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512),
+            AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512, qwen3_image_token_count(512, 512)),
        ]));

-        // AST side: N image_pads + bookends, full budget accounting.
+        // AST side and wire side should both carry N image_pads + bookends —
+        // server's session.tokens length must match what vLLM's engine will
+        // actually process. Binary image bytes are shipped separately in
+        // multi_modal_data via the WireImage list.
+        let n_expected = qwen3_image_token_count(512, 512) as usize;
+
        let full = ctx.token_ids();
        let n_image_pads_full = full.iter()
            .filter(|&&t| t == tokenizer::IMAGE_PAD).count();
-        assert_eq!(n_image_pads_full, qwen3_image_token_count(512, 512) as usize);
+        assert_eq!(n_image_pads_full, n_expected);

-        // Wire side: single image_pad, bytes moved to images list.
        let (wire, images, _) = ctx.wire_prompt(0..ctx.conversation().len(), |_| false);
        let n_image_pads_wire = wire.iter()
            .filter(|&&t| t == tokenizer::IMAGE_PAD).count();
-        assert_eq!(n_image_pads_wire, 1);
+        assert_eq!(n_image_pads_wire, n_expected);
+
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].bytes, vec![0xDE, 0xAD]);
        assert_eq!(images[0].mime, "image/png");

-        // vision_start/vision_end bookends are preserved in wire form.
+        // One pair of vision_start/vision_end bookends around the N pads.
        assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_START).count(), 1);
        assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_END).count(), 1);
    }

    #[test]
    fn test_image_serde_roundtrip() {
-        let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64);
+        let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64, qwen3_image_token_count(64, 64));
        let json = serde_json::to_string(&node).unwrap();
        // bytes must be base64-encoded in the JSON form
        assert!(json.contains("3q2+7w=="));