salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride
along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image leaf
  computes its IMAGE_PAD count eagerly at construction from
  height/width; `token_count` is no longer "0 until the server tells
  us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision
  blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`.
  `WireImage` carries `pad_start` / `pad_end` (absolute positions in
  the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel
  images list, filter to those past `match_upto`, and pass them in
  `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`,
  `ContextState::commit_image_token_counts`,
  `StreamToken::ImageAppended`, the WireChunk::Image branch in
  `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id
  vectors in place of text content (debug-aid: lets us see what the
  server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this
  disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-24 20:26:47 -04:00 · 2026-04-24 20:26:47 -04:00 · fe232cf292
commit fe232cf292
parent 4feebb7bc4
12 changed files with 468 additions and 306 deletions
--- a/src/agent/api/salience.rs
+++ b/src/agent/api/salience.rs
@@ -94,6 +94,8 @@ pub struct SessionHandle {

 impl SessionHandle {
    pub async fn open(client: &super::ApiClient) -> Result<Self> {
+        let t0 = std::time::Instant::now();
+        log::debug!(target: "grpc", "OpenSession rpc: start");
        let mut c = client.salience_client().await?;
        let mut req = tonic::Request::new(pb::OpenSessionRequest {
            model: client.model.clone(),
@@ -105,8 +107,8 @@ impl SessionHandle {
            .with_context(|| "OpenSession RPC failed")?
            .into_inner();
        log::debug!(target: "grpc",
-            "SessionHandle::open session_id={} max_model_len={}",
-            resp.session_id, resp.max_model_len);
+            "OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}",
+            resp.session_id, resp.max_model_len, t0.elapsed());
        Ok(Self {
            session_id: resp.session_id,
            max_model_len: resp.max_model_len,
@@ -117,30 +119,21 @@ impl SessionHandle {

    pub fn client(&self) -> &super::ApiClient { &self.client }

-    /// Append an image via the server-side vision block. Updates
-    /// `committed_len` from the server's response on success.
-    pub async fn append_image(
-        &mut self,
-        data: Vec<u8>,
-        mime: String,
-        truncating: bool,
-    ) -> Result<pb::AppendImageResponse> {
+    /// Debug-only: fetch the server's full session.tokens. Used to
+    /// verify client-side accounting byte-for-byte when divergence
+    /// is suspected. Not cheap on large sessions.
+    pub async fn dump_tokens(&self) -> Result<Vec<u32>> {
        let mut c = self.client.salience_client().await?;
-        let mut req = tonic::Request::new(pb::AppendImageRequest {
+        let mut req = tonic::Request::new(pb::DumpSessionRequest {
            session_id: self.session_id.clone(),
-            data,
-            mime,
-            offset: self.committed_len,
-            truncating,
        });
        with_auth(&mut req, self.client.api_key());
        let resp = c
-            .append_image(req)
+            .dump_session(req)
            .await
-            .with_context(|| "AppendImage RPC failed")?
+            .with_context(|| "DumpSession RPC failed")?
            .into_inner();
-        self.committed_len = resp.total_length;
-        Ok(resp)
+        Ok(resp.tokens)
    }

    /// Open a gRPC Generate stream with the given request. Caller
@@ -151,6 +144,10 @@ impl SessionHandle {
        &self,
        req: pb::GenerateRequest,
    ) -> Result<tonic::Streaming<pb::GenerateEvent>> {
+        let t0 = std::time::Instant::now();
+        log::debug!(target: "grpc",
+            "Generate rpc: open-stream session={} offset={} append={} max_tokens={}",
+            self.session_id, req.offset, req.append_tokens.len(), req.max_tokens);
        let mut c = self.client.salience_client().await?;
        let mut req = tonic::Request::new(req);
        with_auth(&mut req, self.client.api_key());
@@ -158,6 +155,9 @@ impl SessionHandle {
            .generate(req)
            .await
            .with_context(|| "Generate RPC failed")?;
+        log::debug!(target: "grpc",
+            "Generate rpc: stream opened session={} open-latency={:?}",
+            self.session_id, t0.elapsed());
        Ok(resp.into_inner())
    }

@@ -183,6 +183,7 @@ impl SessionHandle {
            top_k: 0,
            stop_token_ids: Vec::new(),
            priority: 0,
+            images: Vec::new(),
        };
        let mut stream = self.generate(req).await?;
        while let Some(event) = stream.next().await {