salience: client-side pad expansion, drop AppendImage

Mirrors the vLLM-side rewrite. AppendImage is gone; images now
ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image
  leaf computes its IMAGE_PAD count eagerly at construction from
  height/width; `token_count` is no longer "0 until the server
  tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision
  blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`.
  `WireImage` carries `pad_start` / `pad_end` (absolute positions
  in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel
  images list, filter to those past `match_upto`, and pass them
  in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`,
  `ContextState::commit_image_token_counts`,
  `StreamToken::ImageAppended`, the WireChunk::Image branch in
  `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id
  vectors in place of text content (debug-aid: lets us see what
  the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had
  disabled it before, but it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-24 20:26:47 -04:00
commit fe232cf292
12 changed files with 468 additions and 306 deletions

View file

@ -94,6 +94,8 @@ pub struct SessionHandle {
impl SessionHandle {
pub async fn open(client: &super::ApiClient) -> Result<Self> {
let t0 = std::time::Instant::now();
log::debug!(target: "grpc", "OpenSession rpc: start");
let mut c = client.salience_client().await?;
let mut req = tonic::Request::new(pb::OpenSessionRequest {
model: client.model.clone(),
@ -105,8 +107,8 @@ impl SessionHandle {
.with_context(|| "OpenSession RPC failed")?
.into_inner();
log::debug!(target: "grpc",
"SessionHandle::open session_id={} max_model_len={}",
resp.session_id, resp.max_model_len);
"OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}",
resp.session_id, resp.max_model_len, t0.elapsed());
Ok(Self {
session_id: resp.session_id,
max_model_len: resp.max_model_len,
@ -117,30 +119,21 @@ impl SessionHandle {
pub fn client(&self) -> &super::ApiClient { &self.client }
/// Append an image via the server-side vision block. Updates
/// `committed_len` from the server's response on success.
pub async fn append_image(
&mut self,
data: Vec<u8>,
mime: String,
truncating: bool,
) -> Result<pb::AppendImageResponse> {
/// Debug-only: fetch the server's full session.tokens. Used to
/// verify client-side accounting token-for-token when divergence
/// is suspected. Not cheap on large sessions.
pub async fn dump_tokens(&self) -> Result<Vec<u32>> {
let mut c = self.client.salience_client().await?;
let mut req = tonic::Request::new(pb::AppendImageRequest {
let mut req = tonic::Request::new(pb::DumpSessionRequest {
session_id: self.session_id.clone(),
data,
mime,
offset: self.committed_len,
truncating,
});
with_auth(&mut req, self.client.api_key());
let resp = c
.append_image(req)
.dump_session(req)
.await
.with_context(|| "AppendImage RPC failed")?
.with_context(|| "DumpSession RPC failed")?
.into_inner();
self.committed_len = resp.total_length;
Ok(resp)
Ok(resp.tokens)
}
/// Open a gRPC Generate stream with the given request. Caller
@ -151,6 +144,10 @@ impl SessionHandle {
&self,
req: pb::GenerateRequest,
) -> Result<tonic::Streaming<pb::GenerateEvent>> {
let t0 = std::time::Instant::now();
log::debug!(target: "grpc",
"Generate rpc: open-stream session={} offset={} append={} max_tokens={}",
self.session_id, req.offset, req.append_tokens.len(), req.max_tokens);
let mut c = self.client.salience_client().await?;
let mut req = tonic::Request::new(req);
with_auth(&mut req, self.client.api_key());
@ -158,6 +155,9 @@ impl SessionHandle {
.generate(req)
.await
.with_context(|| "Generate RPC failed")?;
log::debug!(target: "grpc",
"Generate rpc: stream opened session={} open-latency={:?}",
self.session_id, t0.elapsed());
Ok(resp.into_inner())
}
@ -183,6 +183,7 @@ impl SessionHandle {
top_k: 0,
stop_token_ids: Vec::new(),
priority: 0,
images: Vec::new(),
};
let mut stream = self.generate(req).await?;
while let Some(event) = stream.next().await {