forked from kent/consciousness
salience: client-side pad expansion, drop AppendImage
Mirrors the vLLM-side rewrite. AppendImage is gone; images now ride along on Generate via a parallel `images` list.

- Productionize `qwen3_image_token_count` (was test-only). Image leaf computes its IMAGE_PAD count eagerly at construction from height/width; `token_count` is no longer "0 until the server tells us."
- WireChunk shrinks to a single `Tokens(Vec<u32>)` variant — vision blocks live inline in the token stream.
- `wire_chunks` now returns `(Vec<WireChunk>, Vec<WireImage>)`. `WireImage` carries `pad_start` / `pad_end` (absolute positions in the full walk) alongside bytes + mime.
- `assemble_prompt` returns `(chunks, images, match_upto)`.
- `stream_session_mm` / `run_session_generate` take the parallel images list, filter to those past `match_upto`, and pass them in `GenerateRequest.images` as `pb::ImageAttachment` entries.
- Drop `SessionHandle::append_image`, `ContextState::commit_image_token_counts`, `StreamToken::ImageAppended`, the WireChunk::Image branch in `learn.rs`, and the now-empty `prompt_to_chunks` helper.
- Add 'v' toggle on the conscious-screen tree to render token-id vectors in place of text content (debug-aid: lets us see what the server actually has when output is suspicious).
- Comment out the subconscious-trigger spawn loop — Kent had this disabled before; it had crept back into running.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
4feebb7bc4
commit
fe232cf292
12 changed files with 468 additions and 306 deletions
|
|
@ -94,6 +94,8 @@ pub struct SessionHandle {
|
|||
|
||||
impl SessionHandle {
|
||||
pub async fn open(client: &super::ApiClient) -> Result<Self> {
|
||||
let t0 = std::time::Instant::now();
|
||||
log::debug!(target: "grpc", "OpenSession rpc: start");
|
||||
let mut c = client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(pb::OpenSessionRequest {
|
||||
model: client.model.clone(),
|
||||
|
|
@ -105,8 +107,8 @@ impl SessionHandle {
|
|||
.with_context(|| "OpenSession RPC failed")?
|
||||
.into_inner();
|
||||
log::debug!(target: "grpc",
|
||||
"SessionHandle::open session_id={} max_model_len={}",
|
||||
resp.session_id, resp.max_model_len);
|
||||
"OpenSession rpc: done session_id={} max_model_len={} elapsed={:?}",
|
||||
resp.session_id, resp.max_model_len, t0.elapsed());
|
||||
Ok(Self {
|
||||
session_id: resp.session_id,
|
||||
max_model_len: resp.max_model_len,
|
||||
|
|
@ -117,30 +119,21 @@ impl SessionHandle {
|
|||
|
||||
pub fn client(&self) -> &super::ApiClient { &self.client }
|
||||
|
||||
/// Append an image via the server-side vision block. Updates
|
||||
/// `committed_len` from the server's response on success.
|
||||
pub async fn append_image(
|
||||
&mut self,
|
||||
data: Vec<u8>,
|
||||
mime: String,
|
||||
truncating: bool,
|
||||
) -> Result<pb::AppendImageResponse> {
|
||||
/// Debug-only: fetch the server's full session.tokens. Used to
|
||||
/// verify client-side accounting byte-for-byte when divergence
|
||||
/// is suspected. Not cheap on large sessions.
|
||||
pub async fn dump_tokens(&self) -> Result<Vec<u32>> {
|
||||
let mut c = self.client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(pb::AppendImageRequest {
|
||||
let mut req = tonic::Request::new(pb::DumpSessionRequest {
|
||||
session_id: self.session_id.clone(),
|
||||
data,
|
||||
mime,
|
||||
offset: self.committed_len,
|
||||
truncating,
|
||||
});
|
||||
with_auth(&mut req, self.client.api_key());
|
||||
let resp = c
|
||||
.append_image(req)
|
||||
.dump_session(req)
|
||||
.await
|
||||
.with_context(|| "AppendImage RPC failed")?
|
||||
.with_context(|| "DumpSession RPC failed")?
|
||||
.into_inner();
|
||||
self.committed_len = resp.total_length;
|
||||
Ok(resp)
|
||||
Ok(resp.tokens)
|
||||
}
|
||||
|
||||
/// Open a gRPC Generate stream with the given request. Caller
|
||||
|
|
@ -151,6 +144,10 @@ impl SessionHandle {
|
|||
&self,
|
||||
req: pb::GenerateRequest,
|
||||
) -> Result<tonic::Streaming<pb::GenerateEvent>> {
|
||||
let t0 = std::time::Instant::now();
|
||||
log::debug!(target: "grpc",
|
||||
"Generate rpc: open-stream session={} offset={} append={} max_tokens={}",
|
||||
self.session_id, req.offset, req.append_tokens.len(), req.max_tokens);
|
||||
let mut c = self.client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(req);
|
||||
with_auth(&mut req, self.client.api_key());
|
||||
|
|
@ -158,6 +155,9 @@ impl SessionHandle {
|
|||
.generate(req)
|
||||
.await
|
||||
.with_context(|| "Generate RPC failed")?;
|
||||
log::debug!(target: "grpc",
|
||||
"Generate rpc: stream opened session={} open-latency={:?}",
|
||||
self.session_id, t0.elapsed());
|
||||
Ok(resp.into_inner())
|
||||
}
|
||||
|
||||
|
|
@ -183,6 +183,7 @@ impl SessionHandle {
|
|||
top_k: 0,
|
||||
stop_token_ids: Vec::new(),
|
||||
priority: 0,
|
||||
images: Vec::new(),
|
||||
};
|
||||
let mut stream = self.generate(req).await?;
|
||||
while let Some(event) = stream.next().await {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue