forked from kent/consciousness
agent: share one tonic Channel + migrate scoring to gRPC Generate
Two changes that depend on each other — because the connection is
shared, the new scoring path pays for one connection setup (TCP/TLS +
HTTP/2 handshake) per process instead of one per RPC.
ApiClient gains `salience_channel: Arc<OnceCell<Channel>>`. First
call to `ApiClient::salience_client()` opens the channel via
`connect_channel()` and stores the Channel; subsequent calls clone
it (cheap — tonic multiplexes concurrent RPCs over the single
HTTP/2 connection). Every ApiClient clone shares the same OnceCell,
so all agents spawned from Mind's client — plus every ephemeral
scoring session — reuse one connection.
SessionHandle refactored to hold an `ApiClient` clone instead of
a bag of (base_url, api_key) strings. `open` / `append_image` /
`generate` go through `self.client.salience_client()` now. New
`prefill_only(tokens)` method encapsulates the "Generate with
max_tokens=0 to append text" pattern (previously a private free
function in api/mod.rs called `flush_pending`). Drop impl on
SessionHandle stays — still fires CloseSession on the shared
channel in a detached task.
`run_session_generate` switched from `(base_url, api_key, model)`
to `&ApiClient`; the agent-turn flow that uses it keeps the same
shape but `stream_session_mm` clones the ApiClient into the
spawned worker.
learn.rs migrated from the HTTP `/v1/score` endpoint to a gRPC
session-based score:
* `call_score` opens an ephemeral SessionHandle on the client,
converts (prompt_tokens, images) → Vec<WireChunk> via the new
`prompt_to_chunks` helper (splits on VISION_START/VISION_END),
walks chunks calling `prefill_only` + `append_image`, runs a
final Generate with `max_tokens=0` + `logprobs_ranges` over
the scored positions, and sums each Token event's
`sampled_logprob` per range to produce `ScoreResult`s.
* SessionHandle drops at end of scope → CloseSession auto-fires,
keeping the server's session map clean between calls.
* No more HTTP path, no more `http_client()` helper, no more
`ScoreResponse` / serde plumbing for /v1/score.
* `send_to_train` still uses HTTP (it talks to /v1/train which
isn't on the gRPC protocol); its ad-hoc HTTP client lives
inline now instead of reaching for the deleted `http_client()`.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
be6ba4e9a5
commit
4feebb7bc4
3 changed files with 268 additions and 213 deletions
|
|
@ -24,7 +24,12 @@ pub type SalienceClient = pb::salience_client::SalienceClient<Channel>;
|
|||
/// looks like `https://host:8443`. User-provided CA certs under
|
||||
/// `~/.consciousness/certs/` are trusted in addition to the system
|
||||
/// roots (for self-signed server certs).
|
||||
pub async fn connect(base_url: &str) -> Result<SalienceClient> {
|
||||
///
|
||||
/// Returns the raw `Channel` so callers (`ApiClient::salience_client`)
|
||||
/// can cache it and clone a `SalienceClient` per request without
|
||||
/// reopening the TCP/TLS connection. tonic multiplexes RPCs over the
|
||||
/// shared channel automatically.
|
||||
pub async fn connect_channel(base_url: &str) -> Result<Channel> {
|
||||
let mut endpoint = Endpoint::from_shared(base_url.to_string())
|
||||
.with_context(|| format!("invalid salience endpoint: {}", base_url))?
|
||||
.connect_timeout(std::time::Duration::from_secs(30))
|
||||
|
|
@ -41,11 +46,10 @@ pub async fn connect(base_url: &str) -> Result<SalienceClient> {
|
|||
.with_context(|| "configuring tonic TLS")?;
|
||||
}
|
||||
|
||||
let channel = endpoint
|
||||
endpoint
|
||||
.connect()
|
||||
.await
|
||||
.with_context(|| format!("failed to connect to salience server at {}", base_url))?;
|
||||
Ok(pb::salience_client::SalienceClient::new(channel))
|
||||
.with_context(|| format!("failed to connect to salience server at {}", base_url))
|
||||
}
|
||||
|
||||
/// Derive the gRPC base URL from the HTTP completions base URL.
|
||||
|
|
@ -76,107 +80,42 @@ pub fn with_auth<T>(req: &mut tonic::Request<T>, api_key: &str) {
|
|||
}
|
||||
}
|
||||
|
||||
/// Call the server's `OpenSession` RPC and return the response.
|
||||
pub async fn open_session(
|
||||
base_url: &str,
|
||||
api_key: &str,
|
||||
model: &str,
|
||||
) -> Result<pb::OpenSessionResponse> {
|
||||
let mut client = connect(base_url).await?;
|
||||
let mut req = tonic::Request::new(pb::OpenSessionRequest {
|
||||
model: model.to_string(),
|
||||
});
|
||||
with_auth(&mut req, api_key);
|
||||
let resp = client
|
||||
.open_session(req)
|
||||
.await
|
||||
.with_context(|| "OpenSession RPC failed")?;
|
||||
Ok(resp.into_inner())
|
||||
}
|
||||
|
||||
/// Call the server's `CloseSession` RPC. Idempotent on the server.
|
||||
pub async fn close_session(base_url: &str, api_key: &str, session_id: &str) -> Result<()> {
|
||||
let mut client = connect(base_url).await?;
|
||||
let mut req = tonic::Request::new(pb::CloseSessionRequest {
|
||||
session_id: session_id.to_string(),
|
||||
});
|
||||
with_auth(&mut req, api_key);
|
||||
client
|
||||
.close_session(req)
|
||||
.await
|
||||
.with_context(|| "CloseSession RPC failed")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Append an image to a session. Server decodes the image, computes N
|
||||
/// via vLLM's own multimodal pipeline, writes the full vision block
|
||||
/// (`<|vision_start|> + IMAGE_PAD×N + <|vision_end|>`) into
|
||||
/// session.tokens, and returns (N, new total length).
|
||||
///
|
||||
/// `offset` is the client's view of the session's current token count;
|
||||
/// the server rejects if it diverges from its own (unless
|
||||
/// `truncating=true`, in which case the server slices to `offset`
|
||||
/// first — but never through a vision block).
|
||||
pub async fn append_image(
|
||||
base_url: &str,
|
||||
api_key: &str,
|
||||
session_id: &str,
|
||||
data: Vec<u8>,
|
||||
mime: String,
|
||||
offset: u32,
|
||||
truncating: bool,
|
||||
) -> Result<pb::AppendImageResponse> {
|
||||
let mut client = connect(base_url).await?;
|
||||
let mut req = tonic::Request::new(pb::AppendImageRequest {
|
||||
session_id: session_id.to_string(),
|
||||
data,
|
||||
mime,
|
||||
offset,
|
||||
truncating,
|
||||
});
|
||||
with_auth(&mut req, api_key);
|
||||
let resp = client
|
||||
.append_image(req)
|
||||
.await
|
||||
.with_context(|| "AppendImage RPC failed")?;
|
||||
Ok(resp.into_inner())
|
||||
}
|
||||
|
||||
/// Handle to a server-side session. Carries the id + connection params
|
||||
/// so subsequent per-session RPCs (AppendImage, Generate, ForkSession)
|
||||
/// can be issued without the caller juggling base_url / api_key each
|
||||
/// time. `committed_len` tracks the server's current session.tokens
|
||||
/// length so the client can submit deltas with the right `offset`.
|
||||
/// Handle to a server-side session. Carries the id + an `ApiClient`
|
||||
/// clone (which holds the shared tonic Channel) so subsequent
|
||||
/// per-session RPCs go over the process-global connection.
|
||||
/// `committed_len` tracks the server's current session.tokens length
|
||||
/// so the client can submit deltas with the right `offset`.
|
||||
pub struct SessionHandle {
|
||||
pub session_id: String,
|
||||
pub max_model_len: u32,
|
||||
pub base_url: String,
|
||||
pub api_key: String,
|
||||
pub committed_len: u32,
|
||||
client: super::ApiClient,
|
||||
}
|
||||
|
||||
impl SessionHandle {
|
||||
pub async fn open(base_url: &str, api_key: &str, model: &str) -> Result<Self> {
|
||||
let grpc_url = derive_grpc_url(base_url);
|
||||
log::debug!(target: "grpc",
|
||||
"SessionHandle::open http_base={} -> grpc_url={}",
|
||||
base_url, grpc_url);
|
||||
let resp = open_session(&grpc_url, api_key, model).await?;
|
||||
pub async fn open(client: &super::ApiClient) -> Result<Self> {
|
||||
let mut c = client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(pb::OpenSessionRequest {
|
||||
model: client.model.clone(),
|
||||
});
|
||||
with_auth(&mut req, client.api_key());
|
||||
let resp = c
|
||||
.open_session(req)
|
||||
.await
|
||||
.with_context(|| "OpenSession RPC failed")?
|
||||
.into_inner();
|
||||
log::debug!(target: "grpc",
|
||||
"SessionHandle::open session_id={} max_model_len={}",
|
||||
resp.session_id, resp.max_model_len);
|
||||
Ok(Self {
|
||||
session_id: resp.session_id,
|
||||
max_model_len: resp.max_model_len,
|
||||
base_url: grpc_url,
|
||||
api_key: api_key.to_string(),
|
||||
committed_len: 0,
|
||||
client: client.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn close(self) -> Result<()> {
|
||||
close_session(&self.base_url, &self.api_key, &self.session_id).await
|
||||
}
|
||||
pub fn client(&self) -> &super::ApiClient { &self.client }
|
||||
|
||||
/// Append an image via the server-side vision block. Updates
|
||||
/// `committed_len` from the server's response on success.
|
||||
|
|
@ -186,37 +125,105 @@ impl SessionHandle {
|
|||
mime: String,
|
||||
truncating: bool,
|
||||
) -> Result<pb::AppendImageResponse> {
|
||||
let resp = append_image(
|
||||
&self.base_url,
|
||||
&self.api_key,
|
||||
&self.session_id,
|
||||
let mut c = self.client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(pb::AppendImageRequest {
|
||||
session_id: self.session_id.clone(),
|
||||
data,
|
||||
mime,
|
||||
self.committed_len,
|
||||
offset: self.committed_len,
|
||||
truncating,
|
||||
)
|
||||
.await?;
|
||||
});
|
||||
with_auth(&mut req, self.client.api_key());
|
||||
let resp = c
|
||||
.append_image(req)
|
||||
.await
|
||||
.with_context(|| "AppendImage RPC failed")?
|
||||
.into_inner();
|
||||
self.committed_len = resp.total_length;
|
||||
Ok(resp)
|
||||
}
|
||||
|
||||
/// Open a gRPC Generate stream with the given request. Caller
|
||||
/// iterates the returned stream of GenerateEvents; the handle's
|
||||
/// `committed_len` is advanced on Done based on the Done event's
|
||||
/// `total_tokens` field.
|
||||
/// `committed_len` should be advanced by the caller on Done based
|
||||
/// on the Done event's `total_tokens` field.
|
||||
pub async fn generate(
|
||||
&self,
|
||||
req: pb::GenerateRequest,
|
||||
) -> Result<tonic::Streaming<pb::GenerateEvent>> {
|
||||
let mut client = connect(&self.base_url).await?;
|
||||
let mut c = self.client.salience_client().await?;
|
||||
let mut req = tonic::Request::new(req);
|
||||
with_auth(&mut req, &self.api_key);
|
||||
let resp = client
|
||||
with_auth(&mut req, self.client.api_key());
|
||||
let resp = c
|
||||
.generate(req)
|
||||
.await
|
||||
.with_context(|| "Generate RPC failed")?;
|
||||
Ok(resp.into_inner())
|
||||
}
|
||||
|
||||
/// Run a prefill-only Generate (max_tokens=0) that appends the
|
||||
/// given tokens to the session. No decode, no Token events — the
|
||||
/// server just extends session.tokens and runs prefill to warm
|
||||
/// the KV cache. Used to interleave text runs between AppendImage
|
||||
/// calls, and by score paths that want prompt_logprobs without a
|
||||
/// decode step.
|
||||
pub async fn prefill_only(&mut self, tokens: Vec<u32>) -> Result<()> {
|
||||
use futures::StreamExt;
|
||||
let req = pb::GenerateRequest {
|
||||
session_id: self.session_id.clone(),
|
||||
append_tokens: tokens,
|
||||
offset: self.committed_len,
|
||||
truncating: false,
|
||||
max_tokens: 0,
|
||||
logprobs_ranges: Vec::new(),
|
||||
logprob_top_k: 0,
|
||||
readout_ranges: Vec::new(),
|
||||
temperature: 0.0,
|
||||
top_p: 0.0,
|
||||
top_k: 0,
|
||||
stop_token_ids: Vec::new(),
|
||||
priority: 0,
|
||||
};
|
||||
let mut stream = self.generate(req).await?;
|
||||
while let Some(event) = stream.next().await {
|
||||
let event = event.map_err(|s| anyhow::anyhow!("prefill Generate stream: {}", s))?;
|
||||
if let Some(pb::generate_event::Event::Done(d)) = event.event {
|
||||
self.committed_len = d.total_tokens;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop → fire CloseSession in a detached task so servers don't leak
|
||||
/// sessions until TTL eviction. Best-effort: if no tokio runtime is
|
||||
/// available we skip; the server's 30min TTL will reap it eventually.
|
||||
impl Drop for SessionHandle {
|
||||
fn drop(&mut self) {
|
||||
if self.session_id.is_empty() {
|
||||
return;
|
||||
}
|
||||
let session_id = std::mem::take(&mut self.session_id);
|
||||
let client = self.client.clone();
|
||||
let Ok(rt) = tokio::runtime::Handle::try_current() else {
|
||||
log::debug!(target: "grpc",
|
||||
"SessionHandle drop outside tokio runtime, session {} leaks to TTL",
|
||||
session_id);
|
||||
return;
|
||||
};
|
||||
rt.spawn(async move {
|
||||
let Ok(mut c) = client.salience_client().await else { return };
|
||||
let mut req = tonic::Request::new(pb::CloseSessionRequest {
|
||||
session_id: session_id.clone(),
|
||||
});
|
||||
with_auth(&mut req, client.api_key());
|
||||
if let Err(e) = c.close_session(req).await {
|
||||
log::debug!(target: "grpc",
|
||||
"CloseSession on drop failed for {}: {:#}",
|
||||
session_id, e);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue