// api/ — LLM API client (OpenAI-compatible) // // Works with any provider that implements the OpenAI chat completions // API: OpenRouter, vLLM, llama.cpp, Fireworks, Together, etc. // // Diagnostics: anomalies always logged to debug panel. // Set POC_DEBUG=1 for verbose per-turn logging. pub mod http; use std::time::{Duration, Instant}; use anyhow::Result; use tokio::sync::mpsc; use serde::Deserialize; use http::{HttpClient, HttpResponse}; #[derive(Debug, Clone, Deserialize)] pub struct Usage { pub prompt_tokens: u32, pub completion_tokens: u32, pub total_tokens: u32, } /// Concept-readout manifest returned by the vLLM server's /// `/v1/readout/manifest` endpoint. Maps the nameless tensor indices /// in streaming `readout` fields back to concept names and layer /// indices. #[derive(Debug, Clone, Deserialize)] pub struct ReadoutManifest { pub concepts: Vec, pub layers: Vec, } /// Per-token per-layer concept projections streamed alongside each /// sampled token. Shape `[n_layers][n_concepts]`. Named values come /// from pairing with the manifest fetched at startup. pub type TokenReadout = Vec>; /// A JoinHandle that aborts its task when dropped. pub(crate) struct AbortOnDrop(tokio::task::JoinHandle<()>); impl Drop for AbortOnDrop { fn drop(&mut self) { self.0.abort(); } } /// Sampling parameters for model generation. #[derive(Clone, Copy)] pub(crate) struct SamplingParams { pub temperature: f32, pub top_p: f32, pub top_k: u32, } // ───────────────────────────────────────────────────────────── // Stream events — yielded by backends, consumed by the runner // ───────────────────────────────────────────────────────────── /// One token from the streaming completions API. pub enum StreamToken { /// A sampled token, optionally with its per-layer concept readout. /// `readout` is `None` when the server has readout disabled or /// returned no readout for this chunk. Token { id: u32, readout: Option }, Done { usage: Option }, Error(String), } #[derive(Clone)] pub struct ApiClient { client: HttpClient, api_key: String, pub model: String, base_url: String, } impl ApiClient { pub fn new(base_url: &str, api_key: &str, model: &str) -> Self { let client = HttpClient::builder() .connect_timeout(Duration::from_secs(30)) .timeout(Duration::from_secs(600)) .build(); Self { client, api_key: api_key.to_string(), model: model.to_string(), base_url: base_url.trim_end_matches('/').to_string(), } } pub(crate) fn stream_completion_mm( &self, prompt_tokens: &[u32], images: &[super::context::WireImage], sampling: SamplingParams, priority: Option, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); let client = self.client.clone(); let api_key = self.api_key.clone(); let model = self.model.clone(); let prompt_tokens = prompt_tokens.to_vec(); let images: Vec<(Vec, String)> = images.iter() .map(|i| (i.bytes.clone(), i.mime.clone())) .collect(); let base_url = self.base_url.clone(); let handle = tokio::spawn(async move { let result = stream_completions( &client, &base_url, &api_key, &model, &prompt_tokens, &images, &tx, sampling, priority, ).await; if let Err(e) = result { let _ = tx.send(StreamToken::Error(e.to_string())); } }); (rx, AbortOnDrop(handle)) } pub fn base_url(&self) -> &str { &self.base_url } pub fn api_key(&self) -> &str { &self.api_key } /// Fetch `/v1/readout/manifest` — returns `Ok(Some(..))` if /// readout is enabled on the server, `Ok(None)` on 404 (disabled), /// or an error on any other failure. /// /// Call once at startup and cache the result; the manifest doesn't /// change during a server run. pub async fn fetch_readout_manifest(&self) -> Result> { let url = format!("{}/readout/manifest", self.base_url); let auth = format!("Bearer {}", self.api_key); let response = self .client .get_with_headers(&url, &[("Authorization", &auth)]) .await .map_err(|e| anyhow::anyhow!("readout manifest fetch ({}): {}", url, e))?; let status = response.status(); if status.as_u16() == 404 { return Ok(None); } if !status.is_success() { let body = response.text().await.unwrap_or_default(); let n = body.floor_char_boundary(body.len().min(500)); anyhow::bail!("readout manifest HTTP {} ({}): {}", status, url, &body[..n]); } Ok(Some(response.json().await?)) } } async fn stream_completions( client: &HttpClient, base_url: &str, api_key: &str, model: &str, prompt_tokens: &[u32], images: &[(Vec, String)], tx: &mpsc::UnboundedSender, sampling: SamplingParams, priority: Option, ) -> anyhow::Result<()> { let mut request = serde_json::json!({ "model": model, "prompt": prompt_tokens, "max_tokens": 16384, "temperature": sampling.temperature, "top_p": sampling.top_p, "top_k": sampling.top_k, "stream": true, "return_token_ids": true, "skip_special_tokens": false, "stop_token_ids": [super::tokenizer::IM_END], }); if !images.is_empty() { use base64::Engine; let b64 = base64::engine::general_purpose::STANDARD; let uris: Vec = images.iter() .map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes))) .collect(); request["multi_modal_data"] = serde_json::json!({ "image": uris }); } if let Some(p) = priority { request["priority"] = serde_json::json!(p); } let url = format!("{}/completions", base_url); let debug_label = format!("{} prompt tokens, model={}", prompt_tokens.len(), model); let mut response = send_and_check( client, &url, &request, ("Authorization", &format!("Bearer {}", api_key)), &[], &debug_label, None, ).await?; let mut reader = SseReader::new(); let mut usage = None; while let Some(event) = reader.next_event(&mut response).await? { if let Some(err_msg) = event["error"]["message"].as_str() { anyhow::bail!("API error in stream: {}", err_msg); } if let Some(u) = event["usage"].as_object() { if let Ok(u) = serde_json::from_value::(serde_json::Value::Object(u.clone())) { usage = Some(u); } } let choices = match event["choices"].as_array() { Some(c) => c, None => continue, }; for choice in choices { // `readout`, if present, is a nested list // `[num_tokens][n_layers][n_concepts]`. Parse it once per // chunk and pair rows with token ids by index — the rows // are in the same order as `token_ids`. let readouts: Option> = choice["readout"] .as_array() .map(|outer| { outer.iter().filter_map(|per_token| { per_token.as_array().map(|layers| { layers.iter().filter_map(|per_layer| { per_layer.as_array().map(|vals| { vals.iter() .filter_map(|v| v.as_f64().map(|f| f as f32)) .collect::>() }) }).collect::>>() }) }).collect() }); if let Some(ids) = choice["token_ids"].as_array() { for (i, id_val) in ids.iter().enumerate() { if let Some(id) = id_val.as_u64() { let readout = readouts .as_ref() .and_then(|r| r.get(i).cloned()); let _ = tx.send(StreamToken::Token { id: id as u32, readout, }); } } } else if let Some(text) = choice["text"].as_str() { // Fallback: provider didn't return token_ids, encode locally. // No readout available in this path — the encoder may // produce a different token count than the server did. if !text.is_empty() { for id in super::tokenizer::encode(text) { let _ = tx.send(StreamToken::Token { id, readout: None }); } } } } } let _ = tx.send(StreamToken::Done { usage }); Ok(()) } /// Send an HTTP request and check for errors. pub(crate) async fn send_and_check( client: &HttpClient, url: &str, body: &impl serde::Serialize, auth_header: (&str, &str), extra_headers: &[(&str, &str)], debug_label: &str, request_json: Option<&str>, ) -> Result { let debug = std::env::var("POC_DEBUG").is_ok(); let start = Instant::now(); if debug { let payload_size = serde_json::to_string(body) .map(|s| s.len()) .unwrap_or(0); dbglog!( "request: {}K payload, {}", payload_size / 1024, debug_label, ); } let mut headers: Vec<(&str, &str)> = Vec::with_capacity(extra_headers.len() + 1); headers.push(auth_header); headers.extend_from_slice(extra_headers); let response = client .send_json("POST", url, &headers, body) .await .map_err(|e| { let msg = e.to_string(); let cause = if msg.contains("connect timeout") || msg.contains("TCP connect") { "connection refused" } else if msg.contains("request timeout") { "request timed out" } else { "request error" }; anyhow::anyhow!("{} ({}): {}", cause, url, msg) })?; let status = response.status(); let elapsed = start.elapsed(); if debug { for name in [ "x-ratelimit-remaining", "x-ratelimit-limit", "x-request-id", ] { if let Some(val) = response.header(name) { dbglog!("header {}: {}", name, val); } } } if !status.is_success() { let body = response.text().await.unwrap_or_default(); dbglog!( "HTTP {} after {:.1}s ({}): {}", status, elapsed.as_secs_f64(), url, &body[..body.floor_char_boundary(body.len().min(500))] ); if let Some(json) = request_json { let log_dir = dirs::home_dir() .unwrap_or_default() .join(".consciousness/logs/failed-requests"); let _ = std::fs::create_dir_all(&log_dir); let ts = chrono::Local::now().format("%Y%m%dT%H%M%S"); let path = log_dir.join(format!("{}.json", ts)); if std::fs::write(&path, json).is_ok() { dbglog!( "saved failed request to {} (HTTP {})", path.display(), status ); } } anyhow::bail!("HTTP {} ({}): {}", status, url, &body[..body.floor_char_boundary(body.len().min(1000))]); } if debug { dbglog!( "connected in {:.1}s (HTTP {})", elapsed.as_secs_f64(), status.as_u16() ); } Ok(response) } /// SSE stream reader. Handles the generic SSE plumbing shared by both /// backends: chunk reading with timeout, line buffering, `data:` prefix /// stripping, `[DONE]` detection, JSON parsing, and parse error diagnostics. /// Yields parsed events as serde_json::Value — each backend handles its /// own event types. pub(crate) struct SseReader { line_buf: String, chunk_timeout: Duration, pub stream_start: Instant, pub chunks_received: u64, pub sse_lines_parsed: u64, pub sse_parse_errors: u64, debug: bool, done: bool, /// Serialized request payload — saved to disk on errors for replay debugging. pub(crate) request_json: Option, } impl SseReader { pub(crate) fn new() -> Self { Self { line_buf: String::new(), chunk_timeout: Duration::from_secs(crate::config::get().api_stream_timeout_secs), stream_start: Instant::now(), chunks_received: 0, sse_lines_parsed: 0, sse_parse_errors: 0, debug: std::env::var("POC_DEBUG").is_ok(), done: false, request_json: None, } } /// Attach the serialized request payload for error diagnostics. /// Save the request payload to disk for replay debugging. fn save_failed_request(&self, reason: &str) { let Some(ref json) = self.request_json else { return }; let log_dir = dirs::home_dir() .unwrap_or_default() .join(".consciousness/logs/failed-requests"); let _ = std::fs::create_dir_all(&log_dir); let ts = chrono::Local::now().format("%Y%m%dT%H%M%S"); let path = log_dir.join(format!("{}.json", ts)); if std::fs::write(&path, json).is_ok() { dbglog!( "saved failed request to {} ({})", path.display(), reason ); } } /// Read the next SSE event from the response stream. /// Returns Ok(Some(value)) for each parsed data line, /// Ok(None) when the stream ends or [DONE] is received. pub(crate) async fn next_event( &mut self, response: &mut HttpResponse, ) -> Result> { loop { // Drain complete lines from the buffer before reading more chunks while let Some(newline_pos) = self.line_buf.find('\n') { let line = self.line_buf[..newline_pos].trim().to_string(); self.line_buf = self.line_buf[newline_pos + 1..].to_string(); if line == "data: [DONE]" { self.done = true; return Ok(None); } if line.is_empty() || line.starts_with("event: ") || !line.starts_with("data: ") { continue; } let json_str = &line[6..]; self.sse_lines_parsed += 1; match serde_json::from_str(json_str) { Ok(v) => return Ok(Some(v)), Err(e) => { self.sse_parse_errors += 1; if self.sse_parse_errors == 1 || self.debug { let preview = if json_str.len() > 200 { format!("{}...", &json_str[..200]) } else { json_str.to_string() }; dbglog!( "SSE parse error (#{}) {}: {}", self.sse_parse_errors, e, preview ); } continue; } } } if self.done { return Ok(None); } // Read more data from the response stream match tokio::time::timeout(self.chunk_timeout, response.chunk()).await { Ok(Ok(Some(chunk))) => { self.chunks_received += 1; self.line_buf.push_str(&String::from_utf8_lossy(&chunk)); } Ok(Ok(None)) => return Ok(None), Ok(Err(e)) => { let buf_preview = if self.line_buf.is_empty() { "(empty)".to_string() } else { let n = self.line_buf.len().min(500); format!("{}B: {}", self.line_buf.len(), &self.line_buf[..n]) }; let msg = format!( "stream error after {} chunks, {:.1}s, {} sse lines: {} | buf: {}", self.chunks_received, self.stream_start.elapsed().as_secs_f64(), self.sse_lines_parsed, e, buf_preview, ); dbglog!("{}", msg); self.save_failed_request(&msg); return Err(e.into()); } Err(_) => { let buf_preview = if self.line_buf.is_empty() { "(empty)".to_string() } else { let n = self.line_buf.len().min(500); format!("{}B: {}", self.line_buf.len(), &self.line_buf[..n]) }; let msg = format!( "stream timeout: {}s, {} chunks, {} sse lines, {:.1}s elapsed | buf: {}", self.chunk_timeout.as_secs(), self.chunks_received, self.sse_lines_parsed, self.stream_start.elapsed().as_secs_f64(), buf_preview, ); dbglog!("{}", msg); self.save_failed_request(&msg); anyhow::bail!( "stream timeout: no data for {}s ({} chunks received)", self.chunk_timeout.as_secs(), self.chunks_received ); } } } } }