Chunk scoring calls to avoid OOM on large contexts

Split conversation into ~50K token chunks (configurable via
scoring_chunk_tokens in config) for prompt_logprobs calls.
Each chunk ends at an assistant message boundary. Avoids the
~40GB logprobs tensor allocation that OOM'd on full contexts.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-02 22:35:29 -04:00
parent 19205b9bae
commit 29b3aeca57
2 changed files with 74 additions and 6 deletions

View file

@@ -55,6 +55,7 @@ pub struct ContextGroup {
/// Serde default helper for boolean fields that are enabled unless overridden.
fn default_true() -> bool {
    true
}
/// Serde default helper: model context window size, in tokens.
fn default_context_window() -> usize {
    128_000
}
/// Serde default helper: stream chunk timeout, in seconds.
fn default_stream_timeout() -> u64 {
    60
}
/// Serde default helper: max tokens per chunk for memory-scoring logprobs calls.
fn default_scoring_chunk_tokens() -> usize {
    50_000
}
/// Serde default helper: identity storage directory under the user's home.
///
/// Falls back to a relative `.consciousness/identity` path when the home
/// directory cannot be determined (empty `PathBuf` from `unwrap_or_default`).
fn default_identity_dir() -> PathBuf {
    let home = dirs::home_dir().unwrap_or_default();
    home.join(".consciousness/identity")
}
@@ -95,6 +96,9 @@ pub struct Config {
/// Stream chunk timeout in seconds (no data = timeout).
#[serde(default = "default_stream_timeout")]
pub api_stream_timeout_secs: u64,
/// Max tokens per chunk for memory scoring logprobs calls.
#[serde(default = "default_scoring_chunk_tokens")]
pub scoring_chunk_tokens: usize,
pub api_reasoning: String,
pub agent_types: Vec<String>,
/// Surface agent timeout in seconds.
@@ -143,6 +147,7 @@ impl Default for Config {
api_model: None,
api_context_window: default_context_window(),
api_stream_timeout_secs: default_stream_timeout(),
scoring_chunk_tokens: default_scoring_chunk_tokens(),
agent_model: None,
api_reasoning: "high".to_string(),
agent_types: vec![