Chunk scoring calls to avoid OOM on large contexts
Split conversation into ~50K token chunks (configurable via scoring_chunk_tokens in config) for prompt_logprobs calls. Each chunk ends at an assistant message boundary. Avoids the ~40GB logprobs tensor allocation that OOM'd on full contexts. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
19205b9bae
commit
29b3aeca57
2 changed files with 74 additions and 6 deletions
|
|
@ -55,6 +55,7 @@ pub struct ContextGroup {
|
|||
/// Serde default helper: fields annotated with this default to `true`.
fn default_true() -> bool {
    true
}
|
||||
/// Serde default for `api_context_window`: 128,000 tokens.
fn default_context_window() -> usize {
    128_000
}
|
||||
/// Serde default for `api_stream_timeout_secs`: 60 seconds.
fn default_stream_timeout() -> u64 {
    60
}
|
||||
/// Serde default for `scoring_chunk_tokens`: 50,000 tokens per
/// memory-scoring logprobs chunk.
fn default_scoring_chunk_tokens() -> usize {
    50_000
}
|
||||
fn default_identity_dir() -> PathBuf {
|
||||
dirs::home_dir().unwrap_or_default().join(".consciousness/identity")
|
||||
}
|
||||
|
|
@ -95,6 +96,9 @@ pub struct Config {
|
|||
/// Stream chunk timeout in seconds (no data = timeout).
|
||||
#[serde(default = "default_stream_timeout")]
|
||||
pub api_stream_timeout_secs: u64,
|
||||
/// Max tokens per chunk for memory scoring logprobs calls.
|
||||
#[serde(default = "default_scoring_chunk_tokens")]
|
||||
pub scoring_chunk_tokens: usize,
|
||||
pub api_reasoning: String,
|
||||
pub agent_types: Vec<String>,
|
||||
/// Surface agent timeout in seconds.
|
||||
|
|
@ -143,6 +147,7 @@ impl Default for Config {
|
|||
api_model: None,
|
||||
api_context_window: default_context_window(),
|
||||
api_stream_timeout_secs: default_stream_timeout(),
|
||||
scoring_chunk_tokens: default_scoring_chunk_tokens(),
|
||||
agent_model: None,
|
||||
api_reasoning: "high".to_string(),
|
||||
agent_types: vec![
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue