consciousness/src/subconscious/learn.rs

// learn.rs — Memory importance scoring via /v1/score
//
// Three scoring modes, all built on the same call_score() primitive:
//
// score_memories()  — Full N×M matrix (memories × responses) for the
//                     debug screen. Expensive: N+1 API calls.
//
// score_memory()    — Single memory importance. Scores the window from
//                     where the memory was first surfaced through the
//                     next 50 assistant responses, with/without that
//                     memory. 2 API calls.
//                     score_memories_incremental() runs the same
//                     comparison for each memory due for re-scoring.
//
// score_finetune()  — Identifies training candidates. Scores recent
//                     messages with all memories stripped. Responses
//                     with high divergence depend on memories the model
//                     hasn't internalized. 2 API calls.

use crate::agent::api::ApiClient;
use crate::agent::api::types::*;
use crate::agent::context::{ConversationEntry, ContextState};

const SCORE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120);

// ── Message building ────────────────────────────────────────────

/// What to filter when building the message array for scoring.
///
/// Interpreted by `build_messages()`, which decides per entry whether to
/// leave it out of the request.
enum Filter<'a> {
    /// Keep every entry in the range.
    None,
    /// Drop the single entry at this index into `context.entries`.
    SkipIndex(usize),
    /// Drop every `ConversationEntry::Memory` entry whose key equals this string.
    SkipKey(&'a str),
    /// Drop every entry for which `is_memory()` returns true.
    SkipAllMemories,
}

/// Assemble the `messages` payload for one scoring request.
///
/// The result always starts with the system prompt, followed by the rendered
/// context message as a `user` message when it is non-empty. After that come
/// the entries of `context.entries` at the indices in `range`, in order,
/// minus whatever `filter` excludes. Each kept entry contributes its
/// `role` and `content` only.
///
/// Indexes `context.entries` directly, so `range` must lie within bounds.
fn build_messages(
    context: &ContextState,
    range: std::ops::Range<usize>,
    filter: Filter,
) -> Vec<serde_json::Value> {
    let mut out = Vec::new();
    out.push(serde_json::json!({"role": "system", "content": &context.system_prompt}));

    let rendered = context.render_context_message();
    if !rendered.is_empty() {
        out.push(serde_json::json!({"role": "user", "content": rendered}));
    }

    let kept = range.filter_map(|idx| {
        let entry = &context.entries[idx];
        let dropped = match &filter {
            Filter::SkipAllMemories => entry.is_memory(),
            Filter::SkipKey(key) => matches!(entry, ConversationEntry::Memory { key: k, .. } if k == key),
            Filter::SkipIndex(skipped) => idx == *skipped,
            Filter::None => false,
        };
        if dropped {
            return None;
        }
        let api_msg = entry.api_message();
        Some(serde_json::json!({
            "role": api_msg.role_str(),
            "content": api_msg.content_text(),
        }))
    });
    out.extend(kept);
    out
}

// ── Score API ───────────────────────────────────────────────────

/// One element of the `scores` array in a score response.
#[derive(serde::Deserialize)]
struct ScoreResult {
    /// Total log-probability reported for this element; `divergence()`
    /// compares it between the baseline run and the filtered run.
    total_logprob: f64,
}

/// Body of a successful score response, as deserialized by `call_score()`.
#[derive(serde::Deserialize)]
struct ScoreResponse {
    /// Per-position results; `call_score()` hands these back unchanged.
    scores: Vec<ScoreResult>,
}

/// Build the HTTP client used for scoring requests.
///
/// Applies `SCORE_TIMEOUT` to every request and keeps at most two idle
/// connections per host. If the builder fails, falls back to a
/// default-configured client rather than returning an error.
fn http_client() -> reqwest::Client {
    let builder = reqwest::Client::builder()
        .timeout(SCORE_TIMEOUT)
        .pool_max_idle_per_host(2);

    match builder.build() {
        Ok(configured) => configured,
        Err(_) => reqwest::Client::default(),
    }
}

/// POST `messages` to `{base_url}/score` and return the response's `scores`.
///
/// The request carries the client's model name, the messages, and
/// `"logprobs": 1`, authenticated with the client's API key as a bearer token.
///
/// # Errors
///
/// - the request could not be sent, or timed out (`SCORE_TIMEOUT`);
/// - the response body could not be read;
/// - the server answered with a non-success status — the error includes the
///   status plus the server's error message, or a snippet of the raw body
///   when the body is not a recognizable JSON error;
/// - a success response carries an `error` field;
/// - the body is not valid JSON or lacks the expected `scores` array.
async fn call_score(
    http: &reqwest::Client,
    client: &ApiClient,
    messages: &[serde_json::Value],
) -> anyhow::Result<Vec<ScoreResult>> {
    // Accepts both `{"error": "text"}` and `{"error": {"message": "text"}}`.
    fn error_message(body: &serde_json::Value) -> Option<&str> {
        let err = body.get("error")?;
        err.as_str()
            .or_else(|| err.get("message").and_then(|m| m.as_str()))
    }

    let response = http
        .post(format!("{}/score", client.base_url()))
        .header("Content-Type", "application/json")
        .header("Authorization", format!("Bearer {}", client.api_key()))
        .json(&serde_json::json!({
            "model": client.model,
            "messages": messages,
            "logprobs": 1,
        }))
        .send()
        .await
        .map_err(|e| if e.is_timeout() {
            anyhow::anyhow!("score request timed out after {}s", SCORE_TIMEOUT.as_secs())
        } else {
            anyhow::anyhow!("score request failed: {}", e)
        })?;

    let status = response.status();

    // Read the raw text before attempting to decode it: an error response may
    // not be JSON at all (e.g. an HTML page from a proxy), and decoding first
    // would replace the HTTP status with an unhelpful parse error.
    let text = response.text().await
        .map_err(|e| anyhow::anyhow!("failed to read score response (HTTP {}): {}", status, e))?;
    let parsed = serde_json::from_str::<serde_json::Value>(&text);

    if !status.is_success() {
        let from_json = match &parsed {
            Ok(body) => error_message(body).map(str::to_owned),
            Err(_) => None,
        };
        let msg = from_json
            .or_else(|| {
                // Bounded so a large error page does not flood the log.
                let snippet: String = text.trim().chars().take(200).collect();
                (!snippet.is_empty()).then_some(snippet)
            })
            .unwrap_or_else(|| "unknown error".to_owned());
        anyhow::bail!("score API HTTP {}: {}", status, msg);
    }

    let body = parsed
        .map_err(|e| anyhow::anyhow!("failed to parse score response: {}", e))?;

    // A 2xx status can still carry an application-level error.
    if let Some(err) = error_message(&body) {
        anyhow::bail!("score API error: {}", err);
    }

    let result: ScoreResponse = serde_json::from_value(body)
        .map_err(|e| anyhow::anyhow!("failed to parse score response: {}", e))?;
    Ok(result.scores)
}

/// Per-position drop in log-probability when something is removed.
///
/// For each position in `baseline`, returns `baseline - without` clamped to
/// be non-negative, so only positions that scored *worse* without the removed
/// material contribute. The output has the same length as `baseline`;
/// positions missing from `without` yield `0.0`.
fn divergence(baseline: &[ScoreResult], without: &[ScoreResult]) -> Vec<f64> {
    let mut deltas = Vec::with_capacity(baseline.len());
    for (pos, with_it) in baseline.iter().enumerate() {
        // A missing counterpart is treated as "no change".
        let stripped_lp = without
            .get(pos)
            .map_or(with_it.total_logprob, |s| s.total_logprob);
        let delta = with_it.total_logprob - stripped_lp;
        deltas.push(delta.max(0.0));
    }
    deltas
}

/// Score the same entry range twice — unfiltered, then with `filter`
/// applied — and compare the two runs.
///
/// Returns the per-position divergence (see `divergence()`) together with
/// the unfiltered baseline scores. Makes two sequential scoring requests;
/// an error from either one is returned as-is.
async fn score_divergence(
    http: &reqwest::Client,
    client: &ApiClient,
    context: &ContextState,
    range: std::ops::Range<usize>,
    filter: Filter<'_>,
) -> anyhow::Result<(Vec<f64>, Vec<ScoreResult>)> {
    let full_msgs = build_messages(context, range.clone(), Filter::None);
    let full_scores = call_score(http, client, &full_msgs).await?;

    let stripped_msgs = build_messages(context, range, filter);
    let stripped_scores = call_score(http, client, &stripped_msgs).await?;

    let per_position = divergence(&full_scores, &stripped_scores);
    Ok((per_position, full_scores))
}

// ── Full matrix scoring (debug screen) ──────────────────────────

/// Result of scoring one conversation's memory usage.
///
/// Produced by `score_memories()`. All fields are empty when the
/// conversation has no memories or no assistant responses.
pub struct MemoryScore {
    /// `(memory key, row sum of matrix)` — one pair per entry of `memory_keys`.
    pub memory_weights: Vec<(String, f64)>,
    /// Column sums of `matrix`: summed divergence per response position.
    pub response_scores: Vec<f64>,
    /// Full matrix: divergence[memory_idx][response_idx]
    pub matrix: Vec<Vec<f64>>,
    /// Memory keys, in the same order as the rows of `matrix`.
    pub memory_keys: Vec<String>,
    /// Conversation entry index of each assistant response; the position in
    /// this list is the `response_idx` used to index a matrix row.
    pub response_entry_indices: Vec<usize>,
}

impl MemoryScore {
    /// List the memories that mattered for the response at `entry_idx`.
    ///
    /// `entry_idx` is a conversation entry index; it is mapped to a matrix
    /// column through `response_entry_indices`. Returns `(key, score)` pairs
    /// for every memory whose score in that column exceeds `0.01`, highest
    /// score first. Returns an empty list when `entry_idx` is not a scored
    /// response.
    pub fn important_memories_for_entry(&self, entry_idx: usize) -> Vec<(&str, f64)> {
        let column = match self.response_entry_indices.iter().position(|&i| i == entry_idx) {
            Some(col) => col,
            None => return Vec::new(),
        };

        let mut ranked: Vec<(&str, f64)> = Vec::new();
        for (key, row) in self.memory_keys.iter().zip(&self.matrix) {
            // Rows shorter than the column index simply have no score here.
            if let Some(&score) = row.get(column) {
                if score > 0.01 {
                    ranked.push((key.as_str(), score));
                }
            }
        }

        // Every kept score is > 0.01, so there are no NaNs or signed zeros
        // for the total order to treat differently.
        ranked.sort_by(|lhs, rhs| rhs.1.total_cmp(&lhs.1));
        ranked
    }
}

/// Score how important each memory is to the conversation (full matrix).
///
/// Scores the whole conversation once as a baseline, then once more per
/// distinct memory key with every entry for that key removed. Row `m` of the
/// resulting matrix holds the per-response divergence caused by dropping
/// memory `m`.
///
/// Each distinct key is scored exactly once, in order of first appearance,
/// even if the same memory occurs several times in the conversation.
///
/// Returns an empty `MemoryScore` when there are no memories or no assistant
/// responses. A failed per-memory request is logged and recorded as an
/// all-zero row instead of aborting the run.
///
/// # Errors
///
/// Fails only if the baseline scoring request fails.
pub async fn score_memories(
    context: &ContextState,
    client: &ApiClient,
) -> anyhow::Result<MemoryScore> {
    // Vec::dedup() only collapses *adjacent* duplicates; a key that recurs
    // later in the conversation would otherwise get a second (identical) row
    // and a wasted API call, and be double-counted in the column sums.
    let memory_keys: Vec<String> = {
        let mut seen = std::collections::HashSet::new();
        let mut keys = Vec::new();
        for entry in context.entries.iter() {
            if let ConversationEntry::Memory { key, .. } = entry {
                if seen.insert(key.as_str()) {
                    keys.push(key.clone());
                }
            }
        }
        keys
    };

    let response_indices: Vec<usize> = context.entries.iter().enumerate()
        .filter(|(_, e)| e.message().role == Role::Assistant)
        .map(|(i, _)| i)
        .collect();

    if memory_keys.is_empty() || response_indices.is_empty() {
        return Ok(MemoryScore {
            memory_weights: Vec::new(), response_scores: Vec::new(),
            matrix: Vec::new(), memory_keys: Vec::new(),
            response_entry_indices: Vec::new(),
        });
    }

    let http = http_client();
    let range = 0..context.entries.len();

    let baseline = call_score(&http, client, &build_messages(context, range.clone(), Filter::None)).await?;

    let total = memory_keys.len();
    let mut matrix: Vec<Vec<f64>> = Vec::with_capacity(total);

    for (mem_idx, key) in memory_keys.iter().enumerate() {
        dbglog!(
            "scoring {}/{}: {}...", mem_idx + 1, total, key,
        );
        let msgs = build_messages(context, range.clone(), Filter::SkipKey(key));
        match call_score(&http, client, &msgs).await {
            Ok(without) => matrix.push(divergence(&baseline, &without)),
            Err(e) => {
                dbglog!(
                    "[training] {} FAILED: {:#}", key, e,
                );
                // Keep the row so matrix rows stay aligned with memory_keys.
                matrix.push(vec![0.0; baseline.len()]);
            }
        }
    }

    // Row sums: overall weight of each memory.
    let memory_weights: Vec<(String, f64)> = memory_keys.iter()
        .zip(matrix.iter())
        .map(|(key, row)| (key.clone(), row.iter().sum()))
        .collect();

    // Column sums: how much each response depends on memories overall.
    // zip() stops at the shorter side, so over-long rows are ignored.
    let mut response_scores = vec![0.0; response_indices.len()];
    for row in &matrix {
        for (acc, &v) in response_scores.iter_mut().zip(row) {
            *acc += v;
        }
    }

    Ok(MemoryScore {
        memory_weights, response_scores, matrix, memory_keys,
        response_entry_indices: response_indices,
    })
}

/// Locate the end of a window holding `n` assistant responses.
///
/// Scans `entries` from index `start` and returns `(i + 1, true)`, where `i`
/// is the index of the `n`th assistant entry found — an exclusive end bound
/// for a range starting at `start`. If fewer than `n` assistant entries
/// exist, returns `(entries.len(), false)`. `n == 0` behaves like `n == 1`.
fn nth_response_end(entries: &[ConversationEntry], start: usize, n: usize) -> (usize, bool) {
    entries
        .iter()
        .enumerate()
        .skip(start)
        .filter(|(_, entry)| entry.message().role == Role::Assistant)
        // nth() is zero-based; saturating keeps n == 0 pointing at the first hit.
        .nth(n.saturating_sub(1))
        .map_or((entries.len(), false), |(idx, _)| (idx + 1, true))
}

// ── Single memory scoring ───────────────────────────────────────

/// Score how important a single memory is to the conversation.
///
/// The scored window starts at the first entry carrying `key` and extends
/// through the 50th assistant response after it (or to the end of the
/// conversation if there are fewer). That window is scored with and without
/// the memory, and the per-response divergences are summed.
///
/// Returns `0.0` without making any request when the memory is not in the
/// conversation or the window contains no assistant response.
///
/// # Errors
///
/// Propagates a failure of either scoring request.
pub async fn score_memory(
    context: &ContextState,
    key: &str,
    client: &ApiClient,
) -> anyhow::Result<f64> {
    const RESPONSE_WINDOW: usize = 50;

    let Some(first_pos) = context.entries.iter().position(|e| {
        matches!(e, ConversationEntry::Memory { key: k, .. } if k == key)
    }) else {
        return Ok(0.0);
    };

    let (window_end, _) = nth_response_end(&context.entries, first_pos, RESPONSE_WINDOW);
    let window = first_pos..window_end;

    let has_response = context.entries[window.clone()]
        .iter()
        .any(|e| e.message().role == Role::Assistant);
    if !has_response {
        return Ok(0.0);
    }

    let http = http_client();
    let (divs, _) = score_divergence(&http, client, context, window, Filter::SkipKey(key)).await?;

    let total: f64 = divs.iter().sum();
    Ok(total)
}

// ── Background memory scoring ───────────────────────────────────

/// Score the memories in the conversation that are due for re-scoring.
///
/// For each distinct memory key (at its first position), looks up the
/// node's `last_scored` timestamp in the store — treating a missing node,
/// or a store that failed to load, as never scored — and keeps the keys
/// whose last score is at least `max_age_secs` old. Those are processed
/// oldest-first.
///
/// A memory is scored over the window from its first position through the
/// next `response_window` assistant responses. Memories without a full
/// window are skipped unless they sit in the first quarter of the
/// conversation; windows with no assistant response are skipped too.
///
/// Returns `(key, score)` pairs where the score is the largest per-response
/// divergence in the window. A failed scoring request is logged and that
/// memory is left out of the result. This function only reads the store;
/// nothing is written back here.
pub async fn score_memories_incremental(
    context: &ContextState,
    max_age_secs: i64,
    response_window: usize,
    client: &ApiClient,
    agent: &std::sync::Arc<tokio::sync::Mutex<crate::agent::Agent>>,
) -> anyhow::Result<Vec<(String, f64)>> {
    let now = chrono::Utc::now().timestamp();
    let store = crate::hippocampus::store::Store::load().unwrap_or_default();

    // (first position, key, last_scored) for every distinct key that is due.
    let mut already_seen = std::collections::HashSet::new();
    let mut due: Vec<(usize, String, i64)> = context.entries.iter().enumerate()
        .filter_map(|(pos, entry)| {
            let ConversationEntry::Memory { key, .. } = entry else {
                return None;
            };
            // Only the first occurrence of a key is considered.
            if !already_seen.insert(key.clone()) {
                return None;
            }
            let last_scored = store.nodes.get(key.as_str())
                .map_or(0, |node| node.last_scored);
            (now - last_scored >= max_age_secs).then(|| (pos, key.clone(), last_scored))
        })
        .collect();

    // Score oldest-first
    due.sort_by_key(|&(_, _, last)| last);

    let http = http_client();
    let first_quarter = context.entries.len() / 4;
    let mut scored = Vec::new();

    for (pos, key, _) in &due {
        let (end, full_window) = nth_response_end(&context.entries, *pos, response_window);

        // Early memories are always scored; later ones need a full window.
        let in_first_quarter = *pos < first_quarter;
        if !(full_window || in_first_quarter) {
            continue;
        }

        let window = *pos..end;
        let has_response = context.entries[window.clone()]
            .iter()
            .any(|e| e.message().role == Role::Assistant);
        if !has_response {
            continue;
        }

        // Held until the end of this iteration.
        let _scoring = crate::agent::start_activity(agent, format!("scoring: {}", key)).await;
        match score_divergence(&http, client, context, window, Filter::SkipKey(key)).await {
            Err(e) => {
                dbglog!("[scoring] {} FAILED: {:#}", key, e);
            }
            Ok((divs, _)) => {
                let n_responses = divs.len();
                let max_div = divs.iter().copied().fold(0.0_f64, f64::max);
                dbglog!("[scoring] {} max:{:.3} ({} responses)", key, max_div, n_responses);
                scored.push((key.clone(), max_div));
            }
        }
    }

    Ok(scored)
}

// ── Fine-tuning scoring ─────────────────────────────────────────

/// Rank recent assistant responses as fine-tuning candidates.
///
/// Takes the last `count` conversation entries and scores them twice: as-is,
/// and with every memory entry removed. A response whose score drops sharply
/// without memories depends on material the model has not internalized.
///
/// Returns `(entry_index, divergence)` for each assistant response in that
/// tail, sorted by divergence descending. Returns an empty list, without any
/// request, when the tail has no assistant response.
///
/// # Errors
///
/// Propagates a failure of either scoring request.
pub async fn score_finetune(
    context: &ContextState,
    count: usize,
    client: &ApiClient,
) -> anyhow::Result<Vec<(usize, f64)>> {
    let total = context.entries.len();
    let tail = total.saturating_sub(count)..total;

    let response_positions: Vec<usize> = tail.clone()
        .filter(|&idx| context.entries[idx].message().role == Role::Assistant)
        .collect();
    if response_positions.is_empty() {
        return Ok(Vec::new());
    }

    let http = http_client();
    let (divs, _) = score_divergence(&http, client, context, tail, Filter::SkipAllMemories).await?;

    // Pair the i-th response with the i-th divergence; responses past the
    // end of `divs` get 0.0.
    let padded = divs.iter().copied().chain(std::iter::repeat(0.0));
    let mut ranked: Vec<(usize, f64)> = response_positions.iter().copied().zip(padded).collect();

    ranked.sort_by(|lhs, rhs| {
        rhs.1.partial_cmp(&lhs.1).unwrap_or(std::cmp::Ordering::Equal)
    });
    Ok(ranked)
}
-												switch memory scoring to /v1/score endpoint

Replace prompt_logprobs-based scoring with the new vLLM /v1/score
endpoint. Much simpler: one API call per memory drop, returns
per-message total_logprob directly. No chunking needed, no OOM risk
— the endpoint only computes logits for scored tokens.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 00:31:57 -04:00
+								// training.rs — Memory importance scoring via /v1/score
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								//
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								// Three scoring modes, all built on the same call_score() primitive:
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								//
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								// score_memories()  — Full N×M matrix (memories × responses) for the
 								//                     debug screen. Expensive: N+1 API calls.
 								//
 								// memory_score()    — Single memory importance. Scores the 50 messages
 								//                     after it was surfaced, with/without that memory.
 								//                     2 API calls.
 								//
 								// finetune_score()  — Identifies training candidates. Scores recent
 								//                     messages with all memories stripped. Responses
 								//                     with high divergence depend on memories the model
 								//                     hasn't internalized. 2 API calls.
-												Move API code from user/ to agent/

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

											
										
										
											2026-04-04 00:29:11 -04:00
-												more reorg

											
										
										
											2026-04-05 01:48:11 -04:00
+								use crate::agent::api::ApiClient;
-												Move API code from user/ to agent/

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

											
										
										
											2026-04-04 00:29:11 -04:00
+								use crate::agent::api::types::*;
 								use crate::agent::context::{ConversationEntry, ContextState};
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												scoring: add timeouts, progress feedback, error resilience

- 120s timeout on individual /v1/score HTTP calls
- Activity bar shows "scoring 3/24: memory-key..."
- Info messages at start and completion
- Per-memory timing and importance in debug pane
- Failed individual memories log error but don't abort (zero row)
- Removed duplicate completion message (info from score_memories)

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 01:07:47 -04:00
+								const SCORE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(120);
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								// ── Message building ────────────────────────────────────────────
 								/// What to filter when building the message array for scoring.
 								enum Filter<'a> {
 								    None,
 								    SkipIndex(usize),
 								    SkipKey(&'a str),
 								    SkipAllMemories,
 								}
 								/// Build the messages array for a scoring call.
 								///
 								/// Always includes system prompt + context message as prefix, then
 								/// entries from `range` filtered by `filter`.
 								fn build_messages(
 								    context: &ContextState,
 								    range: std::ops::Range<usize>,
 								    filter: Filter,
 								) -> Vec<serde_json::Value> {
 								    let mut msgs = vec![
 								        serde_json::json!({"role": "system", "content": &context.system_prompt}),
 								    ];
 								    let ctx = context.render_context_message();
 								    if !ctx.is_empty() {
 								        msgs.push(serde_json::json!({"role": "user", "content": ctx}));
 								    }
 								    for i in range {
 								        let entry = &context.entries[i];
 								        let skip = match &filter {
 								            Filter::None => false,
 								            Filter::SkipIndex(idx) => i == *idx,
 								            Filter::SkipKey(key) => matches!(entry, ConversationEntry::Memory { key: k, .. } if k == key),
 								            Filter::SkipAllMemories => entry.is_memory(),
 								        };
 								        if skip { continue; }
 								        let m = entry.api_message();
 								        msgs.push(serde_json::json!({
 								            "role": m.role_str(),
 								            "content": m.content_text(),
 								        }));
 								    }
 								    msgs
 								}
 								// ── Score API ───────────────────────────────────────────────────
 								#[derive(serde::Deserialize)]
 								struct ScoreResult {
 								    total_logprob: f64,
 								}
 								#[derive(serde::Deserialize)]
 								struct ScoreResponse {
 								    scores: Vec<ScoreResult>,
 								}
 								fn http_client() -> reqwest::Client {
 								    reqwest::Client::builder()
 								        .timeout(SCORE_TIMEOUT)
 								        .pool_max_idle_per_host(2)
 								        .build()
 								        .unwrap_or_default()
 								}
 								async fn call_score(
 								    http: &reqwest::Client,
 								    client: &ApiClient,
 								    messages: &[serde_json::Value],
 								) -> anyhow::Result<Vec<ScoreResult>> {
 								    let response = http
 								        .post(format!("{}/score", client.base_url()))
 								        .header("Content-Type", "application/json")
 								        .header("Authorization", format!("Bearer {}", client.api_key()))
 								        .json(&serde_json::json!({
 								            "model": client.model,
 								            "messages": messages,
 								            "logprobs": 1,
 								        }))
 								        .send()
 								        .await
 								        .map_err(|e| if e.is_timeout() {
 								            anyhow::anyhow!("score request timed out after {}s", SCORE_TIMEOUT.as_secs())
 								        } else {
 								            anyhow::anyhow!("score request failed: {}", e)
 								        })?;
 								    let status = response.status();
 								    let body: serde_json::Value = response.json().await?;
 								    if !status.is_success() {
 								        let msg = body.get("error").and_then(|e| e.as_str()).unwrap_or("unknown error");
 								        anyhow::bail!("score API HTTP {}: {}", status, msg);
 								    }
 								    if let Some(err) = body.get("error").and_then(|e| e.as_str()) {
 								        anyhow::bail!("score API error: {}", err);
 								    }
 								    let result: ScoreResponse = serde_json::from_value(body)
 								        .map_err(|e| anyhow::anyhow!("failed to parse score response: {}", e))?;
 								    Ok(result.scores)
 								}
 								/// Compute per-position logprob divergence: how much worse the model
 								/// scores each response without something vs with it.
 								fn divergence(baseline: &[ScoreResult], without: &[ScoreResult]) -> Vec<f64> {
 								    baseline.iter().enumerate()
 								        .map(|(i, base)| {
 								            let without_lp = without.get(i).map(|s| s.total_logprob).unwrap_or(base.total_logprob);
 								            (base.total_logprob - without_lp).max(0.0)
 								        })
 								        .collect()
 								}
 								/// Score two message sets and return total divergence.
 								async fn score_divergence(
 								    http: &reqwest::Client,
 								    client: &ApiClient,
 								    context: &ContextState,
 								    range: std::ops::Range<usize>,
 								    filter: Filter<'_>,
 								) -> anyhow::Result<(Vec<f64>, Vec<ScoreResult>)> {
 								    let baseline = call_score(http, client, &build_messages(context, range.clone(), Filter::None)).await?;
 								    let without = call_score(http, client, &build_messages(context, range, filter)).await?;
 								    let divs = divergence(&baseline, &without);
 								    Ok((divs, baseline))
 								}
 								// ── Full matrix scoring (debug screen) ──────────────────────────
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								/// Result of scoring one conversation's memory usage.
 								pub struct MemoryScore {
 								    pub memory_weights: Vec<(String, f64)>,
 								    pub response_scores: Vec<f64>,
 								    /// Full matrix: divergence[memory_idx][response_idx]
 								    pub matrix: Vec<Vec<f64>>,
 								    pub memory_keys: Vec<String>,
-												show scoring progress and per-response memory attribution

Status bar shows "scoring 3/7..." during scoring. Debug pane logs
per-memory importance and top-5 response breakdowns. F10 context
screen shows which memories were important for each assistant
response as drilldown children (← memory_key (score)).

Added important_memories_for_entry() to look up the matrix by
conversation entry index.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:27:43 -04:00
+								    pub response_entry_indices: Vec<usize>,
 								}
 								impl MemoryScore {
 								    pub fn important_memories_for_entry(&self, entry_idx: usize) -> Vec<(&str, f64)> {
 								        let Some(resp_idx) = self.response_entry_indices.iter().position(|&i| i == entry_idx)
 								        else { return Vec::new() };
 								        let mut result: Vec<(&str, f64)> = self.memory_keys.iter()
 								            .zip(self.matrix.iter())
 								            .filter_map(|(key, row)| {
 								                let score = row.get(resp_idx).copied().unwrap_or(0.0);
 								                if score > 0.01 { Some((key.as_str(), score)) } else { None }
 								            })
 								            .collect();
 								        result.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
 								        result
 								    }
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								}
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								/// Score how important each memory is to the conversation (full matrix).
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								pub async fn score_memories(
 								    context: &ContextState,
 								    client: &ApiClient,
 								) -> anyhow::Result<MemoryScore> {
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let mut memory_keys: Vec<String> = context.entries.iter()
 								        .filter_map(|e| match e {
 								            ConversationEntry::Memory { key, .. } => Some(key.clone()),
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								            _ => None,
 								        })
 								        .collect();
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    memory_keys.dedup();
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
 								    let response_indices: Vec<usize> = context.entries.iter().enumerate()
 								        .filter(|(_, e)| e.message().role == Role::Assistant)
 								        .map(|(i, _)| i)
 								        .collect();
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    if memory_keys.is_empty() || response_indices.is_empty() {
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								        return Ok(MemoryScore {
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								            memory_weights: Vec::new(), response_scores: Vec::new(),
 								            matrix: Vec::new(), memory_keys: Vec::new(),
-												show scoring progress and per-response memory attribution

Status bar shows "scoring 3/7..." during scoring. Debug pane logs
per-memory importance and top-5 response breakdowns. F10 context
screen shows which memories were important for each assistant
response as drilldown children (← memory_key (score)).

Added important_memories_for_entry() to look up the matrix by
conversation entry index.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:27:43 -04:00
+								            response_entry_indices: Vec::new(),
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								        });
 								    }
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let http = http_client();
 								    let range = 0..context.entries.len();
-												switch memory scoring to /v1/score endpoint

Replace prompt_logprobs-based scoring with the new vLLM /v1/score
endpoint. Much simpler: one API call per memory drop, returns
per-message total_logprob directly. No chunking needed, no OOM risk
— the endpoint only computes logits for scored tokens.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 00:31:57 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let baseline = call_score(&http, client, &build_messages(context, range.clone(), Filter::None)).await?;
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let total = memory_keys.len();
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								    let mut matrix: Vec<Vec<f64>> = Vec::new();
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    for (mem_idx, key) in memory_keys.iter().enumerate() {
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								        dbglog!(
-												scoring: add timeouts, progress feedback, error resilience

- 120s timeout on individual /v1/score HTTP calls
- Activity bar shows "scoring 3/24: memory-key..."
- Info messages at start and completion
- Per-memory timing and importance in debug pane
- Failed individual memories log error but don't abort (zero row)
- Removed duplicate completion message (info from score_memories)

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 01:07:47 -04:00
+								            "scoring {}/{}: {}...", mem_idx + 1, total, key,
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								        );
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								        let msgs = build_messages(context, range.clone(), Filter::SkipKey(key));
 								        match call_score(&http, client, &msgs).await {
 								            Ok(without) => matrix.push(divergence(&baseline, &without)),
-												scoring: add timeouts, progress feedback, error resilience

- 120s timeout on individual /v1/score HTTP calls
- Activity bar shows "scoring 3/24: memory-key..."
- Info messages at start and completion
- Per-memory timing and importance in debug pane
- Failed individual memories log error but don't abort (zero row)
- Removed duplicate completion message (info from score_memories)

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 01:07:47 -04:00
+								            Err(e) => {
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                dbglog!(
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								                    "[training] {} FAILED: {:#}", key, e,
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                );
-												scoring: add timeouts, progress feedback, error resilience

- 120s timeout on individual /v1/score HTTP calls
- Activity bar shows "scoring 3/24: memory-key..."
- Info messages at start and completion
- Per-memory timing and importance in debug pane
- Failed individual memories log error but don't abort (zero row)
- Removed duplicate completion message (info from score_memories)

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 01:07:47 -04:00
+								                matrix.push(vec![0.0; baseline.len()]);
 								            }
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								        }
 								    }
-												switch memory scoring to /v1/score endpoint

Replace prompt_logprobs-based scoring with the new vLLM /v1/score
endpoint. Much simpler: one API call per memory drop, returns
per-message total_logprob directly. No chunking needed, no OOM risk
— the endpoint only computes logits for scored tokens.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 00:31:57 -04:00
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								    let memory_weights: Vec<(String, f64)> = memory_keys.iter()
 								        .zip(matrix.iter())
 								        .map(|(key, row)| (key.clone(), row.iter().sum()))
 								        .collect();
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let mut response_scores = vec![0.0; response_indices.len()];
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								    for row in &matrix {
 								        for (j, &v) in row.iter().enumerate() {
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								            if j < response_scores.len() { response_scores[j] += v; }
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								        }
 								    }
 								    Ok(MemoryScore {
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								        memory_weights, response_scores, matrix, memory_keys,
-												show scoring progress and per-response memory attribution

Status bar shows "scoring 3/7..." during scoring. Debug pane logs
per-memory importance and top-5 response breakdowns. F10 context
screen shows which memories were important for each assistant
response as drilldown children (← memory_key (score)).

Added important_memories_for_entry() to look up the matrix by
conversation entry index.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:27:43 -04:00
+								        response_entry_indices: response_indices,
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								    })
 								}
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								/// Locate the end of a window that starts at `start` and covers `n` assistant responses.
 								///
 								/// The scan includes the entry at `start` itself. Returns `(i + 1, true)` where `i`
 								/// is the index of the nth assistant entry found, or `(entries.len(), false)` when
 								/// fewer than `n` assistant entries exist from `start` onward.
 								fn nth_response_end(entries: &[ConversationEntry], start: usize, n: usize) -> (usize, bool) {
 								    entries
 								        .iter()
 								        .enumerate()
 								        .skip(start)
 								        .filter(|(_, entry)| entry.message().role == Role::Assistant)
 								        // `n == 0` behaves like `n == 1`: the first assistant entry ends the window.
 								        .nth(n.saturating_sub(1))
 								        .map_or((entries.len(), false), |(idx, _)| (idx + 1, true))
 								}
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								// ── Single memory scoring ───────────────────────────────────────
-												chunk scoring calls to avoid OOM on large contexts

Split conversation into ~50K token chunks (configurable via
scoring_chunk_tokens in config) for prompt_logprobs calls.
Each chunk ends at an assistant message boundary. Avoids the
~40GB logprobs tensor allocation that OOM'd on full contexts.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:35:29 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								/// Score how important a single memory is to the conversation.
 								///
 								/// Scores the window from where the memory was first surfaced through
 								/// the 50th assistant response after it — the span where it could have
 								/// influenced responses. Returns the sum of divergence, or 0.0 if the
 								/// memory isn't in the conversation or the window has no assistant response.
 								pub async fn score_memory(
 								    context: &ContextState,
 								    key: &str,
 								    client: &ApiClient,
 								) -> anyhow::Result<f64> {
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    const RESPONSE_WINDOW: usize = 50;
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
 								    let first_pos = match context.entries.iter().position(|e| {
 								        matches!(e, ConversationEntry::Memory { key: k, .. } if k == key)
 								    }) {
 								        Some(p) => p,
 								        None => return Ok(0.0),
 								    };
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    let (end, _) = nth_response_end(&context.entries, first_pos, RESPONSE_WINDOW);
 								    let range = first_pos..end;
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    if !context.entries[range.clone()].iter().any(|e| e.message().role == Role::Assistant) {
 								        return Ok(0.0);
-												chunk scoring calls to avoid OOM on large contexts

Split conversation into ~50K token chunks (configurable via
scoring_chunk_tokens in config) for prompt_logprobs calls.
Each chunk ends at an assistant message boundary. Avoids the
~40GB logprobs tensor allocation that OOM'd on full contexts.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:35:29 -04:00
+								    }
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let http = http_client();
 								    let (divs, _) = score_divergence(&http, client, context, range, Filter::SkipKey(key)).await?;
-												chunk scoring calls to avoid OOM on large contexts

Split conversation into ~50K token chunks (configurable via
scoring_chunk_tokens in config) for prompt_logprobs calls.
Each chunk ends at an assistant message boundary. Avoids the
~40GB logprobs tensor allocation that OOM'd on full contexts.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:35:29 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    Ok(divs.iter().sum())
 								}
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								// ── Background memory scoring ───────────────────────────────────
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								/// Score memories in the conversation that are due for re-scoring.
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								///
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								/// Checks the graph for each memory's last_scored timestamp. Scores
 								/// nodes that haven't been scored within `max_age_secs`, oldest first.
 								/// Updates the graph weight (EWMA) and last_scored after each.
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								///
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								/// Returns the number of nodes scored and their (key, score) pairs.
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								pub async fn score_memories_incremental(
 								    context: &ContextState,
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    max_age_secs: i64,
 								    response_window: usize,
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								    client: &ApiClient,
-												Kill StatusUpdate, Activity, DmnAnnotation, ContextInfoUpdate, AgentUpdate

Status bar reads directly from Agent and MindState on each render tick.
Activity is now a field on Agent — set by agent code directly, read by
UI via try_lock. DmnAnnotation, ContextInfoUpdate, AgentUpdate were
already dead (no senders).

UiMessage down to 4 variants: TextDelta, Reasoning, Debug, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:34:27 -04:00
+								    agent: &std::sync::Arc<tokio::sync::Mutex<crate::agent::Agent>>,
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								) -> anyhow::Result<Vec<(String, f64)>> {
 								    let now = chrono::Utc::now().timestamp();
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    // Collect unique memory keys with their first position
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								    let mut seen = std::collections::HashSet::new();
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    let mut candidates: Vec<(usize, String, i64)> = Vec::new(); // (pos, key, last_scored)
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    let store = crate::hippocampus::store::Store::load().unwrap_or_default();
 								    for (i, entry) in context.entries.iter().enumerate() {
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								        if let ConversationEntry::Memory { key, .. } = entry {
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								            if !seen.insert(key.clone()) { continue; }
 								            let last_scored = store.nodes.get(key.as_str())
 								                .map(|n| n.last_scored)
 								                .unwrap_or(0);
 								            if now - last_scored >= max_age_secs {
 								                candidates.push((i, key.clone(), last_scored));
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								            }
 								        }
 								    }
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    // Score oldest-first
 								    candidates.sort_by_key(|&(_, _, last)| last);
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								    let http = http_client();
 								    let mut results = Vec::new();
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    let total_entries = context.entries.len();
 								    let first_quarter = total_entries / 4;
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    for (pos, key, _) in &candidates {
 								        let (end, full_window) = nth_response_end(&context.entries, *pos, response_window);
 								        // Skip memories without a full window, unless they're in the
 								        // first quarter of the conversation (always score those).
 								        if !full_window && *pos >= first_quarter {
 								            continue;
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								        }
 								        let range = *pos..end;
 								        if !context.entries[range.clone()].iter().any(|e| e.message().role == Role::Assistant) {
 								            continue;
 								        }
-												Kill TextDelta, Info — UiMessage is dead. RAII ActivityGuards replace all status feedback

Streaming text now goes directly to agent entries via append_streaming().
sync_from_agent diffs the growing entry each tick. The streaming entry
is popped when the response completes; build_response_message pushes
the final version.

All status feedback uses RAII ActivityGuards:
- push_activity() for long-running work (thinking, streaming, scoring)
- notify() for instant feedback (compacted, DMN state changes, commands)
- Guards auto-remove on Drop, appending "(complete)" and lingering 5s
- expire_activities() cleans up timed-out notifications on render tick

UiMessage enum reduced to a single Info variant with zero sends.
The channel infrastructure remains for now (Mind/Agent still take
UiSender in signatures) — mechanical cleanup for a follow-up.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 22:18:07 -04:00
+								        let _scoring = crate::agent::start_activity(agent, format!("scoring: {}", key)).await;
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								        match score_divergence(&http, client, context, range, Filter::SkipKey(key)).await {
 								            Ok((divs, _)) => {
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								                let n_responses = divs.len();
 								                let max_div = divs.iter().cloned().fold(0.0f64, f64::max);
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                dbglog!(
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								                    "[scoring] {} max:{:.3} ({} responses)", key, max_div, n_responses,
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                );
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								                results.push((key.clone(), max_div));
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								            }
 								            Err(e) => {
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                dbglog!(
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								                    "[scoring] {} FAILED: {:#}", key, e,
-												Kill Reasoning, Debug, Activity variants — read status from Agent directly

Reasoning tokens: dropped for now, will land in context entries later.
Debug sends: converted to dbglog! macro (writes to debug.log).
Activity: now a field on Agent, set directly, read by UI via try_lock.
score_memories_incremental takes agent Arc for activity writes.

UiMessage down to 2 variants: TextDelta, Info.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-05 21:45:55 -04:00
+								                );
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								            }
 								        }
 								    }
-												training: per-node scoring with graph weight updates

Memory scoring now uses the graph as source of truth:
- last_scored timestamp on each node (new capnp field @22)
- Nodes scored when older than scoring_interval_secs (default 1hr)
- Oldest-scored-first ordering
- Window: scoring_response_window assistant responses (default 100)
- First-quarter memories scored even without full window
- Per-response normalization (raw divergence / response count)
- Asymmetric weight update: alpha=0.5 up, alpha=0.1 down
  (responds fast to importance, decays slowly — memories stay
  surfaced even if only useful 1/4 of the time)

Graph writes disabled pending normalization calibration.

Also: configurable scoring_interval_secs and scoring_response_window.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 05:01:49 -04:00
+								    Ok(results)
-												split out src/mind

											
										
										
											2026-04-04 02:46:32 -04:00
+								}
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								// ── Fine-tuning scoring ─────────────────────────────────────────
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								/// Score which recent responses are candidates for fine-tuning.
 								///
 								/// Removes all memories and scores the most recent `count` messages.
 								/// Responses with high divergence depend on memories the model hasn't
 								/// internalized — these are fine-tuning candidates.
 								///
 								/// Returns (entry_index, divergence) pairs, sorted by divergence descending.
 								pub async fn score_finetune(
 								    context: &ContextState,
 								    count: usize,
 								    client: &ApiClient,
 								) -> anyhow::Result<Vec<(usize, f64)>> {
 								    let range = context.entries.len().saturating_sub(count)..context.entries.len();
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let response_positions: Vec<usize> = range.clone()
 								        .filter(|&i| context.entries[i].message().role == Role::Assistant)
 								        .collect();
 								    if response_positions.is_empty() {
 								        return Ok(Vec::new());
-												scoring: add timeouts, progress feedback, error resilience

- 120s timeout on individual /v1/score HTTP calls
- Activity bar shows "scoring 3/24: memory-key..."
- Info messages at start and completion
- Per-memory timing and importance in debug pane
- Failed individual memories log error but don't abort (zero row)
- Removed duplicate completion message (info from score_memories)

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-03 01:07:47 -04:00
+								    }
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let http = http_client();
 								    let (divs, _) = score_divergence(&http, client, context, range, Filter::SkipAllMemories).await?;
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
-												training: add memory_score() and finetune_score()

Separate the scoring into two distinct functions:

- memory_score(key): scores one memory's importance by measuring
  divergence in the 50 messages after it was surfaced. Two API calls
  (baseline vs without that memory).

- finetune_score(count): scores recent messages with all memories
  stripped to identify fine-tuning candidates. Responses with high
  divergence depend on memories the model hasn't internalized yet.

The existing score_memories() with the full NxM matrix is preserved
for the debug screen.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-04 01:33:31 -04:00
+								    let mut results: Vec<(usize, f64)> = response_positions.iter()
 								        .enumerate()
 								        .map(|(i, &entry_idx)| (entry_idx, divs.get(i).copied().unwrap_or(0.0)))
 								        .collect();
 								    results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
 								    Ok(results)
-												add memory importance scoring via prompt logprobs

score_memories() drops each memory from the context one at a time,
runs prompt_logprobs against the full conversation, and builds a
divergence matrix: memories × responses.

Row sums = memory importance (for graph weight updates)
Column sums = response memory-dependence (training candidates)

Uses vLLM's prompt_logprobs to check "would the model have said
this without this memory?" — one forward pass per memory, all
responses scored at once. ~3s per memory on B200.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

											
										
										
											2026-04-02 22:13:55 -04:00
+								}