// consciousness/src/subconscious/learn.rs

// learn.rs — Memory importance scoring via /v1/score
//
// Three scoring modes, all built on the same call_score() primitive:
//
// score_memories()  — Full N×M matrix (memories × responses) for the
//                     debug screen. Expensive: N+1 API calls.
//
// score_memory()    — Single memory importance. Scores the 50 messages
//                     after it was surfaced, with/without that memory.
//                     2 API calls.
//
// score_finetune()  — Identifies training candidates. Scores recent
//                     messages with all memories stripped. Responses
//                     with high divergence depend on memories the model
//                     hasn't internalized. 2 API calls.
use crate::agent::api::ApiClient;
use crate::agent::context::{
Ast, AstNode, ContextState, Role, WireImage,
is_assistant, is_memory_node, memory_key, render_branch_text, render_prior_context,
};
use crate::subconscious::generate::gen_continuation;
const SCORE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(300);
// ── Score API ───────────────────────────────────────────────────
/// One score-range result from /score: the summed logprob the model
/// assigned to the tokens of that range.
#[derive(serde::Deserialize)]
struct ScoreResult {
    total_logprob: f64,
}
/// Wire shape of a successful /score response body.
#[derive(serde::Deserialize)]
struct ScoreResponse {
    scores: Vec<ScoreResult>,
}
/// Build an HTTP client with the long scoring timeout applied —
/// /score and /train calls can take minutes on large prompts.
fn http_client() -> crate::agent::api::http::HttpClient {
    let builder = crate::agent::api::http::HttpClient::builder();
    builder.timeout(SCORE_TIMEOUT).build()
}
/// POST a tokenized prompt to the /score endpoint and return the
/// per-range logprob totals.
///
/// `ranges` are (start, end) token spans within `prompt` to be scored;
/// an empty `ranges` short-circuits to `Ok(vec![])` without a request.
/// `images` are inlined as base64 data URIs; `priority`, when present,
/// is forwarded to the server's request scheduler.
async fn call_score(
    http: &crate::agent::api::http::HttpClient,
    client: &ApiClient,
    prompt: &[u32],
    images: &[WireImage],
    ranges: &[(usize, usize)],
    priority: Option<i32>,
) -> anyhow::Result<Vec<ScoreResult>> {
    // Nothing to score — skip the round-trip.
    if ranges.is_empty() {
        return Ok(Vec::new());
    }
    // NOTE(review): the file header says /v1/score — this assumes
    // base_url() already carries the /v1 prefix; confirm.
    let url = format!("{}/score", client.base_url());
    let auth = format!("Bearer {}", client.api_key());
    let mut body = serde_json::json!({
        "model": client.model,
        "prompt": prompt,
        "score_ranges": ranges,
        "logprobs": 1,
    });
    if !images.is_empty() {
        use base64::Engine;
        let b64 = base64::engine::general_purpose::STANDARD;
        // data:<mime>;base64,<payload> — inline data-URI form.
        let uris: Vec<String> = images.iter()
            .map(|img| format!("data:{};base64,{}", img.mime, b64.encode(&img.bytes)))
            .collect();
        body["multi_modal_data"] = serde_json::json!({ "image": uris });
    }
    if let Some(p) = priority {
        body["priority"] = serde_json::json!(p);
    }
    let response = http
        .send_json("POST", &url, &[
            ("authorization", &auth),
        ], &body)
        .await?;
    let status = response.status();
    let body: serde_json::Value = response.json().await?;
    // HTTP-level failure: surface the server's error string if it sent one.
    if !status.is_success() {
        let msg = body.get("error").and_then(|e| e.as_str()).unwrap_or("unknown error");
        anyhow::bail!("score API HTTP {}: {}", status, msg);
    }
    // Some failures come back as 200 with an "error" field in the body.
    if let Some(err) = body.get("error").and_then(|e| e.as_str()) {
        anyhow::bail!("score API error: {}", err);
    }
    let result: ScoreResponse = serde_json::from_value(body)
        .map_err(|e| anyhow::anyhow!("failed to parse score response: {}", e))?;
    Ok(result.scores)
}
/// Compute per-position logprob divergence: how much worse the model
/// scores each response without something vs with it.
fn divergence(baseline: &[ScoreResult], without: &[ScoreResult]) -> Vec<f64> {
    let mut divs = Vec::with_capacity(baseline.len());
    for (idx, base) in baseline.iter().enumerate() {
        // A missing "without" score (range-count mismatch) counts as zero divergence.
        let stripped_lp = match without.get(idx) {
            Some(score) => score.total_logprob,
            None => base.total_logprob,
        };
        // Negative divergence — the response scored *better* without — clamps to zero.
        divs.push((base.total_logprob - stripped_lp).max(0.0));
    }
    divs
}
/// Score two message sets and return total divergence.
///
/// Runs two /score calls over the same entry `range`: a baseline with
/// every node present, and a "without" variant in which nodes matching
/// `skip` are stripped from the wire prompt. Returns the per-response
/// divergences together with the raw baseline scores.
async fn score_divergence<F>(
    http: &crate::agent::api::http::HttpClient,
    client: &ApiClient,
    context: &ContextState,
    range: std::ops::Range<usize>,
    skip: F,
    priority: Option<i32>,
) -> anyhow::Result<(Vec<f64>, Vec<ScoreResult>)>
where F: FnMut(&AstNode) -> bool,
{
    // Baseline prompt: nothing skipped.
    let (baseline_tokens, baseline_images, baseline_ranges) =
        context.wire_prompt(range.clone(), |_| false);
    // Variant prompt: `skip` decides which nodes are removed.
    let (without_tokens, without_images, without_ranges) =
        context.wire_prompt(range, skip);
    let baseline = call_score(http, client, &baseline_tokens, &baseline_images,
        &baseline_ranges, priority).await?;
    let without = call_score(http, client, &without_tokens, &without_images,
        &without_ranges, priority).await?;
    let divs = divergence(&baseline, &without);
    Ok((divs, baseline))
}
// ── Full matrix scoring (debug screen) ──────────────────────────
/// Score how important each memory is to the conversation (full matrix).
///
/// One baseline /score call plus one call per memory key (N+1 total).
/// Each memory's divergence row is written back into the live AST:
/// scores above 0.01 land in the assistant Branch's `memory_scores`
/// map, lower scores clear any stale entry.
pub async fn score_memories(
    client: &ApiClient,
    agent: &std::sync::Arc<crate::agent::Agent>,
) -> anyhow::Result<()> {
    // Collect memory keys and response indices under a brief lock
    let (memory_keys, response_indices) = {
        let ctx = agent.context.lock().await;
        // Include identity nodes and conversation memories
        let mut keys: Vec<String> = ctx.identity().iter()
            .chain(ctx.conversation().iter())
            .filter_map(|node| memory_key(node).map(String::from))
            .collect();
        // NOTE(review): Vec::dedup only removes *adjacent* duplicates —
        // a key surfacing twice non-adjacently gets scored twice.
        // Confirm keys arrive grouped, or whether a seen-set is intended.
        keys.dedup();
        let resp: Vec<usize> = ctx.conversation().iter().enumerate()
            .filter(|(_, node)| is_assistant(node))
            .map(|(i, _)| i)
            .collect();
        (keys, resp)
    };
    if memory_keys.is_empty() || response_indices.is_empty() {
        return Ok(());
    }
    let total = memory_keys.len();
    dbglog!("[scoring-full] starting: {} memories × {} responses",
        total, response_indices.len());
    let http = http_client();
    let activity = crate::agent::start_activity(agent, "scoring: baseline").await;
    // Baseline: the full conversation with nothing stripped.
    let (baseline_tokens, baseline_images, baseline_ranges) = {
        let ctx = agent.context.lock().await;
        ctx.wire_prompt(0..ctx.conversation().len(), |_| false)
    };
    let baseline = call_score(&http, client, &baseline_tokens, &baseline_images,
        &baseline_ranges, Some(5)).await?;
    dbglog!("[scoring-full] baseline done ({} response scores)", baseline.len());
    for (mem_idx, key) in memory_keys.iter().enumerate() {
        activity.update(format!("scoring: {}/{}", mem_idx + 1, total)).await;
        dbglog!("[scoring-full] {}/{}: {}", mem_idx + 1, total, key);
        // The lock is dropped between iterations, so the prompt is rebuilt
        // fresh each time with this one memory stripped.
        let (tokens, images, ranges) = {
            let ctx = agent.context.lock().await;
            ctx.wire_prompt(0..ctx.conversation().len(), |n| memory_key(n) == Some(key.as_str()))
        };
        let row = match call_score(&http, client, &tokens, &images, &ranges, Some(5)).await {
            Ok(without) => {
                let divs = divergence(&baseline, &without);
                let max_div = divs.iter().cloned().fold(0.0f64, f64::max);
                dbglog!("[scoring-full] {}/{}: {} max_div={:.3}",
                    mem_idx + 1, total, key, max_div);
                divs
            }
            Err(e) => {
                // Failed row degrades to all-zero scores rather than aborting the matrix.
                dbglog!("[scoring-full] {}/{}: {} FAILED: {:#}",
                    mem_idx + 1, total, key, e);
                vec![0.0; baseline.len()]
            }
        };
        // Write this memory's scores to the live AST nodes
        {
            let mut ctx = agent.context.lock().await;
            let mut set_count = 0;
            for (resp_idx, &idx) in response_indices.iter().enumerate() {
                // The conversation may have shrunk while the lock was released.
                if idx >= ctx.conversation().len() { continue; }
                let node = &mut ctx.conversation_mut()[idx];
                if let AstNode::Branch {
                    role: Role::Assistant, memory_scores, ..
                } = node {
                    if let Some(&score) = row.get(resp_idx) {
                        if score > 0.01 {
                            memory_scores.insert(key.clone(), score);
                            set_count += 1;
                        } else {
                            // Below the noise floor: drop any stale entry.
                            memory_scores.remove(key.as_str());
                        }
                    }
                }
            }
            dbglog!("[scoring-full] {}/{} AST: set={}", mem_idx + 1, total, set_count);
        }
        // Nudge the UI after each row so progress is visible.
        agent.state.lock().await.changed.notify_one();
    }
    Ok(())
}
/// Find the entry index after `start` that contains the Nth assistant response.
/// Returns (end_index, true) if N responses were found, (entries.len(), false) if not.
fn nth_response_end(entries: &[AstNode], start: usize, n: usize) -> (usize, bool) {
    let mut seen = 0;
    for (idx, entry) in entries.iter().enumerate().skip(start) {
        if !is_assistant(entry) {
            continue;
        }
        seen += 1;
        if seen >= n {
            // End index is exclusive: one past the Nth response.
            return (idx + 1, true);
        }
    }
    (entries.len(), false)
}
// ── Single memory scoring ───────────────────────────────────────
/// Score how important a single memory is to the conversation.
///
/// Scores the 50 messages after the memory was surfaced — the window
/// where it could have influenced responses. Returns the sum of
/// divergence, or 0.0 if the memory isn't in the conversation.
pub async fn score_memory(
context: &ContextState,
key: &str,
client: &ApiClient,
) -> anyhow::Result<f64> {
const RESPONSE_WINDOW: usize = 50;
let entries = context.conversation();
let first_pos = match entries.iter().position(|node| memory_key(node) == Some(key)) {
Some(p) => p,
None => return Ok(0.0),
};
let (end, _) = nth_response_end(entries, first_pos, RESPONSE_WINDOW);
let range = first_pos..end;
if !entries[range.clone()].iter().any(|node| is_assistant(node)) {
return Ok(0.0);
}
let http = http_client();
let (divs, _) = score_divergence(&http, client, context, range,
|n| memory_key(n) == Some(key), Some(5)).await?;
Ok(divs.iter().sum())
}
2026-04-04 02:46:32 -04:00
// ── Background memory scoring ───────────────────────────────────
/// Score memories in the conversation that are due for re-scoring.
2026-04-04 02:46:32 -04:00
///
/// Checks the graph for each memory's last_scored timestamp. Scores
/// nodes that haven't been scored within `max_age_secs`, oldest first.
/// Updates the graph weight (EWMA) and last_scored after each.
2026-04-04 02:46:32 -04:00
///
/// Returns the number of nodes scored and their (key, score) pairs.
pub async fn score_memories_incremental<F, Fut>(
2026-04-04 02:46:32 -04:00
context: &ContextState,
max_age_secs: i64,
response_window: usize,
2026-04-04 02:46:32 -04:00
client: &ApiClient,
agent: &std::sync::Arc<crate::agent::Agent>,
mut on_score: F,
) -> anyhow::Result<usize>
where
F: FnMut(String, f64) -> Fut,
Fut: std::future::Future<Output = ()>,
{
let now = chrono::Utc::now().timestamp();
2026-04-04 02:46:32 -04:00
// Collect unique memory keys with their first position
2026-04-04 02:46:32 -04:00
let mut seen = std::collections::HashSet::new();
let mut candidates: Vec<(usize, String, i64)> = Vec::new(); // (pos, key, last_scored)
2026-04-04 02:46:32 -04:00
let store_arc = crate::hippocampus::access_local()?;
{
let store = &*store_arc;
// Identity nodes always score at position 0; conversation nodes at their index
let identity_nodes = context.identity().iter().map(|n| (0, n));
let conv_nodes = context.conversation().iter().enumerate();
for (pos, node) in identity_nodes.chain(conv_nodes) {
if let Some(key) = memory_key(node) {
if !seen.insert(key.to_owned()) { continue; }
let last_scored = store.get_node(key)
.ok()
.flatten()
.map(|n| n.last_scored)
.unwrap_or(0);
if now - last_scored >= max_age_secs {
candidates.push((pos, key.to_owned(), last_scored));
}
2026-04-04 02:46:32 -04:00
}
}
}
// Score oldest-first
candidates.sort_by_key(|&(_, _, last)| last);
2026-04-04 02:46:32 -04:00
let http = http_client();
let mut scored = 0;
2026-04-04 02:46:32 -04:00
let entries = context.conversation();
let total_tokens: usize = entries.iter().map(|n| n.tokens()).sum();
let token_cutoff = total_tokens * 60 / 100;
// Precompute cumulative token position for each entry
let mut cumulative: Vec<usize> = Vec::with_capacity(entries.len());
let mut running = 0;
for e in entries {
running += e.tokens();
cumulative.push(running);
}
2026-04-04 02:46:32 -04:00
let total = candidates.len();
dbglog!("[scoring] total_tokens={}, cutoff={}, {} candidates", total_tokens, token_cutoff, total);
let activity = crate::agent::start_activity(agent, format!("scoring: 0/{}", total)).await;
for (pos, key, _) in &candidates {
// Only score memories in the first 60% of the conversation by tokens —
// recent memories don't have enough responses to evaluate yet.
let cum = cumulative.get(*pos).copied().unwrap_or(total_tokens);
if cum > token_cutoff {
dbglog!("[scoring] skip {} (tokens {}/{} past cutoff)", key, cum, token_cutoff);
continue;
2026-04-04 02:46:32 -04:00
}
let (end, _) = nth_response_end(context.conversation(), *pos, response_window);
2026-04-04 02:46:32 -04:00
let range = *pos..end;
if !context.conversation()[range.clone()].iter().any(|node| is_assistant(node)) {
dbglog!("[scoring] skip {} (no assistant response in range {}..{})", key, pos, end);
2026-04-04 02:46:32 -04:00
continue;
}
activity.update(format!("scoring: {}/{} {}", scored + 1, total, key)).await;
match score_divergence(&http, client, context, range,
|n| memory_key(n) == Some(key), Some(5)).await {
2026-04-04 02:46:32 -04:00
Ok((divs, _)) => {
let n_responses = divs.len();
let max_div = divs.iter().cloned().fold(0.0f64, f64::max);
dbglog!(
"[scoring] {} max:{:.3} ({} responses)", key, max_div, n_responses,
);
on_score(key.clone(), max_div).await;
scored += 1;
2026-04-04 02:46:32 -04:00
}
Err(e) => {
dbglog!(
2026-04-04 02:46:32 -04:00
"[scoring] {} FAILED: {:#}", key, e,
);
2026-04-04 02:46:32 -04:00
}
}
}
Ok(scored)
2026-04-04 02:46:32 -04:00
}
// ── Fine-tuning scoring ─────────────────────────────────────────
/// Score which recent responses are candidates for fine-tuning.
///
/// Removes all memories and scores the most recent `count` messages.
/// Responses with high divergence depend on memories the model hasn't
/// internalized — these are fine-tuning candidates.
///
/// Returns (entry_index, divergence) pairs, sorted by divergence descending.
pub async fn score_finetune(
    context: &ContextState,
    count: usize,
    client: &ApiClient,
) -> anyhow::Result<Vec<(usize, f64)>> {
    let entries = context.conversation();
    let start = entries.len().saturating_sub(count);
    let range = start..entries.len();
    // Entry indices of the assistant turns inside the window — these are
    // what the score ranges map onto, in order.
    let assistant_at: Vec<usize> = range.clone()
        .filter(|&i| is_assistant(&entries[i]))
        .collect();
    if assistant_at.is_empty() {
        return Ok(Vec::new());
    }
    let http = http_client();
    // Strip every memory node and measure how much each response suffers.
    let (divs, _) =
        score_divergence(&http, client, context, range, is_memory_node, Some(5)).await?;
    let mut ranked: Vec<(usize, f64)> = assistant_at.iter()
        .enumerate()
        .map(|(i, &entry_idx)| (entry_idx, divs.get(i).copied().unwrap_or(0.0)))
        .collect();
    // Highest divergence first; partial_cmp fallback keeps NaN from panicking.
    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
    Ok(ranked)
}
/// Enriched finetune candidate with context for review.
#[derive(Clone, Debug)]
pub struct FinetuneCandidate {
    /// Index of the assistant turn in the conversation entries.
    pub entry_idx: usize,
    /// How much worse the model scored this response with memories stripped.
    pub divergence: f64,
    /// Rendered text of the assistant turn itself.
    pub response_text: String,
    /// Last couple of user/assistant messages before this response,
    /// already rendered with role markers, for F6 display context.
    pub prior_context: String,
    /// Token IDs for context (everything before the response).
    pub context_ids: Vec<u32>,
    /// Token IDs for the response (what we're training on).
    pub continuation_ids: Vec<u32>,
    /// What the model would have said without memories (if generated).
    pub alternate_text: Option<String>,
    /// Timestamp in nanos — used as unique key for trained-set dedup.
    pub timestamp_ns: i64,
}
/// Score and enrich finetune candidates with full context.
///
/// Candidates are delivered via `on_candidate` one-at-a-time as they become
/// ready: scoring happens once (one /score call), then for each candidate
/// that passes the threshold we optionally generate an alternate response
/// and then emit it. The activity status is updated during the alternate
/// phase so the UI doesn't look stuck.
///
/// Returns (count_above_threshold, max_divergence).
pub async fn score_finetune_candidates(
context: &ContextState,
count: usize,
client: &ApiClient,
min_divergence: f64,
config: global writable AppConfig; learn settings live there Runtime-mutable settings (F6's threshold knob, the generate-alternates toggle, anything else that comes along) were ending up as mirrored fields on MindState — each new config setting grew MindState::new's signature and added a clone+sync path. Wrong home. MindState is ephemeral session state, not a config projection. Give AppConfig the same treatment the memory Config has: install it into a global RwLock<AppConfig> at startup via load_app, read through config::app() (returns a read guard), mutate through update_app. The config_writer functions now write to disk AND update the cache atomically, so the one-stop-shop call keeps both in sync. Also while in here: - learn.generate_alternates moves from a sentinel file (~/.consciousness/cache/finetune-alternates, "exists = enabled") into the config under the learn section. On first run with this build, if the sentinel file still exists Mind::new flips the config value to true and removes it. Drops alternates_enabled()/set_alternates(). - Default threshold 0.0000001 → 1.0. With the timestamp filter removed the previous value was letting essentially everything through; 1.0 is a sane "nothing gets through unless you actually want it" default. - score_finetune_candidates takes generate_alternates as a parameter instead of reading a global — caller snapshots the config values once at the top of start_finetune_scoring so the async task doesn't need to hold the config read lock across awaits. - MindState.learn_threshold / learn_generate_alternates gone; the SetLearn* command handlers now just delegate to config_writer. Kent noted RwLock<Arc<AppConfig>> (the pattern used by the memory Config global) is pointless here — nobody needs a snapshot-after- release, reads are short — so this uses a plain RwLock<AppConfig> and returns a read guard. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 12:53:22 -04:00
generate_alternates: bool,
activity: &crate::agent::ActivityGuard,
mut on_candidate: impl FnMut(FinetuneCandidate),
) -> anyhow::Result<(usize, f64)> {
let scores = score_finetune(context, count, client).await?;
let max_divergence = scores.iter().map(|(_, d)| *d).fold(0.0f64, f64::max);
let entries = context.conversation();
let trained = load_trained();
let mut candidates: Vec<FinetuneCandidate> = Vec::new();
for (entry_idx, divergence) in scores {
if divergence < min_divergence {
continue;
}
let node = &entries[entry_idx];
context: tighten timestamp schema; every AstNode has one Previously NodeLeaf.timestamp and AstNode::Branch.timestamp accepted null or missing via a deserialize_timestamp_or_epoch fallback — legacy entries in conversation.jsonl from before Branch timestamps existed (and from before chrono serialization was wired up) would load with UNIX_EPOCH as a sentinel. Downstream, node_timestamp_ns() returned Option<i64> and callers had to handle None as "old entry, skip." That second filter was silently dropping every candidate in score_finetune_candidates when scoring an older session — the F6 screen showed "0 above threshold" even when max_divergence was orders of magnitude above the threshold, because every entry was failing the None check, not the divergence check. The fix, in three parts: 1. src/bin/fix-timestamps.rs — one-off migration tool that walks a conversation.jsonl, linearly interpolates timestamps for entries stuck at UNIX_EPOCH (using surrounding real timestamps as anchors), propagates to child leaves with per-sibling ns offsets, and bumps any collisions by 1 ns for uniqueness. Ran against the current session's log: 11887 entries, 72289 ns bumps, all unique. 2. context.rs — drop default_timestamp and deserialize_timestamp_or_epoch. NodeLeaf and Branch now require a present non-null timestamp on deserialize. Tests flip from "missing/null → UNIX_EPOCH" to "missing/null → Err." 3. subconscious/learn.rs — node_timestamp_ns now returns i64, not Option<i64>. The matching caller in score_finetune_candidates collapses from a Some/None match to a single trained-set check. mind/log.rs's oldest_timestamp no longer filters UNIX_EPOCH. Every line currently on disk has already been migrated. Going forward, new AstNodes always carry real timestamps (Utc::now() at construction time), so the strict schema is the invariant, not an aspiration. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 12:35:16 -04:00
// Skip if already trained on.
let timestamp_ns = node_timestamp_ns(node);
if trained.contains(&timestamp_ns) {
continue;
}
// Extract response text — content of the assistant turn.
let response_text = match node {
AstNode::Branch { children, .. } => render_branch_text(children),
_ => continue,
};
// Skip turns that produced nothing human-visible (e.g., a
// tool-only turn, or an interrupted generation). They'd show
// up as blank cards and we'd still burn alternate-gen on them.
if response_text.trim().is_empty() {
continue;
}
// Build the last couple of user/assistant exchanges for review.
let prior_context = render_prior_context(entries, entry_idx, 2);
// Build token IDs: context = everything before response, continuation = response.
let (context_ids, _, _) = context.wire_prompt(0..entry_idx, |_| false);
let continuation_ids: Vec<u32> = node.token_ids().into_iter().collect();
candidates.push(FinetuneCandidate {
entry_idx,
divergence,
response_text,
prior_context,
context_ids,
continuation_ids,
alternate_text: None,
timestamp_ns,
});
}
let total = candidates.len();
config: global writable AppConfig; learn settings live there Runtime-mutable settings (F6's threshold knob, the generate-alternates toggle, anything else that comes along) were ending up as mirrored fields on MindState — each new config setting grew MindState::new's signature and added a clone+sync path. Wrong home. MindState is ephemeral session state, not a config projection. Give AppConfig the same treatment the memory Config has: install it into a global RwLock<AppConfig> at startup via load_app, read through config::app() (returns a read guard), mutate through update_app. The config_writer functions now write to disk AND update the cache atomically, so the one-stop-shop call keeps both in sync. Also while in here: - learn.generate_alternates moves from a sentinel file (~/.consciousness/cache/finetune-alternates, "exists = enabled") into the config under the learn section. On first run with this build, if the sentinel file still exists Mind::new flips the config value to true and removes it. Drops alternates_enabled()/set_alternates(). - Default threshold 0.0000001 → 1.0. With the timestamp filter removed the previous value was letting essentially everything through; 1.0 is a sane "nothing gets through unless you actually want it" default. - score_finetune_candidates takes generate_alternates as a parameter instead of reading a global — caller snapshots the config values once at the top of start_finetune_scoring so the async task doesn't need to hold the config read lock across awaits. - MindState.learn_threshold / learn_generate_alternates gone; the SetLearn* command handlers now just delegate to config_writer. Kent noted RwLock<Arc<AppConfig>> (the pattern used by the memory Config global) is pointless here — nobody needs a snapshot-after- release, reads are short — so this uses a plain RwLock<AppConfig> and returns a read guard. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 12:53:22 -04:00
let gen_alternates = generate_alternates && total > 0;
for (i, mut candidate) in candidates.into_iter().enumerate() {
if gen_alternates {
activity.update(
format!("finetune: generating alternate {}/{}", i + 1, total)
).await;
match gen_continuation(context, candidate.entry_idx, is_memory_node, client).await {
Ok(text) => candidate.alternate_text = Some(text),
Err(e) => dbglog!("[finetune] alternate generation failed: {:#}", e),
}
}
on_candidate(candidate);
}
Ok((total, max_divergence))
}
// ── Finetune config and persistence ─────────────────────────────
use std::path::PathBuf;
use std::collections::HashSet;
const TRAINED_RESPONSES_FILE: &str = ".consciousness/cache/trained-responses.json";
/// Absolute path of the trained-responses cache file under $HOME.
fn trained_path() -> PathBuf {
    let home = dirs::home_dir().unwrap_or_default();
    home.join(TRAINED_RESPONSES_FILE)
}
/// Load set of trained response timestamps (nanos since epoch).
pub fn load_trained() -> HashSet<i64> {
    // A missing file and unparseable JSON both degrade to an empty set.
    std::fs::read_to_string(trained_path())
        .ok()
        .and_then(|json| serde_json::from_str(&json).ok())
        .unwrap_or_default()
}
/// Mark a response as trained by its timestamp.
pub fn mark_trained(timestamp_ns: i64) {
let mut trained = load_trained();
trained.insert(timestamp_ns);
let path = trained_path();
if let Some(parent) = path.parent() {
let _ = std::fs::create_dir_all(parent);
}
if let Ok(json) = serde_json::to_string(&trained) {
let _ = std::fs::write(&path, json);
}
}
/// Get timestamp in nanoseconds from an AstNode.
context: tighten timestamp schema; every AstNode has one Previously NodeLeaf.timestamp and AstNode::Branch.timestamp accepted null or missing via a deserialize_timestamp_or_epoch fallback — legacy entries in conversation.jsonl from before Branch timestamps existed (and from before chrono serialization was wired up) would load with UNIX_EPOCH as a sentinel. Downstream, node_timestamp_ns() returned Option<i64> and callers had to handle None as "old entry, skip." That second filter was silently dropping every candidate in score_finetune_candidates when scoring an older session — the F6 screen showed "0 above threshold" even when max_divergence was orders of magnitude above the threshold, because every entry was failing the None check, not the divergence check. The fix, in three parts: 1. src/bin/fix-timestamps.rs — one-off migration tool that walks a conversation.jsonl, linearly interpolates timestamps for entries stuck at UNIX_EPOCH (using surrounding real timestamps as anchors), propagates to child leaves with per-sibling ns offsets, and bumps any collisions by 1 ns for uniqueness. Ran against the current session's log: 11887 entries, 72289 ns bumps, all unique. 2. context.rs — drop default_timestamp and deserialize_timestamp_or_epoch. NodeLeaf and Branch now require a present non-null timestamp on deserialize. Tests flip from "missing/null → UNIX_EPOCH" to "missing/null → Err." 3. subconscious/learn.rs — node_timestamp_ns now returns i64, not Option<i64>. The matching caller in score_finetune_candidates collapses from a Some/None match to a single trained-set check. mind/log.rs's oldest_timestamp no longer filters UNIX_EPOCH. Every line currently on disk has already been migrated. Going forward, new AstNodes always carry real timestamps (Utc::now() at construction time), so the strict schema is the invariant, not an aspiration. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 12:35:16 -04:00
/// i64-ns representation covers 1677..2262 via chrono; timestamps
/// outside that window would be bugs we'd want to surface, hence panic.
pub fn node_timestamp_ns(node: &AstNode) -> i64 {
let ts = match node {
AstNode::Leaf(leaf) => leaf.timestamp(),
AstNode::Branch { timestamp, .. } => *timestamp,
};
context: tighten timestamp schema; every AstNode has one Previously NodeLeaf.timestamp and AstNode::Branch.timestamp accepted null or missing via a deserialize_timestamp_or_epoch fallback — legacy entries in conversation.jsonl from before Branch timestamps existed (and from before chrono serialization was wired up) would load with UNIX_EPOCH as a sentinel. Downstream, node_timestamp_ns() returned Option<i64> and callers had to handle None as "old entry, skip." That second filter was silently dropping every candidate in score_finetune_candidates when scoring an older session — the F6 screen showed "0 above threshold" even when max_divergence was orders of magnitude above the threshold, because every entry was failing the None check, not the divergence check. The fix, in three parts: 1. src/bin/fix-timestamps.rs — one-off migration tool that walks a conversation.jsonl, linearly interpolates timestamps for entries stuck at UNIX_EPOCH (using surrounding real timestamps as anchors), propagates to child leaves with per-sibling ns offsets, and bumps any collisions by 1 ns for uniqueness. Ran against the current session's log: 11887 entries, 72289 ns bumps, all unique. 2. context.rs — drop default_timestamp and deserialize_timestamp_or_epoch. NodeLeaf and Branch now require a present non-null timestamp on deserialize. Tests flip from "missing/null → UNIX_EPOCH" to "missing/null → Err." 3. subconscious/learn.rs — node_timestamp_ns now returns i64, not Option<i64>. The matching caller in score_finetune_candidates collapses from a Some/None match to a single trained-set check. mind/log.rs's oldest_timestamp no longer filters UNIX_EPOCH. Every line currently on disk has already been migrated. Going forward, new AstNodes always carry real timestamps (Utc::now() at construction time), so the strict schema is the invariant, not an aspiration. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 12:35:16 -04:00
ts.timestamp_nanos_opt()
.expect("timestamp outside i64-ns representable range (1677..2262)")
}
// ── Training API ────────────────────────────────────────────────
/// Training sample for /train endpoint.
#[derive(serde::Serialize)]
struct TrainingSample {
    // Token IDs of everything before the response.
    context_ids: Vec<u32>,
    // Token IDs of the response being trained on.
    continuation_ids: Vec<u32>,
}
/// Data needed to send a training sample.
pub struct TrainData {
    /// Token IDs of everything before the response.
    pub context_ids: Vec<u32>,
    /// Token IDs of the response being trained on.
    pub continuation_ids: Vec<u32>,
    /// Unique key for trained-set dedup (see `mark_trained`).
    pub timestamp_ns: i64,
}
/// Send training samples to the server.
///
/// Returns job_id on success, marks each sample as trained.
///
/// # Errors
/// Fails on empty input, transport errors, or a non-success HTTP status.
/// Samples are marked trained only after the server accepts the batch.
pub async fn send_to_train(
    samples: Vec<TrainData>,
    client: &ApiClient,
) -> anyhow::Result<String> {
    if samples.is_empty() {
        anyhow::bail!("no samples to train");
    }
    // Project into the wire shape (token IDs only — the timestamp is local).
    let api_samples: Vec<TrainingSample> = samples.iter()
        .map(|s| TrainingSample {
            context_ids: s.context_ids.clone(),
            continuation_ids: s.continuation_ids.clone(),
        })
        .collect();
    let body = serde_json::json!({
        "training_data": {
            "samples": api_samples,
        }
    });
    let http = http_client();
    let url = format!("{}/train", client.base_url());
    // Authenticate the same way call_score does — this request previously
    // went out with no headers at all, which an auth-enforcing server rejects.
    let auth = format!("Bearer {}", client.api_key());
    let response = http.send_json("POST", &url, &[
        ("authorization", &auth),
    ], &body).await?;
    let status = response.status();
    let result: serde_json::Value = response.json().await?;
    if !status.is_success() {
        let msg = result.get("error").and_then(|e| e.as_str()).unwrap_or("unknown error");
        anyhow::bail!("train API HTTP {}: {}", status, msg);
    }
    // Mark all samples as trained — only after the server accepted them.
    for s in &samples {
        mark_trained(s.timestamp_ns);
    }
    let job_id = result.get("job_id")
        .and_then(|j| j.as_str())
        .unwrap_or("unknown")
        .to_string();
    dbglog!("[finetune] sent {} samples, job_id={}", samples.len(), job_id);
    Ok(job_id)
}