Delete similarity module, rewrite module, and all text-similarity code

Text cosine similarity was being used as a crutch for operations the graph structure should handle: interference detection, orphan linking, triangle closing, hub differentiation. These are all graph-structural operations that the agents (linker, extractor) handle with actual semantic understanding.

Removed: similarity.rs (stemming + cosine), rewrite.rs (orphan linking, triangle closing, hub differentiation), detect_interference, and all CLI commands and consolidation steps that used them. -794 lines.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-10 15:44:10 -04:00 · 2026-04-10 15:44:10 -04:00 · 96e573f2e5
commit 96e573f2e5
parent 92ef9b5215
12 changed files with 11 additions and 794 deletions
--- a/src/hippocampus/mod.rs
+++ b/src/hippocampus/mod.rs
@@ -11,7 +11,6 @@ pub mod graph;
 pub mod lookups;
 pub mod cursor;
 pub mod query;
-pub mod similarity;
 pub mod spectral;
 pub mod neuro;
 pub mod counters;
--- a/src/hippocampus/neuro/mod.rs
+++ b/src/hippocampus/neuro/mod.rs
@@ -1,25 +1,14 @@
-// Neuroscience-inspired memory algorithms, split by concern:
+// Neuroscience-inspired memory algorithms:
 //
-//   scoring  — pure analysis: priority, replay queues, interference, plans
-//   prompts  — agent prompt generation and formatting
-//   rewrite  — graph topology mutations: differentiation, closure, linking
+//   scoring  — pure analysis: priority, replay queues, plans

 mod scoring;
-mod rewrite;

 pub use scoring::{
    ReplayItem,
    ConsolidationPlan,
    consolidation_priority,
    replay_queue, replay_queue_with_graph,
-    detect_interference,
    consolidation_plan, consolidation_plan_quick, format_plan,
    daily_check,
 };
-
-pub use rewrite::{
-    refine_target, LinkMove,
-    differentiate_hub,
-    apply_differentiation, find_differentiable_hubs,
-    triangle_close, link_orphans,
-};
--- a/src/hippocampus/neuro/rewrite.rs
+++ b/src/hippocampus/neuro/rewrite.rs
@@ -1,348 +0,0 @@
-// Graph topology mutations: hub differentiation, triangle closure,
-// orphan linking, and link refinement. These modify the store.
-
-use crate::store::{Store, new_relation};
-use crate::graph::Graph;
-use crate::similarity;
-
-/// Collect (key, content) pairs for all section children of a file-level node.
-fn section_children<'a>(store: &'a Store, file_key: &str) -> Vec<(&'a str, &'a str)> {
-    let prefix = format!("{}#", file_key);
-    store.nodes.iter()
-        .filter(|(k, _)| k.starts_with(&prefix))
-        .map(|(k, n)| (k.as_str(), n.content.as_str()))
-        .collect()
-}
-
-/// Find the best matching candidate by cosine similarity against content.
-/// Returns (key, similarity) if any candidate exceeds threshold.
-fn best_match(candidates: &[(&str, &str)], content: &str, threshold: f32) -> Option<(String, f32)> {
-    let (best_key, best_sim) = candidates.iter()
-        .map(|(key, text)| (*key, similarity::cosine_similarity(content, text)))
-        .max_by(|a, b| a.1.total_cmp(&b.1))?;
-    if best_sim > threshold {
-        Some((best_key.to_string(), best_sim))
-    } else {
-        None
-    }
-}
-
-/// Refine a link target: if the target is a file-level node with section
-/// children, find the best-matching section by cosine similarity against
-/// the source content. Returns the original key if no sections exist or
-/// no section matches above threshold.
-///
-/// This prevents hub formation at link creation time — every new link
-/// targets the most specific available node.
-pub fn refine_target(store: &Store, source_content: &str, target_key: &str) -> String {
-    // Only refine file-level nodes (no # in key)
-    if target_key.contains('#') { return target_key.to_string(); }
-
-    let sections = section_children(store, target_key);
-
-    if sections.is_empty() { return target_key.to_string(); }
-
-    best_match(&sections, source_content, 0.05)
-        .map(|(key, _)| key)
-        .unwrap_or_else(|| target_key.to_string())
-}
-
-/// A proposed link move: from hub→neighbor to section→neighbor
-pub struct LinkMove {
-    pub neighbor_key: String,
-    pub from_hub: String,
-    pub to_section: String,
-    pub similarity: f32,
-    pub neighbor_snippet: String,
-}
-
-/// Analyze a hub node and propose redistributing its links to child sections.
-///
-/// Returns None if the node isn't a hub or has no sections to redistribute to.
-pub fn differentiate_hub(store: &Store, hub_key: &str) -> Option<Vec<LinkMove>> {
-    let graph = store.build_graph();
-    differentiate_hub_with_graph(store, hub_key, &graph)
-}
-
-/// Like differentiate_hub but uses a pre-built graph.
-fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph) -> Option<Vec<LinkMove>> {
-    let degree = graph.degree(hub_key);
-
-    // Only differentiate actual hubs
-    if degree < 20 { return None; }
-
-    // Only works on file-level nodes that have section children
-    if hub_key.contains('#') { return None; }
-
-    let sections = section_children(store, hub_key);
-    if sections.is_empty() { return None; }
-
-    // Get all neighbors of the hub
-    let neighbors = graph.neighbors(hub_key);
-    let prefix = format!("{}#", hub_key);
-
-    let mut moves = Vec::new();
-
-    for (neighbor_key, _strength) in &neighbors {
-        // Skip section children — they should stay linked to parent
-        if neighbor_key.starts_with(&prefix) { continue; }
-
-        let neighbor_content = match store.nodes.get(neighbor_key.as_str()) {
-            Some(n) => &n.content,
-            None => continue,
-        };
-
-        // Find best-matching section by content similarity
-        if let Some((best_section, best_sim)) = best_match(&sections, neighbor_content, 0.05) {
-            let snippet = crate::util::first_n_chars(
-                neighbor_content.lines()
-                    .find(|l| !l.is_empty() && !l.starts_with("<!--") && !l.starts_with("##"))
-                    .unwrap_or(""),
-                80);
-
-            moves.push(LinkMove {
-                neighbor_key: neighbor_key.to_string(),
-                from_hub: hub_key.to_string(),
-                to_section: best_section,
-                similarity: best_sim,
-                neighbor_snippet: snippet,
-            });
-        }
-    }
-
-    moves.sort_by(|a, b| b.similarity.total_cmp(&a.similarity));
-    Some(moves)
-}
-
-/// Apply link moves: soft-delete hub→neighbor, create section→neighbor.
-pub fn apply_differentiation(
-    store: &mut Store,
-    moves: &[LinkMove],
-) -> (usize, usize) {
-    let mut applied = 0usize;
-    let mut skipped = 0usize;
-
-    for mv in moves {
-        // Check that section→neighbor doesn't already exist
-        let exists = store.relations.iter().any(|r|
-            ((r.source_key == mv.to_section && r.target_key == mv.neighbor_key)
-            || (r.source_key == mv.neighbor_key && r.target_key == mv.to_section))
-            && !r.deleted
-        );
-        if exists { skipped += 1; continue; }
-
-        let section_uuid = match store.nodes.get(&mv.to_section) {
-            Some(n) => n.uuid,
-            None => { skipped += 1; continue; }
-        };
-        let neighbor_uuid = match store.nodes.get(&mv.neighbor_key) {
-            Some(n) => n.uuid,
-            None => { skipped += 1; continue; }
-        };
-
-        // Soft-delete old hub→neighbor relation
-        for rel in &mut store.relations {
-            if ((rel.source_key == mv.from_hub && rel.target_key == mv.neighbor_key)
-                || (rel.source_key == mv.neighbor_key && rel.target_key == mv.from_hub))
-                && !rel.deleted
-            {
-                rel.deleted = true;
-            }
-        }
-
-        // Create new section→neighbor relation
-        let new_rel = new_relation(
-            section_uuid, neighbor_uuid,
-            crate::store::RelationType::Auto,
-            0.5,
-            &mv.to_section, &mv.neighbor_key,
-        );
-        if store.add_relation(new_rel).is_ok() {
-            applied += 1;
-        }
-    }
-
-    (applied, skipped)
-}
-
-/// Find all file-level hubs that have section children to split into.
-pub fn find_differentiable_hubs(store: &Store) -> Vec<(String, usize, usize)> {
-    let graph = store.build_graph();
-    let threshold = graph.hub_threshold();
-
-    let mut hubs = Vec::new();
-    for key in graph.nodes() {
-        let deg = graph.degree(key);
-        if deg < threshold { continue; }
-        if key.contains('#') { continue; }
-
-        let section_count = section_children(store, key).len();
-        if section_count > 0 {
-            hubs.push((key.clone(), deg, section_count));
-        }
-    }
-
-    hubs.sort_by(|a, b| b.1.cmp(&a.1));
-    hubs
-}
-
-/// Triangle closure: for each node with degree >= min_degree, find pairs
-/// of its neighbors that aren't directly connected and have cosine
-/// similarity above sim_threshold. Add links between them.
-///
-/// This turns hub-spoke patterns into triangles, directly improving
-/// clustering coefficient and schema fit.
-pub fn triangle_close(
-    store: &mut Store,
-    min_degree: usize,
-    sim_threshold: f32,
-    max_links_per_hub: usize,
-) -> (usize, usize) {
-    let graph = store.build_graph();
-    let mut added = 0usize;
-    let mut hubs_processed = 0usize;
-
-    // Get nodes sorted by degree (highest first)
-    let mut candidates: Vec<(String, usize)> = graph.nodes().iter()
-        .map(|k| (k.clone(), graph.degree(k)))
-        .filter(|(_, d)| *d >= min_degree)
-        .collect();
-    candidates.sort_by(|a, b| b.1.cmp(&a.1));
-
-    for (hub_key, hub_deg) in &candidates {
-        let neighbors = graph.neighbor_keys(hub_key);
-        if neighbors.len() < 2 { continue; }
-
-        // Collect neighbor content for similarity
-        let neighbor_docs: Vec<(String, String)> = neighbors.iter()
-            .filter_map(|&k| {
-                store.nodes.get(k).map(|n| (k.to_string(), n.content.clone()))
-            })
-            .collect();
-
-        // Find unconnected pairs with high similarity
-        let mut pair_scores: Vec<(String, String, f32)> = Vec::new();
-        for i in 0..neighbor_docs.len() {
-            for j in (i + 1)..neighbor_docs.len() {
-                // Check if already connected
-                let n_i = graph.neighbor_keys(&neighbor_docs[i].0);
-                if n_i.contains(neighbor_docs[j].0.as_str()) { continue; }
-
-                let sim = similarity::cosine_similarity(
-                    &neighbor_docs[i].1, &neighbor_docs[j].1);
-                if sim >= sim_threshold {
-                    pair_scores.push((
-                        neighbor_docs[i].0.clone(),
-                        neighbor_docs[j].0.clone(),
-                        sim,
-                    ));
-                }
-            }
-        }
-
-        pair_scores.sort_by(|a, b| b.2.total_cmp(&a.2));
-        let to_add = pair_scores.len().min(max_links_per_hub);
-
-        if to_add > 0 {
-            println!("  {} (deg={}) — {} triangles to close (top {})",
-                hub_key, hub_deg, pair_scores.len(), to_add);
-
-            for (a, b, sim) in pair_scores.iter().take(to_add) {
-                let uuid_a = match store.nodes.get(a) { Some(n) => n.uuid, None => continue };
-                let uuid_b = match store.nodes.get(b) { Some(n) => n.uuid, None => continue };
-
-                let rel = new_relation(
-                    uuid_a, uuid_b,
-                    crate::store::RelationType::Auto,
-                    sim * 0.5,  // scale by similarity
-                    a, b,
-                );
-                if let Ok(()) = store.add_relation(rel) {
-                    added += 1;
-                }
-            }
-            hubs_processed += 1;
-        }
-    }
-
-    if added > 0 {
-        let _ = store.save();
-    }
-    (hubs_processed, added)
-}
-
-/// Link orphan nodes (degree < min_degree) to their most textually similar
-/// connected nodes. For each orphan, finds top-K nearest neighbors by
-/// cosine similarity and creates Auto links.
-/// Returns (orphans_linked, total_links_added).
-pub fn link_orphans(
-    store: &mut Store,
-    min_degree: usize,
-    links_per_orphan: usize,
-    sim_threshold: f32,
-) -> (usize, usize) {
-    let graph = store.build_graph();
-    let mut added = 0usize;
-    let mut orphans_linked = 0usize;
-
-    // Separate orphans from connected nodes
-    let orphans: Vec<String> = graph.nodes().iter()
-        .filter(|k| graph.degree(k) < min_degree)
-        .cloned()
-        .collect();
-
-    // Build candidate pool: connected nodes with their content
-    let candidates: Vec<(String, String)> = graph.nodes().iter()
-        .filter(|k| graph.degree(k) >= min_degree)
-        .filter_map(|k| store.nodes.get(k).map(|n| (k.clone(), n.content.clone())))
-        .collect();
-
-    if candidates.is_empty() { return (0, 0); }
-
-    for orphan_key in &orphans {
-        let orphan_content = match store.nodes.get(orphan_key) {
-            Some(n) => n.content.clone(),
-            None => continue,
-        };
-        if orphan_content.len() < 20 { continue; } // skip near-empty nodes
-
-        // Score against all candidates
-        let mut scores: Vec<(usize, f32)> = candidates.iter()
-            .enumerate()
-            .map(|(i, (_, content))| {
-                (i, similarity::cosine_similarity(&orphan_content, content))
-            })
-            .filter(|(_, s)| *s >= sim_threshold)
-            .collect();
-
-        scores.sort_by(|a, b| b.1.total_cmp(&a.1));
-        let to_link = scores.len().min(links_per_orphan);
-        if to_link == 0 { continue; }
-
-        let orphan_uuid = store.nodes.get(orphan_key).unwrap().uuid;
-
-        for &(idx, sim) in scores.iter().take(to_link) {
-            let target_key = &candidates[idx].0;
-            let target_uuid = match store.nodes.get(target_key) {
-                Some(n) => n.uuid,
-                None => continue,
-            };
-
-            let rel = new_relation(
-                orphan_uuid, target_uuid,
-                crate::store::RelationType::Auto,
-                sim * 0.5,
-                orphan_key, target_key,
-            );
-            if store.add_relation(rel).is_ok() {
-                added += 1;
-            }
-        }
-        orphans_linked += 1;
-    }
-
-    if added > 0 {
-        let _ = store.save();
-    }
-    (orphans_linked, added)
-}
--- a/src/hippocampus/neuro/scoring.rs
+++ b/src/hippocampus/neuro/scoring.rs
@@ -126,43 +126,6 @@ pub fn replay_queue_with_graph(
    items
 }

-/// Detect interfering memory pairs: high text similarity but different communities
-pub fn detect_interference(
-    store: &Store,
-    graph: &Graph,
-    threshold: f32,
-) -> Vec<(String, String, f32)> {
-    use crate::similarity;
-
-    let communities = graph.communities();
-
-    // Only compare nodes within a reasonable set — take the most active ones
-    let mut docs: Vec<(String, String)> = store.nodes.iter()
-        .filter(|(_, n)| n.content.len() > 50) // skip tiny nodes
-        .map(|(k, n)| (k.clone(), n.content.clone()))
-        .collect();
-
-    // For large stores, sample to keep pairwise comparison feasible
-    if docs.len() > 200 {
-        docs.sort_by(|a, b| b.1.len().cmp(&a.1.len()));
-        docs.truncate(200);
-    }
-
-    let similar = similarity::pairwise_similar(&docs, threshold);
-
-    // Filter to pairs in different communities
-    similar.into_iter()
-        .filter(|(a, b, _)| {
-            let ca = communities.get(a);
-            let cb = communities.get(b);
-            match (ca, cb) {
-                (Some(a), Some(b)) => a != b,
-                _ => true, // if community unknown, flag it
-            }
-        })
-        .collect()
-}
-
 /// Agent allocation from the control loop.
 /// Agent types and counts are data-driven — add agents by adding
 /// entries to the counts map.
@@ -245,16 +208,11 @@ pub fn consolidation_plan_quick(store: &Store) -> ConsolidationPlan {
    consolidation_plan_inner(store, false)
 }

-fn consolidation_plan_inner(store: &Store, detect_interf: bool) -> ConsolidationPlan {
+fn consolidation_plan_inner(store: &Store, _detect_interf: bool) -> ConsolidationPlan {
    let graph = store.build_graph();
    let alpha = graph.degree_power_law_exponent();
    let gini = graph.degree_gini();
    let _avg_cc = graph.avg_clustering_coefficient();
-    let interference_count = if detect_interf {
-        detect_interference(store, &graph, 0.5).len()
-    } else {
-        0
-    };

    let episodic_count = store.nodes.iter()
        .filter(|(_, n)| matches!(n.node_type, crate::store::NodeType::EpisodicSession))
@@ -294,19 +252,6 @@ fn consolidation_plan_inner(store: &Store, detect_interf: bool) -> Consolidation
            "Gini={:.3} (target ≤0.4): high inequality → +50 linker", gini));
    }

-    // Interference: separator disambiguates confusable nodes
-    if interference_count > 100 {
-        plan.add("separator", 10);
-        plan.rationale.push(format!(
-            "Interference: {} pairs (target <50) → 10 separator", interference_count));
-    } else if interference_count > 20 {
-        plan.add("separator", 5);
-        plan.rationale.push(format!(
-            "Interference: {} pairs → 5 separator", interference_count));
-    } else if interference_count > 0 {
-        plan.add("separator", interference_count.min(3));
-    }
-
    // Organize: proportional to linker — synthesizes what linker connects
    let linker = plan.count("linker");
    plan.set("organize", linker / 2);
--- a/src/hippocampus/similarity.rs
+++ b/src/hippocampus/similarity.rs
@@ -1,140 +0,0 @@
-// Text similarity: Porter stemming + BM25
-//
-// Used for interference detection (similar content, different communities)
-// and schema fit scoring. Intentionally simple — ~100 lines, no
-// external dependencies.
-
-use std::collections::HashMap;
-
-/// Minimal Porter stemmer — handles the most common English suffixes.
-/// Not linguistically complete but good enough for similarity matching.
-/// Single allocation: works on one String buffer throughout.
-///
-/// If this is still a hot spot, replace the sequential suffix checks
-/// with a reversed-suffix trie: single pass from the end of the word
-/// matches the longest applicable suffix in O(suffix_len) instead of
-/// O(n_rules).
-pub(crate) fn stem(word: &str) -> String {
-    let mut w = word.to_lowercase();
-    if w.len() <= 3 { return w; }
-
-    strip_suffix_inplace(&mut w, "ation", "ate");
-    strip_suffix_inplace(&mut w, "ness", "");
-    strip_suffix_inplace(&mut w, "ment", "");
-    strip_suffix_inplace(&mut w, "ting", "t");
-    strip_suffix_inplace(&mut w, "ling", "l");
-    strip_suffix_inplace(&mut w, "ring", "r");
-    strip_suffix_inplace(&mut w, "ning", "n");
-    strip_suffix_inplace(&mut w, "ding", "d");
-    strip_suffix_inplace(&mut w, "ping", "p");
-    strip_suffix_inplace(&mut w, "ging", "g");
-    strip_suffix_inplace(&mut w, "ying", "y");
-    strip_suffix_inplace(&mut w, "ied", "y");
-    strip_suffix_inplace(&mut w, "ies", "y");
-    strip_suffix_inplace(&mut w, "ing", "");
-    strip_suffix_inplace(&mut w, "ed", "");
-    strip_suffix_inplace(&mut w, "ly", "");
-    strip_suffix_inplace(&mut w, "er", "");
-    strip_suffix_inplace(&mut w, "al", "");
-    strip_suffix_inplace(&mut w, "s", "");
-    w
-}
-
-fn strip_suffix_inplace(word: &mut String, suffix: &str, replacement: &str) {
-    if word.len() > suffix.len() + 2 && word.ends_with(suffix) {
-        word.truncate(word.len() - suffix.len());
-        word.push_str(replacement);
-    }
-}
-
-/// Tokenize and stem a text into a term frequency map
-pub(crate) fn term_frequencies(text: &str) -> HashMap<String, u32> {
-    let mut tf = HashMap::new();
-    for word in text.split(|c: char| !c.is_alphanumeric()) {
-        if word.len() > 2 {
-            let stemmed = stem(word);
-            *tf.entry(stemmed).or_default() += 1;
-        }
-    }
-    tf
-}
-
-/// Cosine similarity between two documents using stemmed term frequencies.
-/// Returns 0.0 for disjoint vocabularies, 1.0 for identical content.
-pub fn cosine_similarity(doc_a: &str, doc_b: &str) -> f32 {
-    let tf_a = term_frequencies(doc_a);
-    let tf_b = term_frequencies(doc_b);
-
-    if tf_a.is_empty() || tf_b.is_empty() {
-        return 0.0;
-    }
-
-    // Dot product
-    let mut dot = 0.0f64;
-    for (term, &freq_a) in &tf_a {
-        if let Some(&freq_b) = tf_b.get(term) {
-            dot += freq_a as f64 * freq_b as f64;
-        }
-    }
-
-    // Magnitudes
-    let mag_a: f64 = tf_a.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
-    let mag_b: f64 = tf_b.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
-
-    if mag_a < 1e-10 || mag_b < 1e-10 {
-        return 0.0;
-    }
-
-    (dot / (mag_a * mag_b)) as f32
-}
-
-/// Compute pairwise similarity for a set of documents.
-/// Returns pairs with similarity above threshold.
-pub fn pairwise_similar(
-    docs: &[(String, String)],  // (key, content)
-    threshold: f32,
-) -> Vec<(String, String, f32)> {
-    let mut results = Vec::new();
-
-    for i in 0..docs.len() {
-        for j in (i + 1)..docs.len() {
-            let sim = cosine_similarity(&docs[i].1, &docs[j].1);
-            if sim >= threshold {
-                results.push((docs[i].0.clone(), docs[j].0.clone(), sim));
-            }
-        }
-    }
-
-    results.sort_by(|a, b| b.2.total_cmp(&a.2));
-    results
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_stem() {
-        assert_eq!(stem("running"), "runn"); // -ning → n
-        assert_eq!(stem("talking"), "talk"); // not matched by specific consonant rules
-        assert_eq!(stem("slowly"), "slow"); // -ly
-        // The stemmer is minimal — it doesn't need to be perfect,
-        // just consistent enough that related words collide.
-        assert_eq!(stem("observations"), "observation"); // -s stripped, -ation stays (word too short after)
-    }
-
-    #[test]
-    fn test_cosine_identical() {
-        let text = "the quick brown fox jumps over the lazy dog";
-        let sim = cosine_similarity(text, text);
-        assert!((sim - 1.0).abs() < 0.01, "identical docs should have sim ~1.0, got {}", sim);
-    }
-
-    #[test]
-    fn test_cosine_different() {
-        let a = "kernel filesystem transaction restart handling";
-        let b = "cooking recipe chocolate cake baking temperature";
-        let sim = cosine_similarity(a, b);
-        assert!(sim < 0.1, "unrelated docs should have low sim, got {}", sim);
-    }
-}