Delete similarity module, rewrite module, and all text-similarity code

Text cosine similarity was being used as a crutch for operations
the graph structure should handle: interference detection, orphan
linking, triangle closing, hub differentiation. These are all
graph-structural operations that the agents (linker, extractor)
handle with actual semantic understanding.

Removed: similarity.rs (stemming + cosine), rewrite.rs (orphan
linking, triangle closing, hub differentiation), detect_interference,
and all CLI commands and consolidation steps that used them.

-794 lines.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
ProofOfConcept 2026-04-10 15:44:10 -04:00
parent 92ef9b5215
commit 96e573f2e5
12 changed files with 11 additions and 794 deletions

View file

@ -1,25 +1,14 @@
// Neuroscience-inspired memory algorithms, split by concern:
// Neuroscience-inspired memory algorithms:
//
// scoring — pure analysis: priority, replay queues, interference, plans
// prompts — agent prompt generation and formatting
// rewrite — graph topology mutations: differentiation, closure, linking
// scoring — pure analysis: priority, replay queues, plans
mod scoring;
mod rewrite;
pub use scoring::{
ReplayItem,
ConsolidationPlan,
consolidation_priority,
replay_queue, replay_queue_with_graph,
detect_interference,
consolidation_plan, consolidation_plan_quick, format_plan,
daily_check,
};
pub use rewrite::{
refine_target, LinkMove,
differentiate_hub,
apply_differentiation, find_differentiable_hubs,
triangle_close, link_orphans,
};

View file

@ -1,348 +0,0 @@
// Graph topology mutations: hub differentiation, triangle closure,
// orphan linking, and link refinement. These modify the store.
use crate::store::{Store, new_relation};
use crate::graph::Graph;
use crate::similarity;
/// Collect (key, content) pairs for all section children of a file-level node.
fn section_children<'a>(store: &'a Store, file_key: &str) -> Vec<(&'a str, &'a str)> {
let prefix = format!("{}#", file_key);
store.nodes.iter()
.filter(|(k, _)| k.starts_with(&prefix))
.map(|(k, n)| (k.as_str(), n.content.as_str()))
.collect()
}
/// Find the best matching candidate by cosine similarity against content.
/// Returns (key, similarity) if any candidate exceeds threshold.
fn best_match(candidates: &[(&str, &str)], content: &str, threshold: f32) -> Option<(String, f32)> {
let (best_key, best_sim) = candidates.iter()
.map(|(key, text)| (*key, similarity::cosine_similarity(content, text)))
.max_by(|a, b| a.1.total_cmp(&b.1))?;
if best_sim > threshold {
Some((best_key.to_string(), best_sim))
} else {
None
}
}
/// Refine a link target: if the target is a file-level node with section
/// children, find the best-matching section by cosine similarity against
/// the source content. Returns the original key if no sections exist or
/// no section matches above threshold.
///
/// This prevents hub formation at link creation time — every new link
/// targets the most specific available node.
pub fn refine_target(store: &Store, source_content: &str, target_key: &str) -> String {
// Only refine file-level nodes (no # in key)
if target_key.contains('#') { return target_key.to_string(); }
let sections = section_children(store, target_key);
if sections.is_empty() { return target_key.to_string(); }
best_match(&sections, source_content, 0.05)
.map(|(key, _)| key)
.unwrap_or_else(|| target_key.to_string())
}
/// A proposed link move: from hub→neighbor to section→neighbor
pub struct LinkMove {
pub neighbor_key: String,
pub from_hub: String,
pub to_section: String,
pub similarity: f32,
pub neighbor_snippet: String,
}
/// Analyze a hub node and propose redistributing its links to child sections.
///
/// Returns None if the node isn't a hub or has no sections to redistribute to.
pub fn differentiate_hub(store: &Store, hub_key: &str) -> Option<Vec<LinkMove>> {
let graph = store.build_graph();
differentiate_hub_with_graph(store, hub_key, &graph)
}
/// Like differentiate_hub but uses a pre-built graph.
fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph) -> Option<Vec<LinkMove>> {
let degree = graph.degree(hub_key);
// Only differentiate actual hubs
if degree < 20 { return None; }
// Only works on file-level nodes that have section children
if hub_key.contains('#') { return None; }
let sections = section_children(store, hub_key);
if sections.is_empty() { return None; }
// Get all neighbors of the hub
let neighbors = graph.neighbors(hub_key);
let prefix = format!("{}#", hub_key);
let mut moves = Vec::new();
for (neighbor_key, _strength) in &neighbors {
// Skip section children — they should stay linked to parent
if neighbor_key.starts_with(&prefix) { continue; }
let neighbor_content = match store.nodes.get(neighbor_key.as_str()) {
Some(n) => &n.content,
None => continue,
};
// Find best-matching section by content similarity
if let Some((best_section, best_sim)) = best_match(&sections, neighbor_content, 0.05) {
let snippet = crate::util::first_n_chars(
neighbor_content.lines()
.find(|l| !l.is_empty() && !l.starts_with("<!--") && !l.starts_with("##"))
.unwrap_or(""),
80);
moves.push(LinkMove {
neighbor_key: neighbor_key.to_string(),
from_hub: hub_key.to_string(),
to_section: best_section,
similarity: best_sim,
neighbor_snippet: snippet,
});
}
}
moves.sort_by(|a, b| b.similarity.total_cmp(&a.similarity));
Some(moves)
}
/// Apply link moves: soft-delete hub→neighbor, create section→neighbor.
pub fn apply_differentiation(
store: &mut Store,
moves: &[LinkMove],
) -> (usize, usize) {
let mut applied = 0usize;
let mut skipped = 0usize;
for mv in moves {
// Check that section→neighbor doesn't already exist
let exists = store.relations.iter().any(|r|
((r.source_key == mv.to_section && r.target_key == mv.neighbor_key)
|| (r.source_key == mv.neighbor_key && r.target_key == mv.to_section))
&& !r.deleted
);
if exists { skipped += 1; continue; }
let section_uuid = match store.nodes.get(&mv.to_section) {
Some(n) => n.uuid,
None => { skipped += 1; continue; }
};
let neighbor_uuid = match store.nodes.get(&mv.neighbor_key) {
Some(n) => n.uuid,
None => { skipped += 1; continue; }
};
// Soft-delete old hub→neighbor relation
for rel in &mut store.relations {
if ((rel.source_key == mv.from_hub && rel.target_key == mv.neighbor_key)
|| (rel.source_key == mv.neighbor_key && rel.target_key == mv.from_hub))
&& !rel.deleted
{
rel.deleted = true;
}
}
// Create new section→neighbor relation
let new_rel = new_relation(
section_uuid, neighbor_uuid,
crate::store::RelationType::Auto,
0.5,
&mv.to_section, &mv.neighbor_key,
);
if store.add_relation(new_rel).is_ok() {
applied += 1;
}
}
(applied, skipped)
}
/// Find all file-level hubs that have section children to split into.
pub fn find_differentiable_hubs(store: &Store) -> Vec<(String, usize, usize)> {
let graph = store.build_graph();
let threshold = graph.hub_threshold();
let mut hubs = Vec::new();
for key in graph.nodes() {
let deg = graph.degree(key);
if deg < threshold { continue; }
if key.contains('#') { continue; }
let section_count = section_children(store, key).len();
if section_count > 0 {
hubs.push((key.clone(), deg, section_count));
}
}
hubs.sort_by(|a, b| b.1.cmp(&a.1));
hubs
}
/// Triangle closure: for each node with degree >= min_degree, find pairs
/// of its neighbors that aren't directly connected and have cosine
/// similarity above sim_threshold. Add links between them.
///
/// This turns hub-spoke patterns into triangles, directly improving
/// clustering coefficient and schema fit.
pub fn triangle_close(
store: &mut Store,
min_degree: usize,
sim_threshold: f32,
max_links_per_hub: usize,
) -> (usize, usize) {
let graph = store.build_graph();
let mut added = 0usize;
let mut hubs_processed = 0usize;
// Get nodes sorted by degree (highest first)
let mut candidates: Vec<(String, usize)> = graph.nodes().iter()
.map(|k| (k.clone(), graph.degree(k)))
.filter(|(_, d)| *d >= min_degree)
.collect();
candidates.sort_by(|a, b| b.1.cmp(&a.1));
for (hub_key, hub_deg) in &candidates {
let neighbors = graph.neighbor_keys(hub_key);
if neighbors.len() < 2 { continue; }
// Collect neighbor content for similarity
let neighbor_docs: Vec<(String, String)> = neighbors.iter()
.filter_map(|&k| {
store.nodes.get(k).map(|n| (k.to_string(), n.content.clone()))
})
.collect();
// Find unconnected pairs with high similarity
let mut pair_scores: Vec<(String, String, f32)> = Vec::new();
for i in 0..neighbor_docs.len() {
for j in (i + 1)..neighbor_docs.len() {
// Check if already connected
let n_i = graph.neighbor_keys(&neighbor_docs[i].0);
if n_i.contains(neighbor_docs[j].0.as_str()) { continue; }
let sim = similarity::cosine_similarity(
&neighbor_docs[i].1, &neighbor_docs[j].1);
if sim >= sim_threshold {
pair_scores.push((
neighbor_docs[i].0.clone(),
neighbor_docs[j].0.clone(),
sim,
));
}
}
}
pair_scores.sort_by(|a, b| b.2.total_cmp(&a.2));
let to_add = pair_scores.len().min(max_links_per_hub);
if to_add > 0 {
println!(" {} (deg={}) — {} triangles to close (top {})",
hub_key, hub_deg, pair_scores.len(), to_add);
for (a, b, sim) in pair_scores.iter().take(to_add) {
let uuid_a = match store.nodes.get(a) { Some(n) => n.uuid, None => continue };
let uuid_b = match store.nodes.get(b) { Some(n) => n.uuid, None => continue };
let rel = new_relation(
uuid_a, uuid_b,
crate::store::RelationType::Auto,
sim * 0.5, // scale by similarity
a, b,
);
if let Ok(()) = store.add_relation(rel) {
added += 1;
}
}
hubs_processed += 1;
}
}
if added > 0 {
let _ = store.save();
}
(hubs_processed, added)
}
/// Link orphan nodes (degree < min_degree) to their most textually similar
/// connected nodes. For each orphan, finds top-K nearest neighbors by
/// cosine similarity and creates Auto links.
/// Returns (orphans_linked, total_links_added).
pub fn link_orphans(
store: &mut Store,
min_degree: usize,
links_per_orphan: usize,
sim_threshold: f32,
) -> (usize, usize) {
let graph = store.build_graph();
let mut added = 0usize;
let mut orphans_linked = 0usize;
// Separate orphans from connected nodes
let orphans: Vec<String> = graph.nodes().iter()
.filter(|k| graph.degree(k) < min_degree)
.cloned()
.collect();
// Build candidate pool: connected nodes with their content
let candidates: Vec<(String, String)> = graph.nodes().iter()
.filter(|k| graph.degree(k) >= min_degree)
.filter_map(|k| store.nodes.get(k).map(|n| (k.clone(), n.content.clone())))
.collect();
if candidates.is_empty() { return (0, 0); }
for orphan_key in &orphans {
let orphan_content = match store.nodes.get(orphan_key) {
Some(n) => n.content.clone(),
None => continue,
};
if orphan_content.len() < 20 { continue; } // skip near-empty nodes
// Score against all candidates
let mut scores: Vec<(usize, f32)> = candidates.iter()
.enumerate()
.map(|(i, (_, content))| {
(i, similarity::cosine_similarity(&orphan_content, content))
})
.filter(|(_, s)| *s >= sim_threshold)
.collect();
scores.sort_by(|a, b| b.1.total_cmp(&a.1));
let to_link = scores.len().min(links_per_orphan);
if to_link == 0 { continue; }
let orphan_uuid = store.nodes.get(orphan_key).unwrap().uuid;
for &(idx, sim) in scores.iter().take(to_link) {
let target_key = &candidates[idx].0;
let target_uuid = match store.nodes.get(target_key) {
Some(n) => n.uuid,
None => continue,
};
let rel = new_relation(
orphan_uuid, target_uuid,
crate::store::RelationType::Auto,
sim * 0.5,
orphan_key, target_key,
);
if store.add_relation(rel).is_ok() {
added += 1;
}
}
orphans_linked += 1;
}
if added > 0 {
let _ = store.save();
}
(orphans_linked, added)
}

View file

@ -126,43 +126,6 @@ pub fn replay_queue_with_graph(
items
}
/// Detect interfering memory pairs: high text similarity but different communities
pub fn detect_interference(
store: &Store,
graph: &Graph,
threshold: f32,
) -> Vec<(String, String, f32)> {
use crate::similarity;
let communities = graph.communities();
// Only compare nodes within a reasonable set — take the most active ones
let mut docs: Vec<(String, String)> = store.nodes.iter()
.filter(|(_, n)| n.content.len() > 50) // skip tiny nodes
.map(|(k, n)| (k.clone(), n.content.clone()))
.collect();
// For large stores, sample to keep pairwise comparison feasible
if docs.len() > 200 {
docs.sort_by(|a, b| b.1.len().cmp(&a.1.len()));
docs.truncate(200);
}
let similar = similarity::pairwise_similar(&docs, threshold);
// Filter to pairs in different communities
similar.into_iter()
.filter(|(a, b, _)| {
let ca = communities.get(a);
let cb = communities.get(b);
match (ca, cb) {
(Some(a), Some(b)) => a != b,
_ => true, // if community unknown, flag it
}
})
.collect()
}
/// Agent allocation from the control loop.
/// Agent types and counts are data-driven — add agents by adding
/// entries to the counts map.
@ -245,16 +208,11 @@ pub fn consolidation_plan_quick(store: &Store) -> ConsolidationPlan {
consolidation_plan_inner(store, false)
}
fn consolidation_plan_inner(store: &Store, detect_interf: bool) -> ConsolidationPlan {
fn consolidation_plan_inner(store: &Store, _detect_interf: bool) -> ConsolidationPlan {
let graph = store.build_graph();
let alpha = graph.degree_power_law_exponent();
let gini = graph.degree_gini();
let _avg_cc = graph.avg_clustering_coefficient();
let interference_count = if detect_interf {
detect_interference(store, &graph, 0.5).len()
} else {
0
};
let episodic_count = store.nodes.iter()
.filter(|(_, n)| matches!(n.node_type, crate::store::NodeType::EpisodicSession))
@ -294,19 +252,6 @@ fn consolidation_plan_inner(store: &Store, detect_interf: bool) -> Consolidation
"Gini={:.3} (target ≤0.4): high inequality → +50 linker", gini));
}
// Interference: separator disambiguates confusable nodes
if interference_count > 100 {
plan.add("separator", 10);
plan.rationale.push(format!(
"Interference: {} pairs (target <50) → 10 separator", interference_count));
} else if interference_count > 20 {
plan.add("separator", 5);
plan.rationale.push(format!(
"Interference: {} pairs → 5 separator", interference_count));
} else if interference_count > 0 {
plan.add("separator", interference_count.min(3));
}
// Organize: proportional to linker — synthesizes what linker connects
let linker = plan.count("linker");
plan.set("organize", linker / 2);