diff --git a/src/capnp_store.rs b/src/capnp_store.rs
index 2c503fa..ab05108 100644
--- a/src/capnp_store.rs
+++ b/src/capnp_store.rs
@@ -17,7 +17,7 @@ use regex::Regex;
 use serde::{Deserialize, Serialize};
 use uuid::Uuid;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::env;
 use std::fs;
 use std::io::{BufReader, BufWriter, Write as IoWrite};
@@ -918,6 +918,163 @@ impl Store {
         (decayed, pruned)
     }
 
+    /// Bulk recategorize nodes using rule-based logic.
+    /// Returns (changed, unchanged) counts.
+    pub fn fix_categories(&mut self) -> Result<(usize, usize), String> {
+        // Files that should stay core (identity-defining)
+        let core_files = ["identity.md", "kent.md"];
+
+        // Files that should be tech
+        let tech_files = [
+            "language-theory.md", "zoom-navigation.md",
+            "rust-conversion.md", "poc-architecture.md",
+        ];
+        let tech_prefixes = ["design-"];
+
+        // Files that should be obs (self-observation, skills, reflections)
+        let obs_files = [
+            "reflections.md", "reflections-zoom.md", "differentiation.md",
+            "cognitive-modes.md", "paper-notes.md", "inner-life.md",
+            "conversation.md", "interests.md", "stuck-toolkit.md",
+        ];
+        let obs_prefixes = ["skill-", "worked-example-"];
+
+        let mut changed = 0;
+        let mut unchanged = 0;
+
+        let keys: Vec<String> = self.nodes.keys().cloned().collect();
+        for key in &keys {
+            let node = self.nodes.get(key).unwrap();
+            if node.category != Category::Core {
+                unchanged += 1;
+                continue;
+            }
+
+            // Determine what file this node belongs to
+            let file = key.split('#').next().unwrap_or(key);
+
+            let new_cat = if core_files.iter().any(|&f| file == f) {
+                None // keep as core
+            } else if tech_files.iter().any(|&f| file == f)
+                || tech_prefixes.iter().any(|p| file.starts_with(p))
+            {
+                Some(Category::Technical)
+            } else if obs_files.iter().any(|&f| file == f)
+                || obs_prefixes.iter().any(|p| file.starts_with(p))
+            {
+                Some(Category::Observation)
+            } else {
+                // Default: anything else that was core probably shouldn't be
+                Some(Category::General)
+            };
+
+            if let Some(cat) = new_cat {
+                let node = self.nodes.get_mut(key).unwrap();
+                node.category = cat;
+                node.version += 1;
+                changed += 1;
+            } else {
+                unchanged += 1;
+            }
+        }
+
+        if changed > 0 {
+            let updated: Vec<_> = self.nodes.values().cloned().collect();
+            self.append_nodes(&updated)?;
+        }
+
+        Ok((changed, unchanged))
+    }
+
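The rule precedence in `fix_categories` is load-bearing: explicit core files win first, then tech files and prefixes, then obs, and any remaining ex-core node falls through to General. A minimal standalone sketch of that decision order, using string labels in place of this crate's `Category` enum (the `recategorize` name and the labels are illustrative, not from the diff):

```rust
/// Illustrative sketch of the precedence in `fix_categories`.
/// None means "keep as core", mirroring the Option<Category> in the diff.
fn recategorize(file: &str) -> Option<&'static str> {
    let core_files = ["identity.md", "kent.md"];
    let tech_prefixes = ["design-"];
    let obs_prefixes = ["skill-", "worked-example-"];

    if core_files.contains(&file) {
        None // identity-defining files stay core
    } else if tech_prefixes.iter().any(|p| file.starts_with(p)) {
        Some("tech")
    } else if obs_prefixes.iter().any(|p| file.starts_with(p)) {
        Some("obs")
    } else {
        Some("general") // former core nodes default downward
    }
}

fn main() {
    assert_eq!(recategorize("identity.md"), None);
    assert_eq!(recategorize("design-capnp.md"), Some("tech"));
    assert_eq!(recategorize("skill-debugging.md"), Some("obs"));
    assert_eq!(recategorize("random-notes.md"), Some("general"));
}
```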
+    /// Cap node degree by soft-deleting edges from mega-hubs.
+    /// First prunes weakest Auto edges, then prunes Link edges to
+    /// high-degree targets (they have alternative paths).
+    /// Returns (hubs_capped, edges_pruned).
+    pub fn cap_degree(&mut self, max_degree: usize) -> Result<(usize, usize), String> {
+        // Build per-node degree counts (for Link pruning priority)
+        let mut node_degree: HashMap<String, usize> = HashMap::new();
+        for rel in &self.relations {
+            if rel.deleted { continue; }
+            *node_degree.entry(rel.source_key.clone()).or_default() += 1;
+            *node_degree.entry(rel.target_key.clone()).or_default() += 1;
+        }
+
+        // Build per-node edge lists
+        let mut node_edges: HashMap<String, Vec<usize>> = HashMap::new();
+        for (i, rel) in self.relations.iter().enumerate() {
+            if rel.deleted { continue; }
+            node_edges.entry(rel.source_key.clone()).or_default().push(i);
+            node_edges.entry(rel.target_key.clone()).or_default().push(i);
+        }
+
+        let mut to_delete: HashSet<usize> = HashSet::new();
+        let mut hubs_capped = 0;
+
+        for (key, edge_indices) in &node_edges {
+            let active: Vec<usize> = edge_indices.iter()
+                .filter(|&&i| !to_delete.contains(&i))
+                .copied()
+                .collect();
+            if active.len() <= max_degree { continue; }
+
+            // Phase 1: prune Auto edges (weakest first)
+            let mut auto_indices: Vec<(usize, f32)> = Vec::new();
+            let mut link_indices: Vec<(usize, usize)> = Vec::new(); // (idx, other_degree)
+            for &i in &active {
+                let rel = &self.relations[i];
+                if rel.rel_type == RelationType::Auto {
+                    auto_indices.push((i, rel.strength));
+                } else {
+                    // For Link/Causal, rank by the other endpoint's degree
+                    let other = if &rel.source_key == key {
+                        &rel.target_key
+                    } else {
+                        &rel.source_key
+                    };
+                    let other_deg = node_degree.get(other).copied().unwrap_or(0);
+                    link_indices.push((i, other_deg));
+                }
+            }
+
+            let excess = active.len() - max_degree;
+
+            // Sort Auto by strength ascending
+            auto_indices.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+            let auto_prune = excess.min(auto_indices.len());
+            for &(i, _) in auto_indices.iter().take(auto_prune) {
+                to_delete.insert(i);
+            }
+
+            // Phase 2: if still over cap, prune Link edges to high-degree targets
+            let remaining_excess = excess.saturating_sub(auto_prune);
+            if remaining_excess > 0 {
+                // Sort by other endpoint degree descending (prune links
+                // to well-connected nodes first — they have alternative paths)
+                link_indices.sort_by(|a, b| b.1.cmp(&a.1));
+                let link_prune = remaining_excess.min(link_indices.len());
+                for &(i, _) in link_indices.iter().take(link_prune) {
+                    to_delete.insert(i);
+                }
+            }
+
+            hubs_capped += 1;
+        }
+
+        // Apply deletions
+        let mut pruned_rels = Vec::new();
+        for &i in &to_delete {
+            self.relations[i].deleted = true;
+            self.relations[i].version += 1;
+            pruned_rels.push(self.relations[i].clone());
+        }
+
+        if !pruned_rels.is_empty() {
+            self.append_relations(&pruned_rels)?;
+        }
+
+        Ok((hubs_capped, to_delete.len()))
+    }
+
     pub fn category_counts(&self) -> HashMap<&str, usize> {
         let mut counts = HashMap::new();
         for node in self.nodes.values() {
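The two sort orders above carry the whole policy of `cap_degree`: Auto edges are dropped weakest-first, while Link/Causal edges are dropped highest-degree-neighbor-first, on the theory that a well-connected neighbor stays reachable through other paths. A toy demonstration of both orderings (edge data invented for illustration):

```rust
fn main() {
    // Phase 1: (edge_index, strength) pairs; weakest Auto edge pruned first.
    let mut auto: Vec<(usize, f32)> = vec![(10, 0.9), (11, 0.2), (12, 0.5)];
    auto.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    assert_eq!(auto[0].0, 11); // the 0.2-strength edge goes first

    // Phase 2: (edge_index, other_endpoint_degree) pairs; links to the
    // best-connected neighbors pruned first.
    let mut links: Vec<(usize, usize)> = vec![(20, 4), (21, 61), (22, 12)];
    links.sort_by(|a, b| b.1.cmp(&a.1));
    assert_eq!(links[0].0, 21); // the link to the degree-61 hub goes first
}
```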
diff --git a/src/digest.rs b/src/digest.rs
index 7fe1559..88c84b2 100644
--- a/src/digest.rs
+++ b/src/digest.rs
@@ -922,6 +922,27 @@ pub fn consolidate_full(store: &mut Store) -> Result<(), String> {
         }
     }
 
+    // --- Step 3b: Link orphans ---
+    log.write("\n--- Step 3b: Link orphans ---")?;
+    println!("\n--- Linking orphan nodes ---");
+    *store = Store::load()?;
+
+    let (lo_orphans, lo_added) = neuro::link_orphans(store, 2, 3, 0.15);
+    log.write(&format!(" {} orphans, {} links added", lo_orphans, lo_added))?;
+
+    // --- Step 3c: Cap degree ---
+    log.write("\n--- Step 3c: Cap degree ---")?;
+    println!("\n--- Capping node degree ---");
+    *store = Store::load()?;
+
+    match store.cap_degree(50) {
+        Ok((hubs, pruned)) => {
+            store.save()?;
+            log.write(&format!(" {} hubs capped, {} edges pruned", hubs, pruned))?;
+        }
+        Err(e) => log.write(&format!(" ERROR: {}", e))?,
+    }
+
     // --- Step 4: Digest auto ---
     log.write("\n--- Step 4: Digest auto ---")?;
     println!("\n--- Generating missing digests ---");
diff --git a/src/graph.rs b/src/graph.rs
index c7cfc6c..fbb8dc3 100644
--- a/src/graph.rs
+++ b/src/graph.rs
@@ -421,6 +421,11 @@ fn label_propagation(
     adj: &HashMap<String, Vec<Edge>>,
     max_iterations: u32,
 ) -> HashMap<String, usize> {
+    // Only consider edges above this strength for community votes.
+    // Weak auto-links from triangle closure (0.15-0.35) bridge
+    // unrelated clusters — filtering them lets natural communities emerge.
+    let min_strength: f32 = 0.3;
+
     // Initialize: each node gets its own label
     let key_vec: Vec<String> = keys.iter().cloned().collect();
     let mut labels: HashMap<String, usize> = key_vec.iter()
@@ -438,9 +443,10 @@
         };
         if edges.is_empty() { continue; }
 
-        // Count weighted votes for each label
+        // Count weighted votes for each label (skip weak edges)
         let mut votes: HashMap<usize, f32> = HashMap::new();
         for edge in edges {
+            if edge.strength < min_strength { continue; }
             if let Some(&label) = labels.get(&edge.target) {
                 *votes.entry(label).or_default() += edge.strength;
             }
diff --git a/src/main.rs b/src/main.rs
index d6f59a8..b9649b7 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -76,6 +76,9 @@ fn main() {
         "wrong" => cmd_wrong(&args[2..]),
         "gap" => cmd_gap(&args[2..]),
         "categorize" => cmd_categorize(&args[2..]),
+        "fix-categories" => cmd_fix_categories(),
+        "cap-degree" => cmd_cap_degree(&args[2..]),
+        "link-orphans" => cmd_link_orphans(&args[2..]),
         "decay" => cmd_decay(),
         "consolidate-batch" => cmd_consolidate_batch(&args[2..]),
         "log" => cmd_log(),
@@ -325,6 +328,38 @@ fn cmd_categorize(args: &[String]) -> Result<(), String> {
     Ok(())
 }
 
+fn cmd_fix_categories() -> Result<(), String> {
+    let mut store = capnp_store::Store::load()?;
+    let before = format!("{:?}", store.category_counts());
+    let (changed, kept) = store.fix_categories()?;
+    store.save()?;
+    let after = format!("{:?}", store.category_counts());
+    println!("Category fix: {} changed, {} kept", changed, kept);
+    println!("\nBefore: {}", before);
+    println!("After: {}", after);
+    Ok(())
+}
+
+fn cmd_link_orphans(args: &[String]) -> Result<(), String> {
+    let min_deg: usize = args.first().and_then(|s| s.parse().ok()).unwrap_or(2);
+    let links_per: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(3);
+    let sim_thresh: f32 = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(0.15);
+    let mut store = capnp_store::Store::load()?;
+    let (orphans, links) = neuro::link_orphans(&mut store, min_deg, links_per, sim_thresh);
+    println!("Linked {} orphans, added {} connections (min_degree={}, links_per={}, sim>{})",
+        orphans, links, min_deg, links_per, sim_thresh);
+    Ok(())
+}
+
+fn cmd_cap_degree(args: &[String]) -> Result<(), String> {
+    let max_deg: usize = args.first().and_then(|s| s.parse().ok()).unwrap_or(50);
+    let mut store = capnp_store::Store::load()?;
+    let (hubs, pruned) = store.cap_degree(max_deg)?;
+    store.save()?;
+    println!("Capped {} hubs, pruned {} edges (max_degree={})", hubs, pruned, max_deg);
+    Ok(())
+}
+
 fn cmd_decay() -> Result<(), String> {
     let mut store = capnp_store::Store::load()?;
     let (decayed, pruned) = store.decay();
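All three new subcommands parse optional positional arguments the same way: take the nth argument if it parses, otherwise fall back to a default, so `cap-degree` and `cap-degree 40` both work. A sketch of that idiom factored into a helper (the `arg_or` function is hypothetical; the diff keeps the parsing inline):

```rust
use std::str::FromStr;

// Hypothetical helper for the parse-or-default idiom shared by
// cmd_cap_degree, cmd_link_orphans, and cmd_fix_categories' siblings.
fn arg_or<T: FromStr>(args: &[String], n: usize, default: T) -> T {
    args.get(n).and_then(|s| s.parse().ok()).unwrap_or(default)
}

fn main() {
    let args: Vec<String> = vec!["40".to_string()];
    let max_deg: usize = arg_or(&args, 0, 50);    // "cap-degree 40" -> 40
    let sim_thresh: f32 = arg_or(&args, 2, 0.15); // absent -> default 0.15
    println!("max_degree={} sim_threshold={}", max_deg, sim_thresh);
}
```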
diff --git a/src/neuro.rs b/src/neuro.rs
index 3eaa540..472505b 100644
--- a/src/neuro.rs
+++ b/src/neuro.rs
@@ -1059,3 +1059,79 @@ pub fn triangle_close(
     }
     (hubs_processed, added)
 }
+
+/// Link orphan nodes (degree < min_degree) to their most textually similar
+/// connected nodes. For each orphan, finds top-K nearest neighbors by
+/// cosine similarity and creates Auto links.
+/// Returns (orphans_linked, total_links_added).
+pub fn link_orphans(
+    store: &mut Store,
+    min_degree: usize,
+    links_per_orphan: usize,
+    sim_threshold: f32,
+) -> (usize, usize) {
+    let graph = store.build_graph();
+    let mut added = 0usize;
+    let mut orphans_linked = 0usize;
+
+    // Separate orphans from connected nodes
+    let orphans: Vec<String> = graph.nodes().iter()
+        .filter(|k| graph.degree(k) < min_degree)
+        .cloned()
+        .collect();
+
+    // Build candidate pool: connected nodes with their content
+    let candidates: Vec<(String, String)> = graph.nodes().iter()
+        .filter(|k| graph.degree(k) >= min_degree)
+        .filter_map(|k| store.nodes.get(k).map(|n| (k.clone(), n.content.clone())))
+        .collect();
+
+    if candidates.is_empty() { return (0, 0); }
+
+    for orphan_key in &orphans {
+        let orphan_content = match store.nodes.get(orphan_key) {
+            Some(n) => n.content.clone(),
+            None => continue,
+        };
+        if orphan_content.len() < 20 { continue; } // skip near-empty nodes
+
+        // Score against all candidates
+        let mut scores: Vec<(usize, f32)> = candidates.iter()
+            .enumerate()
+            .map(|(i, (_, content))| {
+                (i, similarity::cosine_similarity(&orphan_content, content))
+            })
+            .filter(|(_, s)| *s >= sim_threshold)
+            .collect();
+
+        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+        let to_link = scores.len().min(links_per_orphan);
+        if to_link == 0 { continue; }
+
+        let orphan_uuid = store.nodes.get(orphan_key).unwrap().uuid;
+
+        for &(idx, sim) in scores.iter().take(to_link) {
+            let target_key = &candidates[idx].0;
+            let target_uuid = match store.nodes.get(target_key) {
+                Some(n) => n.uuid,
+                None => continue,
+            };
+
+            let rel = Store::new_relation(
+                orphan_uuid, target_uuid,
+                crate::capnp_store::RelationType::Auto,
+                sim * 0.5,
+                orphan_key, target_key,
+            );
+            if store.add_relation(rel).is_ok() {
+                added += 1;
+            }
+        }
+        orphans_linked += 1;
+    }
+
+    if added > 0 {
+        let _ = store.save();
+    }
+    (orphans_linked, added)
+}
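`link_orphans` relies on `similarity::cosine_similarity(&str, &str) -> f32`, which this diff does not show. One plausible reading of that signature is cosine over bag-of-words term frequencies; the sketch below is an assumption made to keep the 0.15 threshold and the `sim * 0.5` link strength interpretable, not the crate's actual implementation:

```rust
use std::collections::HashMap;

// ASSUMPTION: whitespace-token term-frequency vectors; the real
// similarity module may tokenize or weight differently.
fn term_freqs(s: &str) -> HashMap<&str, f32> {
    let mut m = HashMap::new();
    for w in s.split_whitespace() {
        *m.entry(w).or_insert(0.0) += 1.0;
    }
    m
}

fn cosine_similarity(a: &str, b: &str) -> f32 {
    let (va, vb) = (term_freqs(a), term_freqs(b));
    let dot: f32 = va.iter()
        .filter_map(|(w, x)| vb.get(w).map(|y| x * y))
        .sum();
    let norm = |v: &HashMap<&str, f32>| v.values().map(|x| x * x).sum::<f32>().sqrt();
    let denom = norm(&va) * norm(&vb);
    if denom == 0.0 { 0.0 } else { dot / denom }
}

fn main() {
    let sim = cosine_similarity("graph memory store", "memory store design");
    assert!(sim >= 0.15); // would clear the default sim_threshold
    println!("sim = {:.3}, Auto link strength = {:.3}", sim, sim * 0.5);
}
```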