graph health: fix-categories, cap-degree, link-orphans
Three new tools for structural graph health:

- fix-categories: rule-based recategorization fixing core inflation (225 → 26 core nodes). Only identity.md and kent.md stay core; everything else is reclassified to tech/obs/gen by file name and prefix rules.
- cap-degree: two-phase degree capping. First prunes the weakest Auto edges, then prunes Link edges to high-degree targets (they have alternative paths). Brought max degree from 919 → 50.
- link-orphans: connects degree-0/1 nodes to the most textually similar connected nodes via cosine similarity. Linked 614 orphans.

Also: community detection now filters edges below strength 0.3, preventing weak auto-links from merging unrelated communities.

Pipeline updated: consolidate-full now runs link-orphans + cap-degree instead of triangle-close (which was counterproductive — it densified hub neighborhoods instead of building bridges).

Net effect: Gini 0.754 → 0.546, max degree 919 → 50.
This commit is contained in:
parent
6c7bfb9ec4
commit
94dbca6018
5 changed files with 297 additions and 2 deletions
|
|
@ -17,7 +17,7 @@ use regex::Regex;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::io::{BufReader, BufWriter, Write as IoWrite};
|
use std::io::{BufReader, BufWriter, Write as IoWrite};
|
||||||
|
|
@ -918,6 +918,163 @@ impl Store {
|
||||||
(decayed, pruned)
|
(decayed, pruned)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Bulk recategorize nodes using rule-based logic.
|
||||||
|
/// Returns (changed, unchanged) counts.
|
||||||
|
pub fn fix_categories(&mut self) -> Result<(usize, usize), String> {
|
||||||
|
// Files that should stay core (identity-defining)
|
||||||
|
let core_files = ["identity.md", "kent.md"];
|
||||||
|
|
||||||
|
// Files that should be tech
|
||||||
|
let tech_files = [
|
||||||
|
"language-theory.md", "zoom-navigation.md",
|
||||||
|
"rust-conversion.md", "poc-architecture.md",
|
||||||
|
];
|
||||||
|
let tech_prefixes = ["design-"];
|
||||||
|
|
||||||
|
// Files that should be obs (self-observation, skills, reflections)
|
||||||
|
let obs_files = [
|
||||||
|
"reflections.md", "reflections-zoom.md", "differentiation.md",
|
||||||
|
"cognitive-modes.md", "paper-notes.md", "inner-life.md",
|
||||||
|
"conversation.md", "interests.md", "stuck-toolkit.md",
|
||||||
|
];
|
||||||
|
let obs_prefixes = ["skill-", "worked-example-"];
|
||||||
|
|
||||||
|
let mut changed = 0;
|
||||||
|
let mut unchanged = 0;
|
||||||
|
|
||||||
|
let keys: Vec<String> = self.nodes.keys().cloned().collect();
|
||||||
|
for key in &keys {
|
||||||
|
let node = self.nodes.get(key).unwrap();
|
||||||
|
if node.category != Category::Core {
|
||||||
|
unchanged += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine what file this node belongs to
|
||||||
|
let file = key.split('#').next().unwrap_or(key);
|
||||||
|
|
||||||
|
let new_cat = if core_files.iter().any(|&f| file == f) {
|
||||||
|
None // keep as core
|
||||||
|
} else if tech_files.iter().any(|&f| file == f)
|
||||||
|
|| tech_prefixes.iter().any(|p| file.starts_with(p))
|
||||||
|
{
|
||||||
|
Some(Category::Technical)
|
||||||
|
} else if obs_files.iter().any(|&f| file == f)
|
||||||
|
|| obs_prefixes.iter().any(|p| file.starts_with(p))
|
||||||
|
{
|
||||||
|
Some(Category::Observation)
|
||||||
|
} else {
|
||||||
|
// Default: anything else that was core probably shouldn't be
|
||||||
|
Some(Category::General)
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(cat) = new_cat {
|
||||||
|
let node = self.nodes.get_mut(key).unwrap();
|
||||||
|
node.category = cat;
|
||||||
|
node.version += 1;
|
||||||
|
changed += 1;
|
||||||
|
} else {
|
||||||
|
unchanged += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if changed > 0 {
|
||||||
|
let updated: Vec<Node> = self.nodes.values().cloned().collect();
|
||||||
|
self.append_nodes(&updated)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((changed, unchanged))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cap node degree by soft-deleting edges from mega-hubs.
|
||||||
|
/// First prunes weakest Auto edges, then prunes Link edges to
|
||||||
|
/// high-degree targets (they have alternative paths).
|
||||||
|
/// Returns (hubs_capped, edges_pruned).
|
||||||
|
pub fn cap_degree(&mut self, max_degree: usize) -> Result<(usize, usize), String> {
|
||||||
|
// Build per-node degree counts (for Link pruning priority)
|
||||||
|
let mut node_degree: HashMap<String, usize> = HashMap::new();
|
||||||
|
for rel in &self.relations {
|
||||||
|
if rel.deleted { continue; }
|
||||||
|
*node_degree.entry(rel.source_key.clone()).or_default() += 1;
|
||||||
|
*node_degree.entry(rel.target_key.clone()).or_default() += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build per-node edge lists
|
||||||
|
let mut node_edges: HashMap<String, Vec<usize>> = HashMap::new();
|
||||||
|
for (i, rel) in self.relations.iter().enumerate() {
|
||||||
|
if rel.deleted { continue; }
|
||||||
|
node_edges.entry(rel.source_key.clone()).or_default().push(i);
|
||||||
|
node_edges.entry(rel.target_key.clone()).or_default().push(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut to_delete: HashSet<usize> = HashSet::new();
|
||||||
|
let mut hubs_capped = 0;
|
||||||
|
|
||||||
|
for (_key, edge_indices) in &node_edges {
|
||||||
|
let active: Vec<usize> = edge_indices.iter()
|
||||||
|
.filter(|&&i| !to_delete.contains(&i))
|
||||||
|
.copied()
|
||||||
|
.collect();
|
||||||
|
if active.len() <= max_degree { continue; }
|
||||||
|
|
||||||
|
// Phase 1: prune Auto edges (weakest first)
|
||||||
|
let mut auto_indices: Vec<(usize, f32)> = Vec::new();
|
||||||
|
let mut link_indices: Vec<(usize, usize)> = Vec::new(); // (idx, other_degree)
|
||||||
|
for &i in &active {
|
||||||
|
let rel = &self.relations[i];
|
||||||
|
if rel.rel_type == RelationType::Auto {
|
||||||
|
auto_indices.push((i, rel.strength));
|
||||||
|
} else {
|
||||||
|
// For Link/Causal, rank by other endpoint's degree
|
||||||
|
let other = if &rel.source_key == _key {
|
||||||
|
&rel.target_key
|
||||||
|
} else {
|
||||||
|
&rel.source_key
|
||||||
|
};
|
||||||
|
let other_deg = node_degree.get(other).copied().unwrap_or(0);
|
||||||
|
link_indices.push((i, other_deg));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let excess = active.len() - max_degree;
|
||||||
|
|
||||||
|
// Sort Auto by strength ascending
|
||||||
|
auto_indices.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
||||||
|
let auto_prune = excess.min(auto_indices.len());
|
||||||
|
for &(i, _) in auto_indices.iter().take(auto_prune) {
|
||||||
|
to_delete.insert(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2: if still over cap, prune Link edges to high-degree targets
|
||||||
|
let remaining_excess = excess.saturating_sub(auto_prune);
|
||||||
|
if remaining_excess > 0 {
|
||||||
|
// Sort by other endpoint degree descending (prune links
|
||||||
|
// to well-connected nodes first — they have alternative paths)
|
||||||
|
link_indices.sort_by(|a, b| b.1.cmp(&a.1));
|
||||||
|
let link_prune = remaining_excess.min(link_indices.len());
|
||||||
|
for &(i, _) in link_indices.iter().take(link_prune) {
|
||||||
|
to_delete.insert(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hubs_capped += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply deletions
|
||||||
|
let mut pruned_rels = Vec::new();
|
||||||
|
for &i in &to_delete {
|
||||||
|
self.relations[i].deleted = true;
|
||||||
|
self.relations[i].version += 1;
|
||||||
|
pruned_rels.push(self.relations[i].clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
if !pruned_rels.is_empty() {
|
||||||
|
self.append_relations(&pruned_rels)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok((hubs_capped, to_delete.len()))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn category_counts(&self) -> HashMap<&str, usize> {
|
pub fn category_counts(&self) -> HashMap<&str, usize> {
|
||||||
let mut counts = HashMap::new();
|
let mut counts = HashMap::new();
|
||||||
for node in self.nodes.values() {
|
for node in self.nodes.values() {
|
||||||
|
|
|
||||||
|
|
@ -922,6 +922,27 @@ pub fn consolidate_full(store: &mut Store) -> Result<(), String> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Step 3b: Link orphans ---
|
||||||
|
log.write("\n--- Step 3b: Link orphans ---")?;
|
||||||
|
println!("\n--- Linking orphan nodes ---");
|
||||||
|
*store = Store::load()?;
|
||||||
|
|
||||||
|
let (lo_orphans, lo_added) = neuro::link_orphans(store, 2, 3, 0.15);
|
||||||
|
log.write(&format!(" {} orphans, {} links added", lo_orphans, lo_added))?;
|
||||||
|
|
||||||
|
// --- Step 3c: Cap degree ---
|
||||||
|
log.write("\n--- Step 3c: Cap degree ---")?;
|
||||||
|
println!("\n--- Capping node degree ---");
|
||||||
|
*store = Store::load()?;
|
||||||
|
|
||||||
|
match store.cap_degree(50) {
|
||||||
|
Ok((hubs, pruned)) => {
|
||||||
|
store.save()?;
|
||||||
|
log.write(&format!(" {} hubs capped, {} edges pruned", hubs, pruned))?;
|
||||||
|
}
|
||||||
|
Err(e) => log.write(&format!(" ERROR: {}", e))?,
|
||||||
|
}
|
||||||
|
|
||||||
// --- Step 4: Digest auto ---
|
// --- Step 4: Digest auto ---
|
||||||
log.write("\n--- Step 4: Digest auto ---")?;
|
log.write("\n--- Step 4: Digest auto ---")?;
|
||||||
println!("\n--- Generating missing digests ---");
|
println!("\n--- Generating missing digests ---");
|
||||||
|
|
|
||||||
|
|
@ -421,6 +421,11 @@ fn label_propagation(
|
||||||
adj: &HashMap<String, Vec<Edge>>,
|
adj: &HashMap<String, Vec<Edge>>,
|
||||||
max_iterations: u32,
|
max_iterations: u32,
|
||||||
) -> HashMap<String, u32> {
|
) -> HashMap<String, u32> {
|
||||||
|
// Only consider edges above this strength for community votes.
|
||||||
|
// Weak auto-links from triangle closure (0.15-0.35) bridge
|
||||||
|
// unrelated clusters — filtering them lets natural communities emerge.
|
||||||
|
let min_strength: f32 = 0.3;
|
||||||
|
|
||||||
// Initialize: each node gets its own label
|
// Initialize: each node gets its own label
|
||||||
let key_vec: Vec<String> = keys.iter().cloned().collect();
|
let key_vec: Vec<String> = keys.iter().cloned().collect();
|
||||||
let mut labels: HashMap<String, u32> = key_vec.iter()
|
let mut labels: HashMap<String, u32> = key_vec.iter()
|
||||||
|
|
@ -438,9 +443,10 @@ fn label_propagation(
|
||||||
};
|
};
|
||||||
if edges.is_empty() { continue; }
|
if edges.is_empty() { continue; }
|
||||||
|
|
||||||
// Count weighted votes for each label
|
// Count weighted votes for each label (skip weak edges)
|
||||||
let mut votes: HashMap<u32, f32> = HashMap::new();
|
let mut votes: HashMap<u32, f32> = HashMap::new();
|
||||||
for edge in edges {
|
for edge in edges {
|
||||||
|
if edge.strength < min_strength { continue; }
|
||||||
if let Some(&label) = labels.get(&edge.target) {
|
if let Some(&label) = labels.get(&edge.target) {
|
||||||
*votes.entry(label).or_default() += edge.strength;
|
*votes.entry(label).or_default() += edge.strength;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
35
src/main.rs
35
src/main.rs
|
|
@ -76,6 +76,9 @@ fn main() {
|
||||||
"wrong" => cmd_wrong(&args[2..]),
|
"wrong" => cmd_wrong(&args[2..]),
|
||||||
"gap" => cmd_gap(&args[2..]),
|
"gap" => cmd_gap(&args[2..]),
|
||||||
"categorize" => cmd_categorize(&args[2..]),
|
"categorize" => cmd_categorize(&args[2..]),
|
||||||
|
"fix-categories" => cmd_fix_categories(),
|
||||||
|
"cap-degree" => cmd_cap_degree(&args[2..]),
|
||||||
|
"link-orphans" => cmd_link_orphans(&args[2..]),
|
||||||
"decay" => cmd_decay(),
|
"decay" => cmd_decay(),
|
||||||
"consolidate-batch" => cmd_consolidate_batch(&args[2..]),
|
"consolidate-batch" => cmd_consolidate_batch(&args[2..]),
|
||||||
"log" => cmd_log(),
|
"log" => cmd_log(),
|
||||||
|
|
@ -325,6 +328,38 @@ fn cmd_categorize(args: &[String]) -> Result<(), String> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cmd_fix_categories() -> Result<(), String> {
|
||||||
|
let mut store = capnp_store::Store::load()?;
|
||||||
|
let before = format!("{:?}", store.category_counts());
|
||||||
|
let (changed, kept) = store.fix_categories()?;
|
||||||
|
store.save()?;
|
||||||
|
let after = format!("{:?}", store.category_counts());
|
||||||
|
println!("Category fix: {} changed, {} kept", changed, kept);
|
||||||
|
println!("\nBefore: {}", before);
|
||||||
|
println!("After: {}", after);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cmd_link_orphans(args: &[String]) -> Result<(), String> {
|
||||||
|
let min_deg: usize = args.first().and_then(|s| s.parse().ok()).unwrap_or(2);
|
||||||
|
let links_per: usize = args.get(1).and_then(|s| s.parse().ok()).unwrap_or(3);
|
||||||
|
let sim_thresh: f32 = args.get(2).and_then(|s| s.parse().ok()).unwrap_or(0.15);
|
||||||
|
let mut store = capnp_store::Store::load()?;
|
||||||
|
let (orphans, links) = neuro::link_orphans(&mut store, min_deg, links_per, sim_thresh);
|
||||||
|
println!("Linked {} orphans, added {} connections (min_degree={}, links_per={}, sim>{})",
|
||||||
|
orphans, links, min_deg, links_per, sim_thresh);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn cmd_cap_degree(args: &[String]) -> Result<(), String> {
|
||||||
|
let max_deg: usize = args.first().and_then(|s| s.parse().ok()).unwrap_or(50);
|
||||||
|
let mut store = capnp_store::Store::load()?;
|
||||||
|
let (hubs, pruned) = store.cap_degree(max_deg)?;
|
||||||
|
store.save()?;
|
||||||
|
println!("Capped {} hubs, pruned {} weak Auto edges (max_degree={})", hubs, pruned, max_deg);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn cmd_decay() -> Result<(), String> {
|
fn cmd_decay() -> Result<(), String> {
|
||||||
let mut store = capnp_store::Store::load()?;
|
let mut store = capnp_store::Store::load()?;
|
||||||
let (decayed, pruned) = store.decay();
|
let (decayed, pruned) = store.decay();
|
||||||
|
|
|
||||||
76
src/neuro.rs
76
src/neuro.rs
|
|
@ -1059,3 +1059,79 @@ pub fn triangle_close(
|
||||||
}
|
}
|
||||||
(hubs_processed, added)
|
(hubs_processed, added)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Link orphan nodes (degree < min_degree) to their most textually similar
|
||||||
|
/// connected nodes. For each orphan, finds top-K nearest neighbors by
|
||||||
|
/// cosine similarity and creates Auto links.
|
||||||
|
/// Returns (orphans_linked, total_links_added).
|
||||||
|
pub fn link_orphans(
|
||||||
|
store: &mut Store,
|
||||||
|
min_degree: usize,
|
||||||
|
links_per_orphan: usize,
|
||||||
|
sim_threshold: f32,
|
||||||
|
) -> (usize, usize) {
|
||||||
|
let graph = store.build_graph();
|
||||||
|
let mut added = 0usize;
|
||||||
|
let mut orphans_linked = 0usize;
|
||||||
|
|
||||||
|
// Separate orphans from connected nodes
|
||||||
|
let orphans: Vec<String> = graph.nodes().iter()
|
||||||
|
.filter(|k| graph.degree(k) < min_degree)
|
||||||
|
.cloned()
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Build candidate pool: connected nodes with their content
|
||||||
|
let candidates: Vec<(String, String)> = graph.nodes().iter()
|
||||||
|
.filter(|k| graph.degree(k) >= min_degree)
|
||||||
|
.filter_map(|k| store.nodes.get(k).map(|n| (k.clone(), n.content.clone())))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if candidates.is_empty() { return (0, 0); }
|
||||||
|
|
||||||
|
for orphan_key in &orphans {
|
||||||
|
let orphan_content = match store.nodes.get(orphan_key) {
|
||||||
|
Some(n) => n.content.clone(),
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
if orphan_content.len() < 20 { continue; } // skip near-empty nodes
|
||||||
|
|
||||||
|
// Score against all candidates
|
||||||
|
let mut scores: Vec<(usize, f32)> = candidates.iter()
|
||||||
|
.enumerate()
|
||||||
|
.map(|(i, (_, content))| {
|
||||||
|
(i, similarity::cosine_similarity(&orphan_content, content))
|
||||||
|
})
|
||||||
|
.filter(|(_, s)| *s >= sim_threshold)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
|
||||||
|
let to_link = scores.len().min(links_per_orphan);
|
||||||
|
if to_link == 0 { continue; }
|
||||||
|
|
||||||
|
let orphan_uuid = store.nodes.get(orphan_key).unwrap().uuid;
|
||||||
|
|
||||||
|
for &(idx, sim) in scores.iter().take(to_link) {
|
||||||
|
let target_key = &candidates[idx].0;
|
||||||
|
let target_uuid = match store.nodes.get(target_key) {
|
||||||
|
Some(n) => n.uuid,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
let rel = Store::new_relation(
|
||||||
|
orphan_uuid, target_uuid,
|
||||||
|
crate::capnp_store::RelationType::Auto,
|
||||||
|
sim * 0.5,
|
||||||
|
orphan_key, target_key,
|
||||||
|
);
|
||||||
|
if store.add_relation(rel).is_ok() {
|
||||||
|
added += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
orphans_linked += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if added > 0 {
|
||||||
|
let _ = store.save();
|
||||||
|
}
|
||||||
|
(orphans_linked, added)
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue