graph health: fix-categories, cap-degree, link-orphans

Three new tools for structural graph health:

- fix-categories: rule-based recategorization fixing core inflation
  (225 → 26 core nodes). Only identity.md and kent.md stay core;
  everything else reclassified to tech/obs/gen by file prefix rules.

- cap-degree: two-phase degree capping. First prunes weakest Auto
  edges, then prunes Link edges to high-degree targets (they have
  alternative paths). Brought max degree from 919 → 50.

- link-orphans: connects degree-0/1 nodes to most textually similar
  connected nodes via cosine similarity. Linked 614 orphans.

Also: community detection now filters edges below strength 0.3,
preventing weak auto-links from merging unrelated communities.

Pipeline updated: consolidate-full now runs link-orphans + cap-degree
instead of triangle-close (which was counterproductive — densified
hub neighborhoods instead of building bridges).

Net effect: Gini 0.754 → 0.546, max degree 919 → 50.
This commit is contained in:
ProofOfConcept 2026-03-01 08:18:07 -05:00
parent 6c7bfb9ec4
commit 94dbca6018
5 changed files with 297 additions and 2 deletions

View file

@ -17,7 +17,7 @@ use regex::Regex;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::env;
use std::fs;
use std::io::{BufReader, BufWriter, Write as IoWrite};
@ -918,6 +918,163 @@ impl Store {
(decayed, pruned)
}
/// Bulk recategorize nodes using rule-based logic.
///
/// Only nodes currently categorized as `Category::Core` are considered.
/// The owning file is the part of the node key before the first `#`:
///
/// - files listed as core keep `Category::Core`;
/// - listed tech files and `design-*` files become `Category::Technical`;
/// - listed obs files and `skill-*` / `worked-example-*` files become
///   `Category::Observation`;
/// - every other core node becomes `Category::General`.
///
/// Each recategorized node has its `version` bumped, and only those
/// nodes are passed to `append_nodes` (untouched nodes are not re-appended).
///
/// Returns (changed, unchanged) counts.
///
/// # Errors
///
/// Propagates the error returned by `append_nodes`. The in-memory
/// categories have already been updated when that happens.
pub fn fix_categories(&mut self) -> Result<(usize, usize), String> {
    // Files that should stay core (identity-defining)
    const CORE_FILES: [&str; 2] = ["identity.md", "kent.md"];
    // Files that should be tech
    const TECH_FILES: [&str; 4] = [
        "language-theory.md",
        "zoom-navigation.md",
        "rust-conversion.md",
        "poc-architecture.md",
    ];
    const TECH_PREFIXES: [&str; 1] = ["design-"];
    // Files that should be obs (self-observation, skills, reflections)
    const OBS_FILES: [&str; 9] = [
        "reflections.md",
        "reflections-zoom.md",
        "differentiation.md",
        "cognitive-modes.md",
        "paper-notes.md",
        "inner-life.md",
        "conversation.md",
        "interests.md",
        "stuck-toolkit.md",
    ];
    const OBS_PREFIXES: [&str; 2] = ["skill-", "worked-example-"];

    // Snapshots of exactly the nodes modified below; these are the only
    // records that need to be appended.
    let mut updated: Vec<Node> = Vec::new();
    let mut unchanged = 0;

    for (key, node) in &mut self.nodes {
        if node.category != Category::Core {
            unchanged += 1;
            continue;
        }

        // Determine what file this node belongs to
        let file = key.split('#').next().unwrap_or(key.as_str());

        let new_cat = if CORE_FILES.contains(&file) {
            None // keep as core
        } else if TECH_FILES.contains(&file)
            || TECH_PREFIXES.iter().any(|&p| file.starts_with(p))
        {
            Some(Category::Technical)
        } else if OBS_FILES.contains(&file)
            || OBS_PREFIXES.iter().any(|&p| file.starts_with(p))
        {
            Some(Category::Observation)
        } else {
            // Default: anything else that was core probably shouldn't be
            Some(Category::General)
        };

        match new_cat {
            Some(cat) => {
                node.category = cat;
                node.version += 1;
                updated.push(node.clone());
            }
            None => unchanged += 1,
        }
    }

    if !updated.is_empty() {
        self.append_nodes(&updated)?;
    }
    Ok((updated.len(), unchanged))
}
/// Cap node degree by soft-deleting edges from mega-hubs.
///
/// A node's degree is the number of non-deleted relations that name it
/// as source or target. For every node whose live degree exceeds
/// `max_degree`, the excess is pruned in two phases:
///
/// 1. `Auto` edges, weakest `strength` first.
/// 2. If still over the cap, all other edge types (Link/Causal), starting
///    with those whose other endpoint has the highest degree (they have
///    alternative paths).
///
/// Endpoint degrees used for phase 2 are the degrees at entry; they are
/// not recomputed as edges are marked for deletion. Edges already marked
/// while capping an earlier hub do count as removed for later hubs.
///
/// Hubs are visited in sorted key order and deletions are applied in
/// ascending relation index order, so the same store always produces the
/// same result and the same appended records.
///
/// Pruned relations get `deleted = true` and a bumped `version`, and are
/// passed to `append_relations`.
///
/// Returns (hubs_capped, edges_pruned).
///
/// # Errors
///
/// Propagates the error returned by `append_relations`. The in-memory
/// relations have already been marked deleted when that happens.
pub fn cap_degree(&mut self, max_degree: usize) -> Result<(usize, usize), String> {
    // Planning phase: only reads `self.relations`. Scoped so the borrowed
    // key map is gone before the relations are mutated below.
    let (hubs_capped, doomed) = {
        // Per-node lists of live edge indices. The list length is the
        // node's degree, so no separate degree table is needed.
        let mut node_edges: HashMap<&str, Vec<usize>> = HashMap::new();
        for (i, rel) in self.relations.iter().enumerate() {
            if rel.deleted {
                continue;
            }
            node_edges.entry(rel.source_key.as_str()).or_default().push(i);
            node_edges.entry(rel.target_key.as_str()).or_default().push(i);
        }

        // HashMap iteration order is arbitrary and the outcome depends on
        // which hub is processed first, so fix the order.
        let mut hub_keys: Vec<&str> = node_edges.keys().copied().collect();
        hub_keys.sort_unstable();

        let mut to_delete: HashSet<usize> = HashSet::new();
        let mut hubs_capped = 0;

        for key in hub_keys {
            // Edges not already claimed while capping an earlier hub.
            let active: Vec<usize> = node_edges[key]
                .iter()
                .copied()
                .filter(|i| !to_delete.contains(i))
                .collect();
            if active.len() <= max_degree {
                continue;
            }
            let excess = active.len() - max_degree;

            let mut auto_edges: Vec<(usize, f32)> = Vec::new(); // (idx, strength)
            let mut link_edges: Vec<(usize, usize)> = Vec::new(); // (idx, other_degree)
            for &i in &active {
                let rel = &self.relations[i];
                if rel.rel_type == RelationType::Auto {
                    auto_edges.push((i, rel.strength));
                } else {
                    // For Link/Causal, rank by other endpoint's degree
                    let other = if rel.source_key.as_str() == key {
                        &rel.target_key
                    } else {
                        &rel.source_key
                    };
                    let other_deg = node_edges.get(other.as_str()).map_or(0, Vec::len);
                    link_edges.push((i, other_deg));
                }
            }

            // Phase 1: prune Auto edges, weakest first. `total_cmp` gives a
            // total order, so a NaN strength cannot panic the sort. The
            // sort is stable: equal strengths keep ascending index order.
            auto_edges.sort_by(|a, b| a.1.total_cmp(&b.1));
            let auto_prune = excess.min(auto_edges.len());
            to_delete.extend(auto_edges.iter().take(auto_prune).map(|&(i, _)| i));

            // Phase 2: if still over cap, prune Link edges to high-degree
            // targets first — they have alternative paths.
            let remaining_excess = excess - auto_prune;
            if remaining_excess > 0 {
                link_edges.sort_by(|a, b| b.1.cmp(&a.1));
                to_delete.extend(link_edges.iter().take(remaining_excess).map(|&(i, _)| i));
            }

            hubs_capped += 1;
        }

        let mut doomed: Vec<usize> = to_delete.into_iter().collect();
        doomed.sort_unstable();
        (hubs_capped, doomed)
    };

    // Apply deletions
    let mut pruned_rels = Vec::with_capacity(doomed.len());
    for &i in &doomed {
        let rel = &mut self.relations[i];
        rel.deleted = true;
        rel.version += 1;
        pruned_rels.push(rel.clone());
    }
    if !pruned_rels.is_empty() {
        self.append_relations(&pruned_rels)?;
    }
    Ok((hubs_capped, doomed.len()))
}
pub fn category_counts(&self) -> HashMap<&str, usize> {
let mut counts = HashMap::new();
for node in self.nodes.values() {