consciousness/src/graph.rs
ProofOfConcept 635da6d3e2 split capnp_store.rs into src/store/ module hierarchy
capnp_store.rs (1772 lines) → four focused modules:
  store/types.rs  — types, macros, constants, path helpers
  store/parse.rs  — markdown parsing (MemoryUnit, parse_units)
  store/view.rs   — StoreView trait, MmapView, AnyView
  store/mod.rs    — Store impl methods, re-exports

new_node/new_relation become free functions in types.rs.
All callers updated: capnp_store:: → store::
2026-03-03 12:56:15 -05:00

662 lines
22 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Graph algorithms: clustering coefficient, community detection (label
// propagation), schema fit scoring, small-world metrics, consolidation
// priority scoring.
//
// The Graph is built from the Store's nodes + relations. Edges are
// undirected for clustering/community (even causal edges count as
// connections), but relation type and direction are preserved for
// specific queries.
use crate::store::{Store, RelationType, StoreView};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet, VecDeque};
/// Weighted edge in the graph.
///
/// One `Edge` is one entry in a node's adjacency list. `build_adjacency`
/// pushes an entry into the list of *both* endpoints for every relation, so
/// each relation is represented by two `Edge` values pointing at each other.
#[derive(Clone, Debug)]
pub struct Edge {
// Key of the node at the other end of this adjacency entry.
pub target: String,
// Relation strength, copied unchanged from the store.
pub strength: f32,
// Relation type, copied unchanged from the store.
pub rel_type: RelationType,
}
/// The in-memory graph built from store nodes + relations.
///
/// Constructed by `build_graph` (with community labels) or
/// `build_graph_fast` (with an empty `communities` map).
pub struct Graph {
/// Adjacency list: node key → list of edges.
/// Nodes without any relation have no entry here at all.
adj: HashMap<String, Vec<Edge>>,
/// All node keys, including nodes that have no edges.
keys: HashSet<String>,
/// Community labels (from label propagation): node key → community id.
communities: HashMap<String, u32>,
}
impl Graph {
pub fn nodes(&self) -> &HashSet<String> {
&self.keys
}
/// Number of adjacency entries stored for `key`; 0 when the key has no
/// entry in the adjacency list.
pub fn degree(&self, key: &str) -> usize {
    match self.adj.get(key) {
        Some(edges) => edges.len(),
        None => 0,
    }
}
/// Number of undirected edges: every edge is stored once per endpoint, so
/// the total number of adjacency entries is halved.
pub fn edge_count(&self) -> usize {
    let mut endpoint_entries = 0usize;
    for edges in self.adj.values() {
        endpoint_entries += edges.len();
    }
    endpoint_entries / 2
}
/// Full edge records (target, strength and relation type) leaving `key`.
/// Returns an empty slice when the key has no adjacency entry.
pub fn edges_of(&self, key: &str) -> &[Edge] {
    match self.adj.get(key) {
        Some(edges) => edges.as_slice(),
        None => &[],
    }
}
/// Neighbor keys paired with the strength of the connecting edge, in
/// adjacency-list order. Empty when the key has no adjacency entry.
pub fn neighbors(&self, key: &str) -> Vec<(&String, f32)> {
    let edges = match self.adj.get(key) {
        Some(edges) => edges,
        None => return Vec::new(),
    };
    let mut pairs = Vec::with_capacity(edges.len());
    for edge in edges {
        pairs.push((&edge.target, edge.strength));
    }
    pairs
}
/// The distinct neighbor keys of `key` (duplicates from parallel edges
/// collapse because the result is a set).
pub fn neighbor_keys(&self, key: &str) -> HashSet<&str> {
    let mut targets = HashSet::new();
    if let Some(edges) = self.adj.get(key) {
        targets.extend(edges.iter().map(|edge| edge.target.as_str()));
    }
    targets
}
/// Number of distinct community labels currently assigned.
pub fn community_count(&self) -> usize {
    self.communities
        .values()
        .copied()
        .collect::<HashSet<u32>>()
        .len()
}
pub fn communities(&self) -> &HashMap<String, u32> {
&self.communities
}
/// Hub degree threshold: the degree found at the 95% position of the
/// ascending degree list (i.e. the top 5% by degree).
///
/// With fewer than 20 nodes the percentile is not meaningful, so
/// `usize::MAX` is returned and effectively no node counts as a hub.
pub fn hub_threshold(&self) -> usize {
    let node_count = self.keys.len();
    if node_count < 20 {
        return usize::MAX;
    }
    let mut by_degree: Vec<usize> = Vec::with_capacity(node_count);
    for key in &self.keys {
        by_degree.push(self.degree(key));
    }
    by_degree.sort_unstable();
    by_degree[node_count * 95 / 100]
}
/// Local clustering coefficient: fraction of pairs of a node's neighbors
/// that are themselves connected.
///
/// cc(v) = 2E / (deg * (deg - 1)), where `deg` is the number of *distinct*
/// neighbors and `E` the number of linked neighbor pairs. Returns 0.0 for
/// nodes with fewer than two distinct neighbors.
pub fn clustering_coefficient(&self, key: &str) -> f32 {
    let neighbors = self.neighbor_keys(key);
    let deg = neighbors.len();
    if deg < 2 {
        return 0.0;
    }
    let neighbor_vec: Vec<&str> = neighbors.iter().copied().collect();
    let mut linked_pairs = 0u32;
    for (i, &first) in neighbor_vec.iter().enumerate() {
        // The neighbor set of `first` does not depend on the inner loop,
        // so build it once per outer iteration instead of once per pair.
        let first_neighbors = self.neighbor_keys(first);
        for &second in &neighbor_vec[i + 1..] {
            if first_neighbors.contains(second) {
                linked_pairs += 1;
            }
        }
    }
    (2.0 * linked_pairs as f32) / (deg as f32 * (deg as f32 - 1.0))
}
/// Mean clustering coefficient over the nodes whose degree is at least 2.
/// Returns 0.0 when no node qualifies.
pub fn avg_clustering_coefficient(&self) -> f32 {
    let (sum, count) = self
        .keys
        .iter()
        .filter(|key| self.degree(key) >= 2)
        .fold((0.0f32, 0u32), |(sum, count), key| {
            (sum + self.clustering_coefficient(key), count + 1)
        });
    if count == 0 {
        return 0.0;
    }
    sum / count as f32
}
/// Average shortest path length, estimated by running a BFS from up to
/// 100 nodes (the first 100 yielded by the key set's iterator).
///
/// Only reachable pairs at distance > 0 contribute. Returns 0.0 when there
/// is no such pair.
pub fn avg_path_length(&self) -> f32 {
    let mut total_dist = 0u64;
    let mut total_pairs = 0u64;
    for start in self.keys.iter().take(100) {
        let dists = self.bfs_distances(start);
        for hops in dists.values().copied().filter(|&d| d > 0) {
            total_dist += hops as u64;
            total_pairs += 1;
        }
    }
    if total_pairs == 0 {
        return 0.0;
    }
    total_dist as f32 / total_pairs as f32
}
/// Breadth-first search from `start`: maps every reachable node key
/// (including `start` itself, at distance 0) to its hop count.
fn bfs_distances(&self, start: &str) -> HashMap<String, u32> {
    let mut dist: HashMap<String, u32> = HashMap::new();
    let mut frontier: VecDeque<String> = VecDeque::new();
    dist.insert(start.to_owned(), 0);
    frontier.push_back(start.to_owned());
    while let Some(current) = frontier.pop_front() {
        let hops_here = dist[&current];
        for neighbor in self.neighbor_keys(&current) {
            if dist.contains_key(neighbor) {
                // Already reached by a path that is at least as short.
                continue;
            }
            dist.insert(neighbor.to_owned(), hops_here + 1);
            frontier.push_back(neighbor.to_owned());
        }
    }
    dist
}
/// Power-law exponent α of the degree distribution.
///
/// Maximum-likelihood estimate: α = 1 + n / Σ ln(k_i / (k_min - 0.5)),
/// taken over nodes with at least one edge.
///   α ≈ 2: extreme hub dominance (fragile)
///   α ≈ 3: healthy scale-free
///   α > 3: approaching random graph (egalitarian)
///
/// Returns 0.0 when fewer than 10 connected nodes exist or the estimate
/// is undefined.
pub fn degree_power_law_exponent(&self) -> f32 {
    // Isolated nodes carry no information about the distribution tail.
    let mut connected: Vec<usize> = Vec::new();
    for key in &self.keys {
        let d = self.degree(key);
        if d > 0 {
            connected.push(d);
        }
    }
    if connected.len() < 10 {
        return 0.0;
    }
    // Sorted ascending: the first element is k_min, and the sum below is
    // accumulated in a fixed order.
    connected.sort_unstable();
    let k_min = connected[0] as f64;
    if k_min < 1.0 {
        return 0.0;
    }
    let shifted_min = k_min - 0.5;
    let log_sum: f64 = connected
        .iter()
        .map(|&k| (k as f64 / shifted_min).ln())
        .sum();
    if log_sum <= 0.0 {
        return 0.0;
    }
    let sample_size = connected.len() as f64;
    (1.0 + sample_size / log_sum) as f32
}
/// Gini coefficient of the degree distribution.
///
/// 0 = perfectly egalitarian (all nodes same degree)
/// 1 = maximally unequal (one node has all edges)
/// Measures hub concentration independent of distribution shape.
pub fn degree_gini(&self) -> f32 {
let mut degrees: Vec<f64> = self.keys.iter()
.map(|k| self.degree(k) as f64)
.collect();
let n = degrees.len();
if n < 2 { return 0.0; }
degrees.sort_by(|a, b| a.total_cmp(b));
let mean = degrees.iter().sum::<f64>() / n as f64;
if mean < 1e-10 { return 0.0; }
// Gini = (2 Σ i·x_i) / (n Σ x_i) - (n+1)/n
let weighted_sum: f64 = degrees.iter().enumerate()
.map(|(i, &d)| (i as f64 + 1.0) * d)
.sum();
let total = degrees.iter().sum::<f64>();
let gini = (2.0 * weighted_sum) / (n as f64 * total) - (n as f64 + 1.0) / n as f64;
gini.max(0.0) as f32
}
/// Small-world coefficient σ = (C/C_rand) / (L/L_rand)
/// C_rand ≈ <k>/n, L_rand ≈ ln(n)/ln(<k>)
///
/// Returns 0.0 whenever σ is not computable: fewer than 10 nodes, average
/// degree ≤ 1 (ln(<k>) would be zero or negative, making L_rand infinite
/// or negative), a degenerate baseline, or a non-finite result. The value
/// is therefore always finite.
pub fn small_world_sigma(&self) -> f32 {
    let n = self.keys.len() as f32;
    if n < 10.0 {
        return 0.0;
    }
    let avg_degree = self.adj.values()
        .map(|e| e.len() as f32)
        .sum::<f32>() / n;
    // `<=` rather than `<`: at exactly 1.0, ln(<k>) == 0 and L_rand
    // becomes infinite, which used to yield inf or NaN (0/0 when C == 0).
    if avg_degree <= 1.0 {
        return 0.0;
    }
    let c = self.avg_clustering_coefficient();
    let l = self.avg_path_length();
    let c_rand = avg_degree / n;
    let l_rand = n.ln() / avg_degree.ln();
    if c_rand < 1e-10 || l_rand < 1e-10 || l < 1e-10 {
        return 0.0;
    }
    let sigma = (c / c_rand) / (l / l_rand);
    if sigma.is_finite() { sigma } else { 0.0 }
}
}
/// Impact of adding a hypothetical edge, as computed by
/// `Graph::link_impact`. Nothing in the graph is modified to produce it.
#[derive(Debug)]
pub struct LinkImpact {
/// Key of the first endpoint, as passed by the caller.
pub source: String,
/// Key of the second endpoint, as passed by the caller.
pub target: String,
/// Degree of `source` before the edge is added.
pub source_deg: usize,
/// Degree of `target` before the edge is added.
pub target_deg: usize,
/// Is this a hub link? (either endpoint's degree reaches the hub
/// threshold, i.e. the top 5% by degree)
pub is_hub_link: bool,
/// Are both endpoints in the same community? False when either endpoint
/// has no community label.
pub same_community: bool,
/// Change in clustering coefficient for source
pub delta_cc_source: f32,
/// Change in clustering coefficient for target
pub delta_cc_target: f32,
/// Change in degree Gini (positive = more hub-dominated)
pub delta_gini: f32,
/// Qualitative assessment: one of a fixed set of static descriptions.
pub assessment: &'static str,
}
impl Graph {
/// Simulate adding an edge and report impact on topology metrics.
///
/// Doesn't modify the graph — computes what would change if the
/// edge were added.
pub fn link_impact(&self, source: &str, target: &str) -> LinkImpact {
let source_deg = self.degree(source);
let target_deg = self.degree(target);
let hub_threshold = self.hub_threshold();
let is_hub_link = source_deg >= hub_threshold || target_deg >= hub_threshold;
// Community check
let sc = self.communities.get(source);
let tc = self.communities.get(target);
let same_community = match (sc, tc) {
(Some(a), Some(b)) => a == b,
_ => false,
};
// CC change for source: adding target as neighbor changes the
// triangle count. New triangles form for each node that's a
// neighbor of BOTH source and target.
let source_neighbors = self.neighbor_keys(source);
let target_neighbors = self.neighbor_keys(target);
let shared_neighbors = source_neighbors.intersection(&target_neighbors).count();
let cc_before_source = self.clustering_coefficient(source);
let cc_before_target = self.clustering_coefficient(target);
// Estimate new CC for source after adding edge
let new_source_deg = source_deg + 1;
let new_source_triangles = if source_deg >= 2 {
// Current triangles + new ones from shared neighbors
let current_triangles = (cc_before_source
* source_deg as f32 * (source_deg as f32 - 1.0) / 2.0) as u32;
current_triangles + shared_neighbors as u32
} else {
shared_neighbors as u32
};
let cc_after_source = if new_source_deg >= 2 {
(2.0 * new_source_triangles as f32)
/ (new_source_deg as f32 * (new_source_deg as f32 - 1.0))
} else {
0.0
};
let new_target_deg = target_deg + 1;
let new_target_triangles = if target_deg >= 2 {
let current_triangles = (cc_before_target
* target_deg as f32 * (target_deg as f32 - 1.0) / 2.0) as u32;
current_triangles + shared_neighbors as u32
} else {
shared_neighbors as u32
};
let cc_after_target = if new_target_deg >= 2 {
(2.0 * new_target_triangles as f32)
/ (new_target_deg as f32 * (new_target_deg as f32 - 1.0))
} else {
0.0
};
// Gini change via influence function:
// IF(x; Gini, F) = (2F(x) - 1) * x/μ - Gini - 1
// Adding an edge increments two degrees. The net ΔGini is the sum
// of influence contributions from both endpoints shifting up by 1.
let gini_before = self.degree_gini();
let n = self.keys.len();
let total_degree: f64 = self.keys.iter()
.map(|k| self.degree(k) as f64)
.sum();
let mean_deg = if n > 0 { total_degree / n as f64 } else { 1.0 };
// CDF at each endpoint's degree: fraction of nodes with degree ≤ d
let delta_gini = if mean_deg > 1e-10 && n >= 2 {
// Count nodes with degree ≤ source_deg and ≤ target_deg
let f_source = self.keys.iter()
.filter(|k| self.degree(k) <= source_deg)
.count() as f64 / n as f64;
let f_target = self.keys.iter()
.filter(|k| self.degree(k) <= target_deg)
.count() as f64 / n as f64;
// Influence of incrementing source's degree by 1
let new_source = (source_deg + 1) as f64;
let if_source = (2.0 * f_source - 1.0) * new_source / mean_deg
- gini_before as f64 - 1.0;
// Influence of incrementing target's degree by 1
let new_target = (target_deg + 1) as f64;
let if_target = (2.0 * f_target - 1.0) * new_target / mean_deg
- gini_before as f64 - 1.0;
// Scale: each point contributes 1/n to the distribution
((if_source + if_target) / n as f64) as f32
} else {
0.0f32
};
// Qualitative assessment
let assessment = if is_hub_link && same_community {
"hub-reinforcing: strengthens existing star topology"
} else if is_hub_link && !same_community {
"hub-bridging: cross-community but through a hub"
} else if !is_hub_link && same_community && shared_neighbors > 0 {
"lateral-clustering: strengthens local mesh topology"
} else if !is_hub_link && !same_community {
"lateral-bridging: best kind — cross-community lateral link"
} else if !is_hub_link && same_community {
"lateral-local: connects peripheral nodes in same community"
} else {
"neutral"
};
LinkImpact {
source: source.to_string(),
target: target.to_string(),
source_deg,
target_deg,
is_hub_link,
same_community,
delta_cc_source: cc_after_source - cc_before_source,
delta_cc_target: cc_after_target - cc_before_target,
delta_gini,
assessment,
}
}
}
/// Build the full graph from a store view: adjacency, node keys, and
/// community labels from label propagation (at most 20 iterations).
pub fn build_graph(store: &impl StoreView) -> Graph {
    let (adjacency, node_keys) = build_adjacency(store);
    let labels = label_propagation(&node_keys, &adjacency, 20);
    Graph {
        adj: adjacency,
        keys: node_keys,
        communities: labels,
    }
}
/// Build the graph but skip community detection, leaving the community
/// map empty — for spreading-activation searches that only need the
/// adjacency list.
pub fn build_graph_fast(store: &impl StoreView) -> Graph {
    let (adjacency, node_keys) = build_adjacency(store);
    let no_communities = HashMap::new();
    Graph {
        adj: adjacency,
        keys: node_keys,
        communities: no_communities,
    }
}
/// Collect every node key from the store, then turn each relation whose
/// endpoints are both known nodes into a pair of mirrored adjacency
/// entries (one per endpoint). Relations touching an unknown key are
/// dropped.
fn build_adjacency(store: &impl StoreView) -> (HashMap<String, Vec<Edge>>, HashSet<String>) {
    let mut keys: HashSet<String> = HashSet::new();
    store.for_each_node(|key, _, _| {
        keys.insert(key.to_owned());
    });

    let mut adj: HashMap<String, Vec<Edge>> = HashMap::new();
    store.for_each_relation(|source_key, target_key, strength, rel_type| {
        let both_known = keys.contains(source_key) && keys.contains(target_key);
        if !both_known {
            return;
        }
        let forward = Edge {
            target: target_key.to_owned(),
            strength,
            rel_type,
        };
        let backward = Edge {
            target: source_key.to_owned(),
            strength,
            rel_type,
        };
        adj.entry(source_key.to_owned()).or_default().push(forward);
        adj.entry(target_key.to_owned()).or_default().push(backward);
    });

    (adj, keys)
}
/// Label propagation community detection.
///
/// Each node starts with its own label. Each iteration: adopt the label
/// with the highest total edge strength among neighbors. Iterate until no
/// label changes or `max_iterations` is reached.
///
/// The result is deterministic for a given input: nodes are visited in
/// sorted key order, vote ties are broken first in favour of the node's
/// current label (so a node never flips between equally good labels) and
/// then by the smaller label id, and the final compact ids `0..n` are
/// assigned in sorted key order.
///
/// Returns a map from every key in `keys` to its community id.
fn label_propagation(
    keys: &HashSet<String>,
    adj: &HashMap<String, Vec<Edge>>,
    max_iterations: u32,
) -> HashMap<String, u32> {
    // Only consider edges above this strength for community votes.
    // Weak auto-links from triangle closure (0.15-0.35) bridge
    // unrelated clusters — filtering them lets natural communities emerge.
    let min_strength: f32 = 0.3;

    // Hash-set iteration order varies between runs; sorting fixes both the
    // initial labels and the visiting order.
    let mut key_vec: Vec<&String> = keys.iter().collect();
    key_vec.sort_unstable();

    // Initialize: each node gets its own label
    let mut labels: HashMap<String, u32> = key_vec.iter()
        .enumerate()
        .map(|(i, &k)| (k.clone(), i as u32))
        .collect();

    for _iter in 0..max_iterations {
        let mut changed = false;
        for &key in &key_vec {
            let edges = match adj.get(key) {
                Some(e) if !e.is_empty() => e,
                _ => continue,
            };

            // Count weighted votes for each label (skip weak edges)
            let mut votes: HashMap<u32, f32> = HashMap::new();
            for edge in edges {
                if edge.strength < min_strength { continue; }
                if let Some(&label) = labels.get(&edge.target) {
                    *votes.entry(label).or_default() += edge.strength;
                }
            }

            // Adopt the label with most votes. The comparison is a strict
            // total order over the (unique) labels, so the winner does not
            // depend on the vote map's iteration order.
            let current = labels[key];
            let best = votes.iter()
                .map(|(&label, &weight)| (label, weight))
                .max_by(|a, b| {
                    a.1.total_cmp(&b.1)
                        .then_with(|| (a.0 == current).cmp(&(b.0 == current)))
                        .then_with(|| b.0.cmp(&a.0))
                });
            if let Some((best_label, _)) = best {
                if best_label != current {
                    labels.insert(key.clone(), best_label);
                    changed = true;
                }
            }
        }
        if !changed { break; }
    }

    // Compact labels to 0..n, numbering communities in sorted key order.
    let mut label_map: HashMap<u32, u32> = HashMap::new();
    for &key in &key_vec {
        if let Some(label) = labels.get_mut(key) {
            let next_id = label_map.len() as u32;
            *label = *label_map.entry(*label).or_insert(next_id);
        }
    }
    labels
}
/// A snapshot of graph topology metrics, for tracking evolution over time.
///
/// `health_report` builds one per call and appends it, serialized as one
/// JSON object per line, to the metrics log.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MetricsSnapshot {
// Time of the snapshot as returned by `crate::store::now_epoch()`
// (presumably seconds since the Unix epoch — confirm in `store`).
pub timestamp: f64,
// Human-readable rendering of `timestamp`.
pub date: String,
// Number of nodes in the graph.
pub nodes: usize,
// Number of undirected edges.
pub edges: usize,
// Number of distinct community labels.
pub communities: usize,
// Small-world coefficient σ.
pub sigma: f32,
// Power-law exponent α of the degree distribution.
pub alpha: f32,
// Gini coefficient of the degree distribution.
pub gini: f32,
// Average clustering coefficient.
pub avg_cc: f32,
// Sampled average shortest path length.
pub avg_path_length: f32,
// Removed: avg_schema_fit was identical to avg_cc.
// Old snapshots with the field still deserialize (serde ignores unknown fields by default).
}
/// Location of the metrics log: `$HOME/.claude/memory/metrics.jsonl`.
/// When `HOME` cannot be read the path is relative to the current
/// directory.
fn metrics_log_path() -> std::path::PathBuf {
    let home_dir = std::env::var("HOME").unwrap_or_default();
    let mut log_path = std::path::PathBuf::from(home_dir);
    log_path.push(".claude/memory/metrics.jsonl");
    log_path
}
/// Load previous metrics snapshots from the metrics log, oldest first.
///
/// Best effort: an unreadable file yields an empty list, and lines that
/// do not parse as a snapshot are skipped.
pub fn load_metrics_history() -> Vec<MetricsSnapshot> {
    match std::fs::read_to_string(metrics_log_path()) {
        Ok(content) => content
            .lines()
            .filter_map(|line| serde_json::from_str::<MetricsSnapshot>(line).ok())
            .collect(),
        Err(_) => Vec::new(),
    }
}
/// Append a metrics snapshot to the log as a single JSON line.
///
/// Best effort: serialization, open and write failures are all ignored.
pub fn save_metrics_snapshot(snap: &MetricsSnapshot) {
    use std::io::Write;

    let path = metrics_log_path();
    let json = match serde_json::to_string(snap) {
        Ok(json) => json,
        Err(_) => return,
    };
    let opened = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open(&path);
    if let Ok(mut log) = opened {
        let _ = writeln!(log, "{}", json);
    }
}
/// Health report: summary of graph metrics.
///
/// Computes size, degree, clustering, path-length, small-world, power-law,
/// Gini and community statistics for `graph`, plus the category breakdown
/// from `store`, and renders them as a multi-line text report.
///
/// Side effect: the current metrics are appended to the metrics log. The
/// deltas shown next to σ, α, Gini and the clustering coefficient compare
/// against the most recent snapshot that was in the log *before* this
/// call, so they stay correct even if the append fails. When at least
/// three snapshots exist (including the current one), the last five are
/// listed as a trend.
pub fn health_report(graph: &Graph, store: &Store) -> String {
    // Renders the change against the previous value as " (+0.012)".
    // Empty when there is no previous value or the change is below 0.001.
    fn delta(current: f32, prev: Option<f32>) -> String {
        match prev {
            Some(p) => {
                let d = current - p;
                if d.abs() < 0.001 { String::new() }
                else { format!(" ({:+.3})", d) }
            }
            None => String::new(),
        }
    }

    let n = graph.nodes().len();
    let e = graph.edge_count();
    let avg_cc = graph.avg_clustering_coefficient();
    let avg_pl = graph.avg_path_length();
    let sigma = graph.small_world_sigma();
    let communities = graph.community_count();

    // Community sizes, largest first
    let mut comm_sizes: HashMap<u32, usize> = HashMap::new();
    for label in graph.communities().values() {
        *comm_sizes.entry(*label).or_default() += 1;
    }
    let mut sizes: Vec<usize> = comm_sizes.values().copied().collect();
    sizes.sort_unstable_by(|a, b| b.cmp(a));

    // Degree distribution
    let mut degrees: Vec<usize> = graph.nodes().iter()
        .map(|k| graph.degree(k))
        .collect();
    degrees.sort_unstable();
    let max_deg = degrees.last().copied().unwrap_or(0);
    let median_deg = if degrees.is_empty() { 0 } else { degrees[degrees.len() / 2] };
    let avg_deg = if n == 0 { 0.0 } else {
        degrees.iter().sum::<usize>() as f64 / n as f64
    };

    // Topology metrics
    let alpha = graph.degree_power_law_exponent();
    let gini = graph.degree_gini();

    // Low-CC nodes: poorly integrated
    let low_cc = graph.nodes().iter()
        .filter(|k| graph.clustering_coefficient(k) < 0.1)
        .count();

    // Category breakdown
    let cats = store.category_counts();

    // Read the history first: its last entry is the previous run no
    // matter whether the append below succeeds.
    let mut history = load_metrics_history();
    let (sigma_d, alpha_d, gini_d, cc_d) = {
        let prev = history.last();
        (
            delta(sigma, prev.map(|p| p.sigma)),
            delta(alpha, prev.map(|p| p.alpha)),
            delta(gini, prev.map(|p| p.gini)),
            delta(avg_cc, prev.map(|p| p.avg_cc)),
        )
    };

    // Snapshot current metrics and log
    let now = crate::store::now_epoch();
    let date = crate::store::format_datetime_space(now);
    let snap = MetricsSnapshot {
        timestamp: now,
        date,
        nodes: n, edges: e, communities,
        sigma, alpha, gini, avg_cc,
        avg_path_length: avg_pl,
    };
    save_metrics_snapshot(&snap);
    history.push(snap);

    let mut report = format!(
"Memory Health Report
====================
Nodes: {n} Relations: {e} Communities: {communities}
Degree: max={max_deg} median={median_deg} avg={avg_deg:.1}
Clustering coefficient (avg): {avg_cc:.4}{cc_d} low-CC (<0.1): {low_cc} nodes
Average path length: {avg_pl:.2}
Small-world σ: {sigma:.3}{sigma_d} (>1 = small-world)
Power-law α: {alpha:.2}{alpha_d} (2=hub-dominated, 3=healthy, >3=egalitarian)
Degree Gini: {gini:.3}{gini_d} (0=equal, 1=one-hub)
Community sizes (top 5): {top5}
Categories: core={core} tech={tech} gen={gen} obs={obs} task={task}",
        top5 = sizes.iter().take(5)
            .map(|s| s.to_string())
            .collect::<Vec<_>>()
            .join(", "),
        core = cats.get("core").unwrap_or(&0),
        tech = cats.get("tech").unwrap_or(&0),
        gen = cats.get("gen").unwrap_or(&0),
        obs = cats.get("obs").unwrap_or(&0),
        task = cats.get("task").unwrap_or(&0),
    );

    // Show history trend if we have enough data points
    if history.len() >= 3 {
        report.push_str("\n\nMetrics history (last 5):\n");
        for past in &history[history.len().saturating_sub(5)..] {
            report.push_str(&format!(" {} σ={:.1} α={:.2} gini={:.3} cc={:.4}\n",
                past.date, past.sigma, past.alpha, past.gini, past.avg_cc));
        }
    }
    report
}