Add jaccard() and jaccard_strengths() to Graph. Jaccard similarity measures neighborhood overlap between linked nodes — nodes sharing many neighbors get stronger links, nodes with no shared neighbors get weak links. New subcommand: `poc-memory graph normalize-strengths [--apply]` Scales raw Jaccard (typically 0.0-0.3) to useful range via j*3 clamped to [0.1, 1.0]. Skips implicit temporal edges (strength=1.0). Applied to 64,969 edges. Distribution is bimodal: large cluster at 0.1-0.2 (weak) and spike at 0.9-1.0 (strong), with smooth gradient between. Replaces the meaningless 0.3/0.8 split from manual/agent creation methods. Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
916 lines
32 KiB
Rust
916 lines
32 KiB
Rust
// Graph algorithms: clustering coefficient, community detection (label
|
||
// propagation), schema fit scoring, small-world metrics, consolidation
|
||
// priority scoring.
|
||
//
|
||
// The Graph is built from the Store's nodes + relations. Edges are
|
||
// undirected for clustering/community (even causal edges count as
|
||
// connections), but relation type and direction are preserved for
|
||
// specific queries.
|
||
|
||
use crate::store::{Store, RelationType, StoreView};
|
||
|
||
use serde::{Deserialize, Serialize};
|
||
use std::collections::{HashMap, HashSet, VecDeque};
|
||
|
||
/// Weighted edge in the graph
#[derive(Clone, Debug)]
pub struct Edge {
    /// Key of the node this edge points to.
    pub target: String,
    /// Edge weight; implicit temporal edges are inserted with 1.0,
    /// stored relations carry whatever strength the store recorded.
    pub strength: f32,
    /// Relation type preserved from the store (direction is not
    /// represented here — the adjacency list is undirected).
    pub rel_type: RelationType,
}
|
||
|
||
/// The in-memory graph built from store nodes + relations
pub struct Graph {
    /// Adjacency list: node key → list of edges.
    /// Undirected: every edge is stored once in each endpoint's list
    /// (see `build_adjacency`), so list lengths double-count edges.
    adj: HashMap<String, Vec<Edge>>,
    /// All node keys
    keys: HashSet<String>,
    /// Community labels (from label propagation).
    /// Empty when the graph was built with `build_graph_fast`.
    communities: HashMap<String, u32>,
}
|
||
|
||
impl Graph {
|
||
    /// All node keys in the graph.
    pub fn nodes(&self) -> &HashSet<String> {
        &self.keys
    }
|
||
|
||
pub fn degree(&self, key: &str) -> usize {
|
||
self.adj.get(key).map(|e| e.len()).unwrap_or(0)
|
||
}
|
||
|
||
pub fn edge_count(&self) -> usize {
|
||
self.adj.values().map(|e| e.len()).sum::<usize>() / 2
|
||
}
|
||
|
||
/// All edges for a node (full Edge data including rel_type)
|
||
pub fn edges_of(&self, key: &str) -> &[Edge] {
|
||
self.adj.get(key)
|
||
.map(|v| v.as_slice())
|
||
.unwrap_or(&[])
|
||
}
|
||
|
||
/// All neighbor keys with strengths
|
||
pub fn neighbors(&self, key: &str) -> Vec<(&String, f32)> {
|
||
self.adj.get(key)
|
||
.map(|edges| edges.iter().map(|e| (&e.target, e.strength)).collect())
|
||
.unwrap_or_default()
|
||
}
|
||
|
||
/// Just neighbor keys
|
||
pub fn neighbor_keys(&self, key: &str) -> HashSet<&str> {
|
||
self.adj.get(key)
|
||
.map(|edges| edges.iter().map(|e| e.target.as_str()).collect())
|
||
.unwrap_or_default()
|
||
}
|
||
|
||
/// Jaccard similarity between two nodes' neighborhoods.
|
||
/// Measures overlap: |intersection| / |union| of their neighbor sets.
|
||
pub fn jaccard(&self, a: &str, b: &str) -> f32 {
|
||
let na = self.neighbor_keys(a);
|
||
let nb = self.neighbor_keys(b);
|
||
let intersection = na.intersection(&nb).count();
|
||
let union = na.union(&nb).count();
|
||
if union == 0 { 0.0 } else { intersection as f32 / union as f32 }
|
||
}
|
||
|
||
/// Compute Jaccard-based strength for every edge in the graph.
|
||
/// Returns (source_key, target_key, jaccard_strength) triples.
|
||
/// Scales raw Jaccard (typically 0.0-0.3) to a useful range.
|
||
pub fn jaccard_strengths(&self) -> Vec<(String, String, f32)> {
|
||
let mut result = Vec::new();
|
||
let mut seen = HashSet::new();
|
||
for (key, edges) in &self.adj {
|
||
for edge in edges {
|
||
// Deduplicate undirected edges
|
||
let pair = if key < &edge.target {
|
||
(key.as_str(), edge.target.as_str())
|
||
} else {
|
||
(edge.target.as_str(), key.as_str())
|
||
};
|
||
if !seen.insert((pair.0.to_string(), pair.1.to_string())) {
|
||
continue;
|
||
}
|
||
let j = self.jaccard(key, &edge.target);
|
||
// Scale: raw Jaccard 0.05 → 0.15, 0.15 → 0.45, 0.30 → 0.90
|
||
// Formula: clamp(j * 3, 0.1, 1.0)
|
||
let strength = (j * 3.0).clamp(0.1, 1.0);
|
||
result.push((key.clone(), edge.target.clone(), strength));
|
||
}
|
||
}
|
||
result
|
||
}
|
||
|
||
pub fn community_count(&self) -> usize {
|
||
let labels: HashSet<_> = self.communities.values().collect();
|
||
labels.len()
|
||
}
|
||
|
||
    /// Community label for every node (from label propagation).
    /// Empty when the graph was built with `build_graph_fast`.
    pub fn communities(&self) -> &HashMap<String, u32> {
        &self.communities
    }
|
||
|
||
/// Hub degree threshold: top 5% by degree
|
||
pub fn hub_threshold(&self) -> usize {
|
||
let mut degrees: Vec<usize> = self.keys.iter()
|
||
.map(|k| self.degree(k))
|
||
.collect();
|
||
degrees.sort_unstable();
|
||
if degrees.len() >= 20 {
|
||
degrees[degrees.len() * 95 / 100]
|
||
} else {
|
||
usize::MAX
|
||
}
|
||
}
|
||
|
||
/// Local clustering coefficient: fraction of a node's neighbors
|
||
/// that are also neighbors of each other.
|
||
/// cc(v) = 2E / (deg * (deg - 1))
|
||
pub fn clustering_coefficient(&self, key: &str) -> f32 {
|
||
let neighbors = self.neighbor_keys(key);
|
||
let deg = neighbors.len();
|
||
if deg < 2 {
|
||
return 0.0;
|
||
}
|
||
|
||
let neighbor_vec: Vec<&str> = neighbors.iter().copied().collect();
|
||
let mut triangles = 0u32;
|
||
for i in 0..neighbor_vec.len() {
|
||
for j in (i + 1)..neighbor_vec.len() {
|
||
let ni_neighbors = self.neighbor_keys(neighbor_vec[i]);
|
||
if ni_neighbors.contains(neighbor_vec[j]) {
|
||
triangles += 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
(2.0 * triangles as f32) / (deg as f32 * (deg as f32 - 1.0))
|
||
}
|
||
|
||
/// Average clustering coefficient across all nodes with deg >= 2
|
||
pub fn avg_clustering_coefficient(&self) -> f32 {
|
||
let mut sum = 0.0f32;
|
||
let mut count = 0u32;
|
||
for key in &self.keys {
|
||
if self.degree(key) >= 2 {
|
||
sum += self.clustering_coefficient(key);
|
||
count += 1;
|
||
}
|
||
}
|
||
if count == 0 { 0.0 } else { sum / count as f32 }
|
||
}
|
||
|
||
/// Average shortest path length (sampled BFS from up to 100 nodes)
|
||
pub fn avg_path_length(&self) -> f32 {
|
||
let sample: Vec<&String> = self.keys.iter().take(100).collect();
|
||
if sample.is_empty() { return 0.0; }
|
||
|
||
let mut total_dist = 0u64;
|
||
let mut total_pairs = 0u64;
|
||
|
||
for &start in &sample {
|
||
let dists = self.bfs_distances(start);
|
||
for d in dists.values() {
|
||
if *d > 0 {
|
||
total_dist += *d as u64;
|
||
total_pairs += 1;
|
||
}
|
||
}
|
||
}
|
||
|
||
if total_pairs == 0 { 0.0 } else { total_dist as f32 / total_pairs as f32 }
|
||
}
|
||
|
||
fn bfs_distances(&self, start: &str) -> HashMap<String, u32> {
|
||
let mut dist = HashMap::new();
|
||
let mut queue = VecDeque::new();
|
||
dist.insert(start.to_string(), 0u32);
|
||
queue.push_back(start.to_string());
|
||
|
||
while let Some(node) = queue.pop_front() {
|
||
let d = dist[&node];
|
||
for neighbor in self.neighbor_keys(&node) {
|
||
if !dist.contains_key(neighbor) {
|
||
dist.insert(neighbor.to_string(), d + 1);
|
||
queue.push_back(neighbor.to_string());
|
||
}
|
||
}
|
||
}
|
||
dist
|
||
}
|
||
|
||
/// Power-law exponent α of the degree distribution.
|
||
///
|
||
/// Estimated via MLE: α = 1 + n / Σ ln(k_i / (k_min - 0.5))
|
||
/// α ≈ 2: extreme hub dominance (fragile)
|
||
/// α ≈ 3: healthy scale-free
|
||
/// α > 3: approaching random graph (egalitarian)
|
||
pub fn degree_power_law_exponent(&self) -> f32 {
|
||
let mut degrees: Vec<usize> = self.keys.iter()
|
||
.map(|k| self.degree(k))
|
||
.filter(|&d| d > 0) // exclude isolates
|
||
.collect();
|
||
if degrees.len() < 10 { return 0.0; } // not enough data
|
||
|
||
degrees.sort_unstable();
|
||
let k_min = degrees[0] as f64;
|
||
if k_min < 1.0 { return 0.0; }
|
||
|
||
let n = degrees.len() as f64;
|
||
let sum_ln: f64 = degrees.iter()
|
||
.map(|&k| (k as f64 / (k_min - 0.5)).ln())
|
||
.sum();
|
||
|
||
if sum_ln <= 0.0 { return 0.0; }
|
||
(1.0 + n / sum_ln) as f32
|
||
}
|
||
|
||
/// Gini coefficient of the degree distribution.
|
||
///
|
||
/// 0 = perfectly egalitarian (all nodes same degree)
|
||
/// 1 = maximally unequal (one node has all edges)
|
||
/// Measures hub concentration independent of distribution shape.
|
||
pub fn degree_gini(&self) -> f32 {
|
||
let mut degrees: Vec<f64> = self.keys.iter()
|
||
.map(|k| self.degree(k) as f64)
|
||
.collect();
|
||
let n = degrees.len();
|
||
if n < 2 { return 0.0; }
|
||
|
||
degrees.sort_by(|a, b| a.total_cmp(b));
|
||
let mean = degrees.iter().sum::<f64>() / n as f64;
|
||
if mean < 1e-10 { return 0.0; }
|
||
|
||
// Gini = (2 Σ i·x_i) / (n Σ x_i) - (n+1)/n
|
||
let weighted_sum: f64 = degrees.iter().enumerate()
|
||
.map(|(i, &d)| (i as f64 + 1.0) * d)
|
||
.sum();
|
||
let total = degrees.iter().sum::<f64>();
|
||
|
||
let gini = (2.0 * weighted_sum) / (n as f64 * total) - (n as f64 + 1.0) / n as f64;
|
||
gini.max(0.0) as f32
|
||
}
|
||
|
||
/// Small-world coefficient σ = (C/C_rand) / (L/L_rand)
|
||
/// C_rand ≈ <k>/n, L_rand ≈ ln(n)/ln(<k>)
|
||
pub fn small_world_sigma(&self) -> f32 {
|
||
let n = self.keys.len() as f32;
|
||
if n < 10.0 { return 0.0; }
|
||
|
||
let avg_degree = self.adj.values()
|
||
.map(|e| e.len() as f32)
|
||
.sum::<f32>() / n;
|
||
if avg_degree < 1.0 { return 0.0; }
|
||
|
||
let c = self.avg_clustering_coefficient();
|
||
let l = self.avg_path_length();
|
||
|
||
let c_rand = avg_degree / n;
|
||
let l_rand = n.ln() / avg_degree.ln();
|
||
|
||
if c_rand < 1e-10 || l_rand < 1e-10 || l < 1e-10 {
|
||
return 0.0;
|
||
}
|
||
|
||
(c / c_rand) / (l / l_rand)
|
||
}
|
||
}
|
||
|
||
/// Impact of adding a hypothetical edge.
///
/// Produced by `Graph::link_impact`; all deltas are estimates computed
/// without mutating the graph.
#[derive(Debug)]
pub struct LinkImpact {
    /// Proposed edge source key.
    pub source: String,
    /// Proposed edge target key.
    pub target: String,
    /// Current degree of the source (before adding the edge).
    pub source_deg: usize,
    /// Current degree of the target (before adding the edge).
    pub target_deg: usize,
    /// Is this a hub link? (either endpoint in top 5% by degree)
    pub is_hub_link: bool,
    /// Are both endpoints in the same community?
    pub same_community: bool,
    /// Change in clustering coefficient for source
    pub delta_cc_source: f32,
    /// Change in clustering coefficient for target
    pub delta_cc_target: f32,
    /// Change in degree Gini (positive = more hub-dominated)
    pub delta_gini: f32,
    /// Qualitative assessment
    pub assessment: &'static str,
}
|
||
|
||
impl Graph {
    /// Simulate adding an edge and report impact on topology metrics.
    ///
    /// Doesn't modify the graph — computes what would change if the
    /// edge were added.
    pub fn link_impact(&self, source: &str, target: &str) -> LinkImpact {
        let source_deg = self.degree(source);
        let target_deg = self.degree(target);
        let hub_threshold = self.hub_threshold();
        // Hub if EITHER endpoint is in the top 5% by degree.
        let is_hub_link = source_deg >= hub_threshold || target_deg >= hub_threshold;

        // Community check — nodes missing a label never count as same-community.
        let sc = self.communities.get(source);
        let tc = self.communities.get(target);
        let same_community = match (sc, tc) {
            (Some(a), Some(b)) => a == b,
            _ => false,
        };

        // CC change for source: adding target as neighbor changes the
        // triangle count. New triangles form for each node that's a
        // neighbor of BOTH source and target.
        let source_neighbors = self.neighbor_keys(source);
        let target_neighbors = self.neighbor_keys(target);
        let shared_neighbors = source_neighbors.intersection(&target_neighbors).count();

        let cc_before_source = self.clustering_coefficient(source);
        let cc_before_target = self.clustering_coefficient(target);

        // Estimate new CC for source after adding edge.
        // NOTE(review): the triangle count is recovered from the float CC
        // by inverting cc = 2E/(d(d-1)); the `as u32` cast truncates, so a
        // tiny float error can undercount by one triangle.
        let new_source_deg = source_deg + 1;
        let new_source_triangles = if source_deg >= 2 {
            // Current triangles + new ones from shared neighbors
            let current_triangles = (cc_before_source
                * source_deg as f32 * (source_deg as f32 - 1.0) / 2.0) as u32;
            current_triangles + shared_neighbors as u32
        } else {
            // CC was 0 by definition (deg < 2); only the new triangles count.
            shared_neighbors as u32
        };
        let cc_after_source = if new_source_deg >= 2 {
            (2.0 * new_source_triangles as f32)
                / (new_source_deg as f32 * (new_source_deg as f32 - 1.0))
        } else {
            0.0
        };

        // Same estimate, mirrored for the target endpoint.
        let new_target_deg = target_deg + 1;
        let new_target_triangles = if target_deg >= 2 {
            let current_triangles = (cc_before_target
                * target_deg as f32 * (target_deg as f32 - 1.0) / 2.0) as u32;
            current_triangles + shared_neighbors as u32
        } else {
            shared_neighbors as u32
        };
        let cc_after_target = if new_target_deg >= 2 {
            (2.0 * new_target_triangles as f32)
                / (new_target_deg as f32 * (new_target_deg as f32 - 1.0))
        } else {
            0.0
        };

        // Gini change via influence function:
        // IF(x; Gini, F) = (2F(x) - 1) * x/μ - Gini - 1
        // Adding an edge increments two degrees. The net ΔGini is the sum
        // of influence contributions from both endpoints shifting up by 1.
        let gini_before = self.degree_gini();
        let n = self.keys.len();
        let total_degree: f64 = self.keys.iter()
            .map(|k| self.degree(k) as f64)
            .sum();
        let mean_deg = if n > 0 { total_degree / n as f64 } else { 1.0 };

        // CDF at each endpoint's degree: fraction of nodes with degree ≤ d
        let delta_gini = if mean_deg > 1e-10 && n >= 2 {
            // Count nodes with degree ≤ source_deg and ≤ target_deg
            let f_source = self.keys.iter()
                .filter(|k| self.degree(k) <= source_deg)
                .count() as f64 / n as f64;
            let f_target = self.keys.iter()
                .filter(|k| self.degree(k) <= target_deg)
                .count() as f64 / n as f64;

            // Influence of incrementing source's degree by 1
            let new_source = (source_deg + 1) as f64;
            let if_source = (2.0 * f_source - 1.0) * new_source / mean_deg
                - gini_before as f64 - 1.0;
            // Influence of incrementing target's degree by 1
            let new_target = (target_deg + 1) as f64;
            let if_target = (2.0 * f_target - 1.0) * new_target / mean_deg
                - gini_before as f64 - 1.0;

            // Scale: each point contributes 1/n to the distribution
            ((if_source + if_target) / n as f64) as f32
        } else {
            0.0f32
        };

        // Qualitative assessment. The five explicit arms cover every
        // combination of (is_hub_link, same_community, shared_neighbors),
        // so the final "neutral" arm is an unreachable safety default.
        let assessment = if is_hub_link && same_community {
            "hub-reinforcing: strengthens existing star topology"
        } else if is_hub_link && !same_community {
            "hub-bridging: cross-community but through a hub"
        } else if !is_hub_link && same_community && shared_neighbors > 0 {
            "lateral-clustering: strengthens local mesh topology"
        } else if !is_hub_link && !same_community {
            "lateral-bridging: best kind — cross-community lateral link"
        } else if !is_hub_link && same_community {
            "lateral-local: connects peripheral nodes in same community"
        } else {
            "neutral"
        };

        LinkImpact {
            source: source.to_string(),
            target: target.to_string(),
            source_deg,
            target_deg,
            is_hub_link,
            same_community,
            delta_cc_source: cc_after_source - cc_before_source,
            delta_cc_target: cc_after_target - cc_before_target,
            delta_gini,
            assessment,
        }
    }
}
|
||
|
||
/// Build graph from store data (with community detection)
|
||
pub fn build_graph(store: &impl StoreView) -> Graph {
|
||
let (adj, keys) = build_adjacency(store);
|
||
let communities = label_propagation(&keys, &adj, 20);
|
||
Graph { adj, keys, communities }
|
||
}
|
||
|
||
/// Build graph without community detection — for spreading activation
|
||
/// searches where we only need the adjacency list.
|
||
pub fn build_graph_fast(store: &impl StoreView) -> Graph {
|
||
let (adj, keys) = build_adjacency(store);
|
||
Graph { adj, keys, communities: HashMap::new() }
|
||
}
|
||
|
||
fn build_adjacency(store: &impl StoreView) -> (HashMap<String, Vec<Edge>>, HashSet<String>) {
|
||
let mut adj: HashMap<String, Vec<Edge>> = HashMap::new();
|
||
let mut keys: HashSet<String> = HashSet::new();
|
||
|
||
store.for_each_node(|key, _, _| {
|
||
keys.insert(key.to_owned());
|
||
});
|
||
|
||
store.for_each_relation(|source_key, target_key, strength, rel_type| {
|
||
if !keys.contains(source_key) || !keys.contains(target_key) {
|
||
return;
|
||
}
|
||
|
||
adj.entry(source_key.to_owned()).or_default().push(Edge {
|
||
target: target_key.to_owned(),
|
||
strength,
|
||
rel_type,
|
||
});
|
||
adj.entry(target_key.to_owned()).or_default().push(Edge {
|
||
target: source_key.to_owned(),
|
||
strength,
|
||
rel_type,
|
||
});
|
||
});
|
||
|
||
add_implicit_temporal_edges(store, &keys, &mut adj);
|
||
|
||
(adj, keys)
|
||
}
|
||
|
||
/// Add implicit edges for the temporal/digest hierarchy.
///
/// These edges are derived from node types and dates — they don't
/// need to be stored. Two kinds:
/// - parent/child: session→daily→weekly→monthly (by date containment)
/// - prev/next: chronological ordering within each level
///
/// Sessions use their timestamp for date. Digest nodes (daily/weekly/monthly)
/// extract the date they *cover* from the key name, since their timestamp
/// is when the digest was created, not what period it covers.
///
/// All edges added here get strength 1.0 and `RelationType::Auto`.
fn add_implicit_temporal_edges(
    store: &impl StoreView,
    keys: &HashSet<String>,
    adj: &mut HashMap<String, Vec<Edge>>,
) {
    use crate::store::NodeType::*;
    use chrono::{Datelike, DateTime, NaiveDate};

    // Extract the covered date from a key name.
    // Patterns: "daily-2026-03-06", "daily-2026-03-06-identity",
    // "weekly-2026-W09", "monthly-2026-02"
    // "journal#j-2026-03-13t...", "journal#2026-03-13-..."
    fn date_from_key(key: &str) -> Option<NaiveDate> {
        // Try extracting YYYY-MM-DD after known prefixes.
        // "journal#j-" must be tried before "journal#" so the "j-" form
        // isn't mis-parsed by the shorter prefix.
        for prefix in ["daily-", "journal#j-", "journal#"] {
            if let Some(rest) = key.strip_prefix(prefix) {
                if rest.len() >= 10 {
                    if let Ok(d) = NaiveDate::parse_from_str(&rest[..10], "%Y-%m-%d") {
                        return Some(d);
                    }
                }
            }
        }
        None
    }

    fn week_from_key(key: &str) -> Option<(i32, u32)> {
        // "weekly-2026-W09" → (2026, 9)
        let rest = key.strip_prefix("weekly-")?;
        let (year_str, w_str) = rest.split_once("-W")?;
        let year: i32 = year_str.parse().ok()?;
        // Week string might have a suffix like "-foo"
        let week_str = w_str.split('-').next()?;
        let week: u32 = week_str.parse().ok()?;
        Some((year, week))
    }

    fn month_from_key(key: &str) -> Option<(i32, u32)> {
        // "monthly-2026-02" → (2026, 2)
        let rest = key.strip_prefix("monthly-")?;
        let (year_str, month_str) = rest.split_once('-')?;
        let year: i32 = year_str.parse().ok()?;
        let month_str = month_str.split('-').next()?;
        let month: u32 = month_str.parse().ok()?;
        Some((year, month))
    }

    // Collect episodic nodes by type
    struct Dated { key: String, ts: i64, date: NaiveDate }

    let mut sessions: Vec<Dated> = Vec::new();
    let mut dailies: Vec<(String, NaiveDate)> = Vec::new();
    let mut weeklies: Vec<(String, (i32, u32))> = Vec::new();
    let mut monthlies: Vec<(String, (i32, u32))> = Vec::new();

    store.for_each_node_meta(|key, node_type, ts| {
        // Only nodes that made it into the graph's key set.
        if !keys.contains(key) { return; }
        match node_type {
            EpisodicSession => {
                // Prefer date from key (local time) over timestamp (UTC)
                // to avoid timezone mismatches
                let date = date_from_key(key).or_else(|| {
                    DateTime::from_timestamp(ts, 0).map(|dt| dt.date_naive())
                });
                if let Some(date) = date {
                    sessions.push(Dated { key: key.to_owned(), ts, date });
                }
            }
            EpisodicDaily => {
                if let Some(date) = date_from_key(key) {
                    dailies.push((key.to_owned(), date));
                }
            }
            EpisodicWeekly => {
                if let Some(yw) = week_from_key(key) {
                    weeklies.push((key.to_owned(), yw));
                }
            }
            EpisodicMonthly => {
                if let Some(ym) = month_from_key(key) {
                    monthlies.push((key.to_owned(), ym));
                }
            }
            _ => {}
        }
    });

    // Chronological order within each level (needed for prev/next links).
    sessions.sort_by_key(|d| d.ts);
    dailies.sort_by_key(|(_, d)| *d);
    weeklies.sort_by_key(|(_, yw)| *yw);
    monthlies.sort_by_key(|(_, ym)| *ym);

    // Insert an undirected edge a↔b unless a already links to b.
    // (Only a's list is checked; edges are always added symmetrically
    // here, so a one-sided check suffices.)
    let add_edge = |adj: &mut HashMap<String, Vec<Edge>>, a: &str, b: &str| {
        if let Some(edges) = adj.get(a) {
            if edges.iter().any(|e| e.target == b) { return; }
        }
        adj.entry(a.to_owned()).or_default().push(Edge {
            target: b.to_owned(),
            strength: 1.0,
            rel_type: RelationType::Auto,
        });
        adj.entry(b.to_owned()).or_default().push(Edge {
            target: a.to_owned(),
            strength: 1.0,
            rel_type: RelationType::Auto,
        });
    };

    // Build indexes: date→dailies, (year,week)→weekly, (year,month)→monthly
    // Note: multiple dailies can share a date (e.g. daily-2026-03-06-identity,
    // daily-2026-03-06-technical), so we collect all of them.
    let mut date_to_dailies: HashMap<NaiveDate, Vec<String>> = HashMap::new();
    for (key, date) in &dailies {
        date_to_dailies.entry(*date).or_default().push(key.clone());
    }

    let mut yw_to_weekly: HashMap<(i32, u32), String> = HashMap::new();
    for (key, yw) in &weeklies {
        yw_to_weekly.insert(*yw, key.clone());
    }

    let mut ym_to_monthly: HashMap<(i32, u32), String> = HashMap::new();
    for (key, ym) in &monthlies {
        ym_to_monthly.insert(*ym, key.clone());
    }

    // Session → Daily (parent): each session links to all dailies for its date
    for sess in &sessions {
        if let Some(daily_keys) = date_to_dailies.get(&sess.date) {
            for daily in daily_keys {
                add_edge(adj, &sess.key, daily);
            }
        }
    }

    // Daily → Weekly (parent)
    for (key, date) in &dailies {
        let yw = (date.iso_week().year(), date.iso_week().week());
        if let Some(weekly) = yw_to_weekly.get(&yw) {
            add_edge(adj, key, weekly);
        }
    }

    // Weekly → Monthly (parent)
    for (key, yw) in &weeklies {
        // A week can span two months; use the Thursday date (ISO week convention)
        let thursday = NaiveDate::from_isoywd_opt(yw.0, yw.1, chrono::Weekday::Thu);
        if let Some(d) = thursday {
            let ym = (d.year(), d.month());
            if let Some(monthly) = ym_to_monthly.get(&ym) {
                add_edge(adj, key, monthly);
            }
        }
    }

    // Prev/next within each level
    for pair in sessions.windows(2) {
        add_edge(adj, &pair[0].key, &pair[1].key);
    }
    for pair in dailies.windows(2) {
        add_edge(adj, &pair[0].0, &pair[1].0);
    }
    for pair in weeklies.windows(2) {
        add_edge(adj, &pair[0].0, &pair[1].0);
    }
    for pair in monthlies.windows(2) {
        add_edge(adj, &pair[0].0, &pair[1].0);
    }

}
|
||
|
||
/// Label propagation community detection.
///
/// Each node starts with its own label. Each iteration: adopt the most
/// common label among neighbors (weighted by edge strength). Iterate
/// until stable or max_iterations.
///
/// NOTE(review): node visit order comes from a HashSet and vote ties are
/// broken by HashMap iteration order, so results can differ between runs
/// on the same graph — confirm whether determinism matters to callers.
fn label_propagation(
    keys: &HashSet<String>,
    adj: &HashMap<String, Vec<Edge>>,
    max_iterations: u32,
) -> HashMap<String, u32> {
    // Only consider edges above this strength for community votes.
    // Weak auto-links from triangle closure (0.15-0.35) bridge
    // unrelated clusters — filtering them lets natural communities emerge.
    let min_strength: f32 = 0.3;

    // Initialize: each node gets its own label
    let key_vec: Vec<String> = keys.iter().cloned().collect();
    let mut labels: HashMap<String, u32> = key_vec.iter()
        .enumerate()
        .map(|(i, k)| (k.clone(), i as u32))
        .collect();

    for _iter in 0..max_iterations {
        let mut changed = false;

        for key in &key_vec {
            let edges = match adj.get(key) {
                Some(e) => e,
                None => continue,
            };
            if edges.is_empty() { continue; }

            // Count weighted votes for each label (skip weak edges)
            let mut votes: HashMap<u32, f32> = HashMap::new();
            for edge in edges {
                if edge.strength < min_strength { continue; }
                if let Some(&label) = labels.get(&edge.target) {
                    *votes.entry(label).or_default() += edge.strength;
                }
            }

            // Adopt the label with most votes
            if let Some((&best_label, _)) = votes.iter()
                .max_by(|a, b| a.1.total_cmp(b.1))
            {
                let current = labels[key];
                if best_label != current {
                    labels.insert(key.clone(), best_label);
                    changed = true;
                }
            }
        }

        // Converged: a full pass with no label changes.
        if !changed { break; }
    }

    // Compact labels to 0..n
    let mut label_map: HashMap<u32, u32> = HashMap::new();
    let mut next_id = 0;
    for label in labels.values_mut() {
        let new_label = *label_map.entry(*label).or_insert_with(|| {
            let id = next_id;
            next_id += 1;
            id
        });
        *label = new_label;
    }

    labels
}
|
||
|
||
|
||
/// A snapshot of graph topology metrics, for tracking evolution over time
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct MetricsSnapshot {
    /// Epoch seconds when the snapshot was taken.
    pub timestamp: i64,
    /// Human-readable rendering of `timestamp`.
    pub date: String,
    /// Total node count.
    pub nodes: usize,
    /// Total undirected edge count.
    pub edges: usize,
    /// Number of distinct community labels.
    pub communities: usize,
    /// Small-world coefficient σ (>1 suggests small-world structure).
    pub sigma: f32,
    /// Power-law exponent α of the degree distribution.
    pub alpha: f32,
    /// Degree Gini coefficient (0 = equal, 1 = one hub owns all edges).
    pub gini: f32,
    /// Average local clustering coefficient.
    pub avg_cc: f32,
    /// Average shortest path length (sampled BFS).
    pub avg_path_length: f32,
    // Removed: avg_schema_fit was identical to avg_cc.
    // Old snapshots with the field still deserialize (serde ignores unknown fields by default).
}
|
||
|
||
fn metrics_log_path() -> std::path::PathBuf {
|
||
crate::store::memory_dir().join("metrics.jsonl")
|
||
}
|
||
|
||
/// Load previous metrics snapshots
|
||
pub fn load_metrics_history() -> Vec<MetricsSnapshot> {
|
||
crate::util::jsonl_load(&metrics_log_path())
|
||
}
|
||
|
||
/// Append a metrics snapshot to the log.
///
/// Best-effort: write errors are deliberately discarded (`let _ =`) so
/// that metrics logging can never fail the caller.
pub fn save_metrics_snapshot(snap: &MetricsSnapshot) {
    let _ = crate::util::jsonl_append(&metrics_log_path(), snap);
}
|
||
|
||
/// Compute current graph metrics as a snapshot (no side effects).
|
||
pub fn current_metrics(graph: &Graph) -> MetricsSnapshot {
|
||
let now = crate::store::now_epoch();
|
||
let date = crate::store::format_datetime_space(now);
|
||
MetricsSnapshot {
|
||
timestamp: now,
|
||
date,
|
||
nodes: graph.nodes().len(),
|
||
edges: graph.edge_count(),
|
||
communities: graph.community_count(),
|
||
sigma: graph.small_world_sigma(),
|
||
alpha: graph.degree_power_law_exponent(),
|
||
gini: graph.degree_gini(),
|
||
avg_cc: graph.avg_clustering_coefficient(),
|
||
avg_path_length: graph.avg_path_length(),
|
||
}
|
||
}
|
||
|
||
/// Health report: summary of graph metrics.
/// Saves a metrics snapshot as a side effect (callers who want pure
/// computation should use `current_metrics` + `save_metrics_snapshot`).
pub fn health_report(graph: &Graph, store: &Store) -> String {
    let snap = current_metrics(graph);
    save_metrics_snapshot(&snap);

    let n = snap.nodes;
    let e = snap.edges;
    let avg_cc = snap.avg_cc;
    let avg_pl = snap.avg_path_length;
    let sigma = snap.sigma;
    let alpha = snap.alpha;
    let gini = snap.gini;
    let communities = snap.communities;

    // Community sizes
    let mut comm_sizes: HashMap<u32, usize> = HashMap::new();
    for label in graph.communities().values() {
        *comm_sizes.entry(*label).or_default() += 1;
    }
    // Largest communities first.
    let mut sizes: Vec<usize> = comm_sizes.values().copied().collect();
    sizes.sort_unstable_by(|a, b| b.cmp(a));

    // Degree distribution
    let mut degrees: Vec<usize> = graph.nodes().iter()
        .map(|k| graph.degree(k))
        .collect();
    degrees.sort_unstable();
    let max_deg = degrees.last().copied().unwrap_or(0);
    // Upper median for even-length lists.
    let median_deg = if degrees.is_empty() { 0 } else { degrees[degrees.len() / 2] };
    let avg_deg = if n == 0 { 0.0 } else {
        degrees.iter().sum::<usize>() as f64 / n as f64
    };

    // Low-CC nodes: poorly integrated
    let low_cc = graph.nodes().iter()
        .filter(|k| graph.clustering_coefficient(k) < 0.1)
        .count();

    // Orphan edges: relations referencing non-existent nodes
    let mut orphan_edges = 0usize;
    let mut missing_nodes: HashSet<String> = HashSet::new();
    for rel in &store.relations {
        if rel.deleted { continue; }
        let s_missing = !store.nodes.contains_key(&rel.source_key);
        let t_missing = !store.nodes.contains_key(&rel.target_key);
        if s_missing || t_missing {
            orphan_edges += 1;
            if s_missing { missing_nodes.insert(rel.source_key.clone()); }
            if t_missing { missing_nodes.insert(rel.target_key.clone()); }
        }
    }

    // NodeType breakdown
    let mut type_counts: HashMap<&str, usize> = HashMap::new();
    for node in store.nodes.values() {
        let label = match node.node_type {
            crate::store::NodeType::EpisodicSession => "episodic",
            crate::store::NodeType::EpisodicDaily => "daily",
            crate::store::NodeType::EpisodicWeekly => "weekly",
            crate::store::NodeType::EpisodicMonthly => "monthly",
            crate::store::NodeType::Semantic => "semantic",
        };
        *type_counts.entry(label).or_default() += 1;
    }

    // Load history for deltas
    let history = load_metrics_history();
    let prev = if history.len() >= 2 {
        Some(&history[history.len() - 2]) // second-to-last (last is the one we just wrote)
    } else {
        None
    };

    // Render " (Δ±x.xxx)" against the previous snapshot; empty when there
    // is no previous value or the change is below display precision.
    fn delta(current: f32, prev: Option<f32>) -> String {
        match prev {
            Some(p) => {
                let d = current - p;
                if d.abs() < 0.001 { String::new() }
                else { format!(" (Δ{:+.3})", d) }
            }
            None => String::new(),
        }
    }

    let sigma_d = delta(sigma, prev.map(|p| p.sigma));
    let alpha_d = delta(alpha, prev.map(|p| p.alpha));
    let gini_d = delta(gini, prev.map(|p| p.gini));
    let cc_d = delta(avg_cc, prev.map(|p| p.avg_cc));

    let mut report = format!(
        "Memory Health Report
====================
Nodes: {n} Relations: {e} Communities: {communities}

Degree: max={max_deg} median={median_deg} avg={avg_deg:.1}
Clustering coefficient (avg): {avg_cc:.4}{cc_d} low-CC (<0.1): {low_cc} nodes
Average path length: {avg_pl:.2}
Small-world σ: {sigma:.3}{sigma_d} (>1 = small-world)
Power-law α: {alpha:.2}{alpha_d} (2=hub-dominated, 3=healthy, >3=egalitarian)
Degree Gini: {gini:.3}{gini_d} (0=equal, 1=one-hub)

Community sizes (top 5): {top5}
Types: semantic={semantic} episodic={episodic} daily={daily} weekly={weekly} monthly={monthly}",
        top5 = sizes.iter().take(5)
            .map(|s| s.to_string())
            .collect::<Vec<_>>()
            .join(", "),
        semantic = type_counts.get("semantic").unwrap_or(&0),
        episodic = type_counts.get("episodic").unwrap_or(&0),
        daily = type_counts.get("daily").unwrap_or(&0),
        weekly = type_counts.get("weekly").unwrap_or(&0),
        monthly = type_counts.get("monthly").unwrap_or(&0),
    );

    // Orphan edges
    if orphan_edges == 0 {
        report.push_str("\n\nBroken links: 0");
    } else {
        report.push_str(&format!(
            "\n\nBroken links: {} edges reference {} missing nodes",
            orphan_edges, missing_nodes.len()));
        // Sorted for stable output; cap the listing at 10 keys.
        let mut sorted: Vec<_> = missing_nodes.iter().collect();
        sorted.sort();
        for key in sorted.iter().take(10) {
            report.push_str(&format!("\n - {}", key));
        }
        if sorted.len() > 10 {
            report.push_str(&format!("\n ... and {} more", sorted.len() - 10));
        }
    }

    // Show history trend if we have enough data points
    if history.len() >= 3 {
        report.push_str("\n\nMetrics history (last 5):\n");
        for snap in &history[history.len().saturating_sub(5)..] {
            report.push_str(&format!(" {} — σ={:.1} α={:.2} gini={:.3} cc={:.4}\n",
                snap.date, snap.sigma, snap.alpha, snap.gini, snap.avg_cc));
        }
    }

    report
}
|