spectral.rs: remove print_summary, to_embedding, save_embedding, nearest_neighbors, unlinked_neighbors, dominant_dimensions, SpectralResult, shorten_key. Core functions (load_embedding, nearest_to_seeds_weighted, analyze_positions, etc.) kept. identity.rs: remove context_file_info (zero callers). Co-Authored-By: Proof of Concept <poc@bcachefs.org>
257 lines
9 KiB
Rust
257 lines
9 KiB
Rust
// Spectral decomposition of the memory graph.
|
|
//
|
|
// Computes eigenvalues and eigenvectors of the normalized graph Laplacian.
|
|
// The eigenvectors provide natural coordinates for each node — connected
|
|
// nodes land nearby, communities form clusters, bridges sit between clusters.
|
|
//
|
|
// The eigenvalue spectrum reveals:
|
|
// - Number of connected components (count of zero eigenvalues)
|
|
// - Number of natural communities (eigenvalues near zero, before the gap)
|
|
// - How well-connected the graph is (Fiedler value = second eigenvalue)
|
|
//
|
|
// The eigenvectors provide:
|
|
// - Spectral coordinates for each node (the embedding)
|
|
// - Community membership (sign/magnitude of Fiedler vector)
|
|
// - Natural projections (select which eigenvectors to include)
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::path::PathBuf;
|
|
|
|
/// Per-node spectral embedding, serializable to disk.
|
|
#[derive(Serialize, Deserialize)]
|
|
pub struct SpectralEmbedding {
|
|
/// Number of dimensions (eigenvectors)
|
|
pub dims: usize,
|
|
/// Eigenvalues for each dimension
|
|
pub eigenvalues: Vec<f64>,
|
|
/// Node key → coordinate vector
|
|
pub coords: HashMap<String, Vec<f64>>,
|
|
}
|
|
|
|
pub fn embedding_path() -> PathBuf {
|
|
crate::store::memory_dir().join("spectral-embedding.json")
|
|
}
|
|
|
|
/// Load embedding from disk.
|
|
pub fn load_embedding() -> Result<SpectralEmbedding, String> {
|
|
let path = embedding_path();
|
|
let data = std::fs::read_to_string(&path)
|
|
.map_err(|e| format!("read {}: {}", path.display(), e))?;
|
|
serde_json::from_str(&data)
|
|
.map_err(|e| format!("parse embedding: {}", e))
|
|
}
|
|
|
|
/// Find nearest neighbors to weighted seed nodes, using link weights.
|
|
///
|
|
/// Each seed has a weight (from query term weighting). For candidates
|
|
/// directly linked to a seed, the spectral distance is scaled by
|
|
/// 1/link_strength — strong links make effective distance shorter.
|
|
/// Seed weight scales the contribution: high-weight seeds pull harder.
|
|
///
|
|
/// Returns (key, effective_distance) sorted by distance ascending.
|
|
pub fn nearest_to_seeds_weighted(
|
|
emb: &SpectralEmbedding,
|
|
seeds: &[(&str, f64)], // (key, seed_weight)
|
|
graph: Option<&crate::graph::Graph>,
|
|
k: usize,
|
|
) -> Vec<(String, f64)> {
|
|
let seed_set: HashSet<&str> = seeds.iter().map(|(s, _)| *s).collect();
|
|
|
|
let seed_data: Vec<(&str, &Vec<f64>, f64)> = seeds.iter()
|
|
.filter_map(|(s, w)| {
|
|
emb.coords.get(*s)
|
|
.filter(|c| c.iter().any(|&v| v.abs() > 1e-12)) // skip degenerate seeds
|
|
.map(|c| (*s, c, *w))
|
|
})
|
|
.collect();
|
|
if seed_data.is_empty() {
|
|
return vec![];
|
|
}
|
|
|
|
// Build seed→neighbor link strength lookup
|
|
let link_strengths: HashMap<(&str, &str), f32> = if let Some(g) = graph {
|
|
let mut map = HashMap::new();
|
|
for &(seed_key, _) in seeds {
|
|
for (neighbor, strength) in g.neighbors(seed_key) {
|
|
map.insert((seed_key, neighbor.as_str()), strength);
|
|
}
|
|
}
|
|
map
|
|
} else {
|
|
HashMap::new()
|
|
};
|
|
|
|
let dim_weights = eigenvalue_weights(&emb.eigenvalues);
|
|
|
|
let mut distances: Vec<(String, f64)> = emb.coords.iter()
|
|
.filter(|(k, coords)| {
|
|
!seed_set.contains(k.as_str())
|
|
&& coords.iter().any(|&v| v.abs() > 1e-12) // skip degenerate zero-coord nodes
|
|
})
|
|
.map(|(candidate_key, coords)| {
|
|
let min_dist = seed_data.iter()
|
|
.map(|(seed_key, sc, seed_weight)| {
|
|
let raw_dist = weighted_distance(coords, sc, &dim_weights);
|
|
|
|
// Scale by link strength if directly connected
|
|
let link_scale = link_strengths
|
|
.get(&(*seed_key, candidate_key.as_str()))
|
|
.map(|&s| 1.0 / (1.0 + s as f64)) // strong link → smaller distance
|
|
.unwrap_or(1.0);
|
|
|
|
raw_dist * link_scale / seed_weight
|
|
})
|
|
.fold(f64::MAX, f64::min);
|
|
(candidate_key.clone(), min_dist)
|
|
})
|
|
.collect();
|
|
|
|
distances.sort_by(|a, b| a.1.total_cmp(&b.1));
|
|
distances.truncate(k);
|
|
distances
|
|
}
|
|
|
|
/// Weighted Euclidean distance between two points in spectral space.
///
/// Each squared coordinate difference is multiplied by the matching entry of
/// `weights` before summing; the square root of the total is returned.
/// Callers in this module pass the output of `eigenvalue_weights`, i.e.
/// 1/eigenvalue per dimension, so coarser structure matters more.
///
/// The three slices are walked in lockstep, so entries beyond the length of
/// the shortest slice are ignored.
fn weighted_distance(a: &[f64], b: &[f64], weights: &[f64]) -> f64 {
    let weighted_sq_sum: f64 = a
        .iter()
        .zip(b)
        .zip(weights)
        .map(|((&x, &y), &w)| {
            let delta = x - y;
            w * delta * delta
        })
        .sum();

    weighted_sq_sum.sqrt()
}
|
|
|
|
/// Turn eigenvalues into per-dimension weights for distance calculations.
///
/// An eigenvalue greater than `1e-8` maps to its reciprocal; anything else
/// (zero, near-zero, negative or NaN) maps to `0.0`, which removes that
/// dimension from the distance. The output has the same length and order as
/// the input.
fn eigenvalue_weights(eigenvalues: &[f64]) -> Vec<f64> {
    let mut weights = Vec::with_capacity(eigenvalues.len());
    for &lambda in eigenvalues {
        let weight = if lambda > 1e-8 { 1.0 / lambda } else { 0.0 };
        weights.push(weight);
    }
    weights
}
|
|
|
|
/// Compute cluster centers (centroids) in spectral space.
|
|
pub fn cluster_centers(
|
|
emb: &SpectralEmbedding,
|
|
communities: &HashMap<String, u32>,
|
|
) -> HashMap<u32, Vec<f64>> {
|
|
let mut sums: HashMap<u32, (Vec<f64>, usize)> = HashMap::new();
|
|
|
|
for (key, coords) in &emb.coords {
|
|
if let Some(&comm) = communities.get(key) {
|
|
let entry = sums.entry(comm)
|
|
.or_insert_with(|| (vec![0.0; emb.dims], 0));
|
|
for (i, &c) in coords.iter().enumerate() {
|
|
entry.0[i] += c;
|
|
}
|
|
entry.1 += 1;
|
|
}
|
|
}
|
|
|
|
sums.into_iter()
|
|
.map(|(comm, (sum, count))| {
|
|
let center: Vec<f64> = sum.iter()
|
|
.map(|s| s / count as f64)
|
|
.collect();
|
|
(comm, center)
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Per-node analysis of spectral position relative to communities.
///
/// Produced by `analyze_positions`, one value per node that has both
/// coordinates and a community assignment.
#[derive(Debug, Clone, PartialEq)]
pub struct SpectralPosition {
    /// Node key, as used in the embedding's coordinate map.
    pub key: String,
    /// Community the node is assigned to.
    pub community: u32,
    /// Distance to own community center
    pub dist_to_center: f64,
    /// Distance to nearest OTHER community center.
    ///
    /// `f64::MAX` when there is no other community.
    pub dist_to_nearest: f64,
    /// Which community is nearest (other than own).
    ///
    /// Equal to `community` when there is no other community.
    pub nearest_community: u32,
    /// dist_to_center / median_dist_in_community (>1 = outlier).
    ///
    /// The median is floored at `1e-6` before dividing.
    pub outlier_score: f64,
    /// dist_to_center / dist_to_nearest (>1 = between clusters, potential bridge).
    ///
    /// `0.0` when `dist_to_nearest` is not greater than `1e-8`.
    pub bridge_score: f64,
}
|
|
|
|
/// Analyze spectral positions for all nodes.
|
|
///
|
|
/// Returns positions sorted by outlier_score descending (most displaced first).
|
|
pub fn analyze_positions(
|
|
emb: &SpectralEmbedding,
|
|
communities: &HashMap<String, u32>,
|
|
) -> Vec<SpectralPosition> {
|
|
let centers = cluster_centers(emb, communities);
|
|
let weights = eigenvalue_weights(&emb.eigenvalues);
|
|
|
|
// Compute distances to own community center
|
|
let mut by_community: HashMap<u32, Vec<f64>> = HashMap::new();
|
|
let mut node_dists: Vec<(String, u32, f64)> = Vec::new();
|
|
|
|
for (key, coords) in &emb.coords {
|
|
if let Some(&comm) = communities.get(key)
|
|
&& let Some(center) = centers.get(&comm) {
|
|
let dist = weighted_distance(coords, center, &weights);
|
|
by_community.entry(comm).or_default().push(dist);
|
|
node_dists.push((key.clone(), comm, dist));
|
|
}
|
|
}
|
|
|
|
// Median distance per community for outlier scoring
|
|
let medians: HashMap<u32, f64> = by_community.into_iter()
|
|
.map(|(comm, mut dists)| {
|
|
dists.sort_by(|a, b| a.total_cmp(b));
|
|
let median = if dists.is_empty() {
|
|
1.0
|
|
} else if dists.len() % 2 == 0 {
|
|
(dists[dists.len() / 2 - 1] + dists[dists.len() / 2]) / 2.0
|
|
} else {
|
|
dists[dists.len() / 2]
|
|
};
|
|
(comm, median.max(1e-6))
|
|
})
|
|
.collect();
|
|
|
|
let mut positions: Vec<SpectralPosition> = node_dists.into_iter()
|
|
.map(|(key, comm, dist_to_center)| {
|
|
let coords = &emb.coords[&key];
|
|
|
|
let (nearest_community, dist_to_nearest) = centers.iter()
|
|
.filter(|&(&c, _)| c != comm)
|
|
.map(|(&c, center)| (c, weighted_distance(coords, center, &weights)))
|
|
.min_by(|a, b| a.1.total_cmp(&b.1))
|
|
.unwrap_or((comm, f64::MAX));
|
|
|
|
let median = medians.get(&comm).copied().unwrap_or(1.0);
|
|
let outlier_score = dist_to_center / median;
|
|
let bridge_score = if dist_to_nearest > 1e-8 {
|
|
dist_to_center / dist_to_nearest
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
SpectralPosition {
|
|
key, community: comm,
|
|
dist_to_center, dist_to_nearest, nearest_community,
|
|
outlier_score, bridge_score,
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
positions.sort_by(|a, b| b.outlier_score.total_cmp(&a.outlier_score));
|
|
positions
|
|
}
|
|
|
|
/// Classify a spectral position: well-integrated, outlier, bridge, or orphan.
|
|
pub fn classify_position(pos: &SpectralPosition) -> &'static str {
|
|
if pos.bridge_score > 0.7 {
|
|
"bridge" // between two communities
|
|
} else if pos.outlier_score > 2.0 {
|
|
"outlier" // far from own community center
|
|
} else if pos.outlier_score < 0.5 {
|
|
"core" // close to community center
|
|
} else {
|
|
"peripheral" // normal community member
|
|
}
|
|
}
|
|
|