search: add confluence, geodesic, and manifold algorithms

Three new composable search stages:

  confluence — multi-source spreading activation. Unlike spread (which
  takes max from any source), confluence rewards nodes reachable from
  multiple seeds additively. Naturally separates unrelated seed groups
  since their neighborhoods don't overlap. Params: max_hops, edge_decay,
  min_sources.

  geodesic — straightest path between seed pairs in spectral space.
  At each graph hop, picks the neighbor whose spectral direction most
  aligns with the target (cosine similarity of direction vectors).
  Nodes on many geodesic paths score highest. Params: max_path, k.

  manifold — extrapolation along the direction seeds define. Computes
  weighted centroid + principal axis of seeds in spectral space, then
  scores candidates by projection onto that axis (penalized by
  perpendicular distance). Finds what's "further along" rather than
  "nearby." Params: k.

Co-Authored-By: ProofOfConcept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-03-09 01:22:29 -04:00
parent c1664bf76b
commit 63253f102a

View file

@ -37,6 +37,8 @@ pub enum Algorithm {
Spread, Spread,
Spectral, Spectral,
Manifold, Manifold,
Confluence,
Geodesic,
} }
impl fmt::Display for Algorithm { impl fmt::Display for Algorithm {
@ -45,6 +47,8 @@ impl fmt::Display for Algorithm {
Algorithm::Spread => write!(f, "spread"), Algorithm::Spread => write!(f, "spread"),
Algorithm::Spectral => write!(f, "spectral"), Algorithm::Spectral => write!(f, "spectral"),
Algorithm::Manifold => write!(f, "manifold"), Algorithm::Manifold => write!(f, "manifold"),
Algorithm::Confluence => write!(f, "confluence"),
Algorithm::Geodesic => write!(f, "geodesic"),
} }
} }
} }
@ -58,6 +62,8 @@ impl AlgoStage {
"spread" => Algorithm::Spread, "spread" => Algorithm::Spread,
"spectral" => Algorithm::Spectral, "spectral" => Algorithm::Spectral,
"manifold" => Algorithm::Manifold, "manifold" => Algorithm::Manifold,
"confluence" => Algorithm::Confluence,
"geodesic" => Algorithm::Geodesic,
_ => return Err(format!("unknown algorithm: {}", name)), _ => return Err(format!("unknown algorithm: {}", name)),
}; };
let mut params = HashMap::new(); let mut params = HashMap::new();
@ -136,10 +142,9 @@ pub fn run_pipeline(
current = match stage.algo { current = match stage.algo {
Algorithm::Spread => run_spread(&current, graph, store, stage, debug), Algorithm::Spread => run_spread(&current, graph, store, stage, debug),
Algorithm::Spectral => run_spectral(&current, graph, stage, debug), Algorithm::Spectral => run_spectral(&current, graph, stage, debug),
Algorithm::Manifold => { Algorithm::Manifold => run_manifold(&current, graph, stage, debug),
if debug { println!(" (manifold not yet implemented, passing through)"); } Algorithm::Confluence => run_confluence(&current, graph, store, stage, debug),
current Algorithm::Geodesic => run_geodesic(&current, graph, stage, debug),
}
}; };
if debug { if debug {
@ -222,6 +227,397 @@ fn run_spectral(
result result
} }
/// Confluence: multi-source reachability scoring.
///
/// Unlike spreading activation (which takes the max activation from any
/// source), confluence rewards nodes reachable from *multiple* seeds. Every
/// seed is expanded on its own, breadth-first, for at most `max_hops` hops.
/// Crossing an edge multiplies the activation by
/// `edge_decay * node_weight(neighbor) * edge_strength`; branches whose
/// activation drops below 0.001 are pruned. Within one seed's expansion a node
/// keeps the strongest activation that reached it, and a node's final score is
/// the sum of those per-seed activations. Nodes at the intersection of several
/// seeds' neighborhoods therefore score highest.
///
/// This naturally handles mixed seeds: unrelated seeds activate disjoint
/// neighborhoods that don't overlap, so their results separate naturally.
///
/// Tunable params: max_hops (default 3), edge_decay (default 0.5),
/// min_sources (default 2, minimum number of distinct seeds that must reach a
/// node for it to be kept).
///
/// Returns the surviving `(node, score)` pairs sorted by descending score,
/// with equal scores ordered by key so the output is reproducible. A seed
/// counts as a source for itself, so seeds appear in the output only when
/// enough seeds (itself included) reach them.
fn run_confluence(
    seeds: &[(String, f64)],
    graph: &Graph,
    store: &impl StoreView,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let max_hops = stage.param_u32("max_hops", 3);
    let edge_decay = stage.param_f64("edge_decay", 0.5);
    let min_sources = stage.param_usize("min_sources", 2);

    // Summed activation per node, and the set of seed indices that reached it.
    let mut node_scores: HashMap<String, f64> = HashMap::new();
    let mut node_sources: HashMap<String, HashSet<usize>> = HashMap::new();

    for (seed_idx, (seed_key, seed_weight)) in seeds.iter().enumerate() {
        // Best activation this particular seed delivered to each node.
        let mut visited: HashMap<String, f64> = HashMap::new();
        let mut queue: VecDeque<(String, u32)> = VecDeque::new();

        visited.insert(seed_key.clone(), *seed_weight);
        queue.push_back((seed_key.clone(), 0));

        while let Some((key, depth)) = queue.pop_front() {
            if depth >= max_hops { continue; }

            // Read the activation at pop time: it may have been raised since
            // this entry was queued.
            let act = visited[&key];

            for (neighbor, strength) in graph.neighbors(&key) {
                let neighbor_weight = store.node_weight(neighbor.as_str());
                let propagated = act * edge_decay * neighbor_weight * strength as f64;
                if propagated < 0.001 { continue; }

                // Single lookup: (re)visit only on first contact or when this
                // path delivers strictly more activation than the best so far.
                let improved = visited
                    .get(neighbor.as_str())
                    .map_or(true, |&prev| prev < propagated);
                if improved {
                    visited.insert(neighbor.clone(), propagated);
                    queue.push_back((neighbor.clone(), depth + 1));
                }
            }
        }

        // Accumulate into global scores (additive across seeds).
        for (key, act) in visited {
            *node_scores.entry(key.clone()).or_insert(0.0) += act;
            node_sources.entry(key).or_default().insert(seed_idx);
        }
    }

    // Keep only nodes reached by at least `min_sources` distinct seeds.
    let mut results: Vec<(String, f64)> = node_scores.into_iter()
        .filter(|(key, _)| {
            node_sources.get(key).map(|s| s.len()).unwrap_or(0) >= min_sources
        })
        .collect();

    // Sort before the debug listing so that the listing really shows the
    // top-scoring nodes rather than 15 arbitrary hash-map entries.
    results.sort_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));

    if debug {
        // Show source counts for the best 15 results.
        for (key, score) in results.iter().take(15) {
            let sources = node_sources.get(key).map(|s| s.len()).unwrap_or(0);
            println!(" [{:.4}] {} (from {} seeds)", score, key, sources);
        }
    }

    results
}
/// Geodesic: straightest paths between seed pairs in spectral space.
///
/// Every unordered pair of seeds that has usable spectral coordinates (at
/// least one component with magnitude above 1e-12) is connected by two greedy
/// walks, one in each direction (see `geodesic_walk`). Each non-seed node
/// visited by a walk gains `weight_a * weight_b * alignment`, so nodes lying
/// on many well-aligned paths accumulate the largest totals.
///
/// The best `k` path nodes are appended to the original seeds and the
/// combined list is returned sorted by descending score. When the embedding
/// cannot be loaded, or fewer than two seeds have usable coordinates, the
/// seeds are returned unchanged.
///
/// Tunable params: max_path (default 6), k (default 20 results).
fn run_geodesic(
    seeds: &[(String, f64)],
    graph: &Graph,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let max_path = stage.param_usize("max_path", 6);
    let k = stage.param_usize("k", 20);

    let emb = match spectral::load_embedding() {
        Ok(loaded) => loaded,
        Err(err) => {
            if debug { println!(" no spectral embedding: {}", err); }
            return seeds.to_vec();
        }
    };

    // Seeds that actually have a non-zero position in the embedding.
    let mut usable: Vec<(&str, f64, &Vec<f64>)> = Vec::new();
    for (key, weight) in seeds {
        let coords = emb.coords.get(key.as_str())
            .filter(|c| c.iter().any(|&v| v.abs() > 1e-12));
        if let Some(coords) = coords {
            usable.push((key.as_str(), *weight, coords));
        }
    }

    if usable.len() < 2 {
        if debug { println!(" need ≥2 seeds with spectral coords, have {}", usable.len()); }
        return seeds.to_vec();
    }

    let seed_keys: HashSet<&str> = seeds.iter().map(|(name, _)| name.as_str()).collect();

    // Accumulated, alignment-weighted path membership per non-seed node.
    let mut tally: HashMap<String, f64> = HashMap::new();
    for (idx, &(key_a, weight_a, coords_a)) in usable.iter().enumerate() {
        for &(key_b, weight_b, coords_b) in &usable[idx + 1..] {
            let pair_weight = weight_a * weight_b;

            // Walk both ways; a node found from both directions is credited twice.
            let forward = geodesic_walk(key_a, coords_a, coords_b, graph, &emb, max_path);
            let backward = geodesic_walk(key_b, coords_b, coords_a, graph, &emb, max_path);

            for (node, alignment) in forward.into_iter().chain(backward) {
                if seed_keys.contains(node.as_str()) { continue; }
                *tally.entry(node).or_insert(0.0) += pair_weight * alignment;
            }
        }
    }

    if debug && !tally.is_empty() {
        println!(" {} pairs examined, {} distinct nodes on paths",
            usable.len() * (usable.len() - 1) / 2,
            tally.len());
    }

    // Keep the k strongest path nodes.
    let mut ranked: Vec<(String, f64)> = tally.into_iter().collect();
    ranked.sort_by(|x, y| y.1.total_cmp(&x.1));
    ranked.truncate(k);

    // Merge with the original seeds and order everything by score.
    let mut results = seeds.to_vec();
    results.extend(
        ranked.into_iter().filter(|(key, _)| !seed_keys.contains(key.as_str())),
    );
    results.sort_by(|x, y| y.1.total_cmp(&x.1));
    results
}
/// Walk from `start` toward `target_coords` in spectral space, choosing
/// the neighbor at each step whose direction most aligns with the target.
/// Returns (node_key, alignment_score) for each intermediate node.
fn geodesic_walk(
start: &str,
start_coords: &[f64],
target_coords: &[f64],
graph: &Graph,
emb: &spectral::SpectralEmbedding,
max_steps: usize,
) -> Vec<(String, f64)> {
let mut path = Vec::new();
let mut current = start.to_string();
let mut current_coords = start_coords.to_vec();
let mut visited: HashSet<String> = HashSet::new();
visited.insert(current.clone());
for _ in 0..max_steps {
// Direction we want to travel: from current toward target
let direction: Vec<f64> = target_coords.iter()
.zip(current_coords.iter())
.map(|(t, c)| t - c)
.collect();
let dir_norm = direction.iter().map(|d| d * d).sum::<f64>().sqrt();
if dir_norm < 1e-12 { break; } // arrived
// Among neighbors with spectral coords, find the one most aligned
let mut best: Option<(String, Vec<f64>, f64)> = None;
for (neighbor, _strength) in graph.neighbors(&current) {
if visited.contains(neighbor.as_str()) { continue; }
let neighbor_coords = match emb.coords.get(neighbor.as_str()) {
Some(c) if c.iter().any(|&v| v.abs() > 1e-12) => c,
_ => continue,
};
// Direction to this neighbor
let step: Vec<f64> = neighbor_coords.iter()
.zip(current_coords.iter())
.map(|(n, c)| n - c)
.collect();
let step_norm = step.iter().map(|s| s * s).sum::<f64>().sqrt();
if step_norm < 1e-12 { continue; }
// Cosine similarity between desired direction and step direction
let dot: f64 = direction.iter().zip(step.iter()).map(|(d, s)| d * s).sum();
let alignment = dot / (dir_norm * step_norm);
if alignment > 0.0 { // only consider forward-facing neighbors
if best.as_ref().map(|(_, _, a)| alignment > *a).unwrap_or(true) {
best = Some((neighbor.clone(), neighbor_coords.clone(), alignment));
}
}
}
match best {
Some((next_key, next_coords, alignment)) => {
path.push((next_key.clone(), alignment));
visited.insert(next_key.clone());
current = next_key;
current_coords = next_coords;
}
None => break, // no forward-facing neighbors
}
}
path
}
/// Manifold: extrapolation along the direction defined by seeds.
///
/// Rather than ranking nodes by how *close* they sit to the seeds in spectral
/// space, this ranks them by how far they lie *along* the axis the seeds span.
/// The axis is estimated from the weighted centroid of the seeds and one
/// power-iteration step on their weighted scatter, started from the vector
/// between the first and last usable seed. A candidate's score is the absolute
/// projection of its offset from the centroid onto that axis, divided by
/// `1 + perpendicular distance` from the axis. When no axis exists (a single
/// usable seed, or a degenerate direction) the score falls back to
/// `1 / (1 + distance from centroid)`. Each graph edge to a seed adds a bonus
/// of `0.1 * edge strength`.
///
/// The best `k` non-seed candidates are appended to the original seeds and the
/// combined list is returned sorted by descending score. When the embedding
/// cannot be loaded or no seed has usable coordinates, the seeds are returned
/// unchanged.
///
/// Tunable params: k (default 20 results).
fn run_manifold(
    seeds: &[(String, f64)],
    graph: &Graph,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let k = stage.param_usize("k", 20);

    let emb = match spectral::load_embedding() {
        Ok(loaded) => loaded,
        Err(err) => {
            if debug { println!(" no spectral embedding: {}", err); }
            return seeds.to_vec();
        }
    };

    // Seeds that have a non-zero position in the embedding.
    let mut seed_data: Vec<(&str, f64, &Vec<f64>)> = Vec::new();
    for (key, weight) in seeds {
        let coords = emb.coords.get(key.as_str())
            .filter(|c| c.iter().any(|&v| v.abs() > 1e-12));
        if let Some(coords) = coords {
            seed_data.push((key.as_str(), *weight, coords));
        }
    }

    if seed_data.is_empty() {
        if debug { println!(" no seeds with spectral coords"); }
        return seeds.to_vec();
    }

    let dims = emb.dims;

    // Weighted centroid of the usable seeds.
    let mut centroid = vec![0.0f64; dims];
    let mut weight_sum = 0.0;
    for &(_, weight, coords) in &seed_data {
        for (axis, &value) in coords.iter().enumerate() {
            centroid[axis] += value * weight;
        }
        weight_sum += weight;
    }
    if weight_sum > 0.0 {
        centroid.iter_mut().for_each(|c| *c /= weight_sum);
    }

    // Principal direction of the seed spread. With fewer than two usable
    // seeds there is no spread, and the direction stays all-zero.
    let direction: Vec<f64> = if seed_data.len() >= 2 {
        // Starting guess: the vector from the first usable seed to the last.
        let first = seed_data[0].2;
        let last = seed_data[seed_data.len() - 1].2;
        let initial: Vec<f64> = (0..dims).map(|i| last[i] - first[i]).collect();

        // One round of power iteration on the weighted covariance of the seeds.
        let mut refined = vec![0.0f64; dims];
        for &(_, weight, coords) in &seed_data {
            let dev: Vec<f64> = coords.iter().zip(centroid.iter())
                .map(|(c, m)| c - m)
                .collect();
            let dot: f64 = dev.iter().zip(initial.iter()).map(|(d, v)| d * v).sum();
            for i in 0..dims {
                refined[i] += weight * dot * dev[i];
            }
        }
        refined
    } else {
        vec![0.0f64; dims]
    };
    let dir_norm = direction.iter().map(|d| d * d).sum::<f64>().sqrt();

    let seed_set: HashSet<&str> = seeds.iter().map(|(name, _)| name.as_str()).collect();

    // Spectral part of the score for one candidate, given its offset from the centroid.
    let spectral_score = |deviation: &[f64]| -> f64 {
        if dir_norm > 1e-12 {
            // Signed length of the offset along the principal axis.
            let projection: f64 = deviation.iter().zip(direction.iter())
                .map(|(d, v)| d * v)
                .sum::<f64>() / dir_norm;
            // Distance from the axis: the offset minus its on-axis component.
            let perp_dist: f64 = deviation.iter().zip(direction.iter())
                .map(|(d, &v)| (d - v * projection / dir_norm).powi(2))
                .sum::<f64>()
                .sqrt();
            // Both directions from the centroid are interesting, hence abs().
            let along = projection.abs();
            if perp_dist < 1e-12 {
                along
            } else {
                along / (1.0 + perp_dist)
            }
        } else {
            // No direction (single seed or all seeds coincide): use distance from centroid.
            let dist: f64 = deviation.iter().map(|d| d * d).sum::<f64>().sqrt();
            1.0 / (1.0 + dist)
        }
    };

    // Score every non-seed node that has usable coordinates.
    let mut candidates: Vec<(String, f64)> = Vec::new();
    for (key, coords) in emb.coords.iter() {
        if seed_set.contains(key.as_str()) { continue; }
        if !coords.iter().any(|&v| v.abs() > 1e-12) { continue; }

        let deviation: Vec<f64> = coords.iter().zip(centroid.iter())
            .map(|(c, m)| c - m)
            .collect();
        let score = spectral_score(&deviation);

        // Bonus for being directly connected to seeds in the graph.
        let graph_bonus: f64 = graph.neighbors(key).iter()
            .filter(|(other, _)| seed_set.contains(other.as_str()))
            .map(|(_, strength)| *strength as f64 * 0.1)
            .sum();

        candidates.push((key.clone(), score + graph_bonus));
    }

    candidates.sort_by(|x, y| y.1.total_cmp(&x.1));
    candidates.truncate(k);

    if debug {
        for (key, score) in candidates.iter().take(15) {
            println!(" [{:.4}] {}", score, key);
        }
    }

    // Merge with the original seeds and order everything by score.
    let mut results = seeds.to_vec();
    results.extend(candidates);
    results.sort_by(|x, y| y.1.total_cmp(&x.1));
    results
}
fn spreading_activation( fn spreading_activation(
seeds: &[(String, f64)], seeds: &[(String, f64)],
graph: &Graph, graph: &Graph,