consciousness/src/hippocampus/query/engine.rs
ProofOfConcept d5c0e86700 restructure: hippocampus/ for memory, subconscious/ for agents
hippocampus/ — memory storage, retrieval, and consolidation:
  store, graph, query, similarity, spectral, neuro, counters,
  config, transcript, memory_search, lookups, cursor, migrate

subconscious/ — autonomous agents that process without being asked:
  reflect, surface, consolidate, digest, audit, etc.

All existing crate::X paths preserved via re-exports in lib.rs.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2026-03-25 01:05:30 -04:00

1536 lines
55 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Memory search: composable query pipeline.
//
// The pipeline has four kinds of stages, all composing left-to-right:
//
// Generators — produce a result set from nothing:
// all every non-deleted node
// match:TERM text match (current seed extraction)
//
// Filters — narrow an existing result set on node metadata:
// type:episodic node_type == EpisodicSession
// type:semantic node_type == Semantic
// type:daily node_type == EpisodicDaily
// type:weekly node_type == EpisodicWeekly
// type:monthly node_type == EpisodicMonthly
// key:GLOB glob match on key
// weight:>0.5 numeric comparison on weight
// age:<7d created/modified within duration
// content-len:>1000 content size filter
// provenance:manual provenance match
// not-visited:AGENT,DUR not seen by agent in duration
// visited:AGENT has been seen by agent
//
// Transforms — reorder or reshape:
// sort:priority consolidation priority scoring
// sort:timestamp by timestamp (desc)
// sort:content-len by content size
// sort:degree by graph degree
// sort:weight by weight
// limit:N truncate to N results
//
// Algorithms — graph exploration (existing):
// spread spreading activation
// spectral,k=20 spectral nearest neighbors
// confluence multi-source reachability
// geodesic straightest spectral paths
// manifold extrapolation along seed direction
//
// Stages are parsed from strings and composed via the -p flag or
// pipe-separated in agent definitions.
use crate::store::{Store, StoreView, NodeType};
use crate::graph::Graph;
use crate::spectral;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt;
/// One scored hit returned to callers of the search pipeline.
///
/// Derives added for parity with the other public types in this module
/// (`AlgoStage`, `Stage`, …), so results can be logged and copied.
#[derive(Clone, Debug)]
pub struct SearchResult {
    /// Key of the matched node.
    pub key: String,
    /// Final activation/score for this node.
    pub activation: f64,
    /// True when the node was matched directly from the query terms,
    /// as opposed to being discovered by a graph algorithm stage.
    pub is_direct: bool,
    /// Optional content snippet for display.
    pub snippet: Option<String>,
}
/// A parsed algorithm stage with its parameters.
///
/// Textual form: the algorithm name first, then comma-separated
/// `key=val` parameters, e.g. `"spread,max_hops=4,edge_decay=0.5"`.
#[derive(Clone, Debug)]
pub struct AlgoStage {
    pub algo: Algorithm,
    pub params: HashMap<String, String>,
}

/// The graph-exploration algorithms a pipeline stage can run.
#[derive(Clone, Debug)]
pub enum Algorithm {
    Spread,
    Spectral,
    Manifold,
    Confluence,
    Geodesic,
}

impl fmt::Display for Algorithm {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Lowercase names match exactly what `AlgoStage::parse` accepts.
        let name = match self {
            Algorithm::Spread => "spread",
            Algorithm::Spectral => "spectral",
            Algorithm::Manifold => "manifold",
            Algorithm::Confluence => "confluence",
            Algorithm::Geodesic => "geodesic",
        };
        write!(f, "{}", name)
    }
}

impl AlgoStage {
    /// Parse "spread,max_hops=4,edge_decay=0.5" into an AlgoStage.
    ///
    /// Errors on an unknown algorithm name or any parameter that is not
    /// of the form `key=val`.
    pub fn parse(s: &str) -> Result<Self, String> {
        let mut parts = s.split(',');
        let name = parts.next().unwrap_or("");
        let algo = match name {
            "spread" => Algorithm::Spread,
            "spectral" => Algorithm::Spectral,
            "manifold" => Algorithm::Manifold,
            "confluence" => Algorithm::Confluence,
            "geodesic" => Algorithm::Geodesic,
            _ => return Err(format!("unknown algorithm: {}", name)),
        };
        let mut params = HashMap::new();
        for part in parts {
            if let Some((k, v)) = part.split_once('=') {
                params.insert(k.to_string(), v.to_string());
            } else {
                return Err(format!("bad param (expected key=val): {}", part));
            }
        }
        Ok(AlgoStage { algo, params })
    }

    /// Shared parameter lookup: parse `params[key]` as `T`, falling back
    /// to `default` when the key is absent or fails to parse.
    fn param<T: std::str::FromStr>(&self, key: &str, default: T) -> T {
        self.params
            .get(key)
            .and_then(|v| v.parse().ok())
            .unwrap_or(default)
    }

    fn param_f64(&self, key: &str, default: f64) -> f64 {
        self.param(key, default)
    }

    fn param_u32(&self, key: &str, default: u32) -> u32 {
        self.param(key, default)
    }

    fn param_usize(&self, key: &str, default: usize) -> usize {
        self.param(key, default)
    }
}
// ── Unified query pipeline ──────────────────────────────────────────

/// A pipeline stage: generator, filter, transform, or graph algorithm.
#[derive(Clone, Debug)]
pub enum Stage {
    /// Produces a fresh result set from nothing (ignores upstream results).
    Generator(Generator),
    /// Keeps or drops individual results based on node metadata.
    Filter(Filter),
    /// Reorders, truncates, or reshapes the current result set.
    Transform(Transform),
    /// Graph exploration (spread, spectral, confluence, geodesic, manifold).
    Algorithm(AlgoStage),
}
/// Stages that produce a result set from scratch.
#[derive(Clone, Debug)]
pub enum Generator {
    All,                // every non-deleted node
    Match(Vec<String>), // text match seeds (terms are lowercased before matching)
}
/// Predicates that keep or drop individual result nodes.
/// A node missing from the store fails every filter.
#[derive(Clone, Debug)]
pub enum Filter {
    /// node_type equality; see `parse_node_type` for the accepted labels.
    Type(NodeType),
    /// Glob match (`*`/`?`) on the node key.
    KeyGlob(String),
    /// Numeric comparison on the node weight.
    Weight(Cmp),
    Age(Cmp), // vs now - timestamp (seconds)
    /// Comparison on content length in bytes.
    ContentLen(Cmp),
    /// Exact provenance string match.
    Provenance(String),
    NotVisited { agent: String, duration: i64 }, // seconds
    /// Node has been seen by the named agent at least once.
    Visited { agent: String },
    /// Logical NOT of the wrapped filter (spelled with a `!` prefix).
    Negated(Box<Filter>),
}
/// Stages that reorder or reshape the current result set.
#[derive(Clone, Debug)]
pub enum Transform {
    Sort(SortField),
    /// Truncate to at most N results.
    Limit(usize),
    /// Greedy 3-covering dominating set over the induced subgraph.
    DominatingSet,
}
/// Keys for `Transform::Sort`; all orders are descending.
#[derive(Clone, Debug)]
pub enum SortField {
    Priority,   // consolidation priority score
    Timestamp,  // newest first
    ContentLen, // largest content first
    Degree,     // highest graph degree first
    Weight,     // the score carried in the result tuple, descending
    Isolation,  // most isolated community first
    /// Weighted sum of `ScoreField` terms, highest first.
    Composite(Vec<(ScoreField, f64)>),
}
/// Individual scoring dimensions for composite sorts.
/// Each computes a 0.0-1.0 score per node.
#[derive(Clone, Debug)]
pub enum ScoreField {
    /// Isolation of the node's community (higher = more isolated).
    Isolation,
    /// Graph degree, normalized by the max degree in the result set.
    Degree,
    /// Raw node weight — not normalized here; assumed already 0-1 (TODO confirm).
    Weight,
    /// Content length, normalized by the max length in the result set.
    ContentLen,
    /// Consolidation priority, clamped to 1.0.
    Priority,
    /// Time since last visit by named agent. 1.0 = never visited, decays toward 0.
    Recency(String),
}
/// Numeric comparison operator.
#[derive(Clone, Debug)]
pub enum Cmp {
    Gt(f64),
    Gte(f64),
    Lt(f64),
    Lte(f64),
    Eq(f64),
}

impl Cmp {
    /// True when `val` satisfies this comparison. Equality is tested
    /// within `f64::EPSILON` to tolerate floating-point rounding.
    fn matches(&self, val: f64) -> bool {
        match *self {
            Cmp::Gt(threshold) => val > threshold,
            Cmp::Gte(threshold) => val >= threshold,
            Cmp::Lt(threshold) => val < threshold,
            Cmp::Lte(threshold) => val <= threshold,
            Cmp::Eq(target) => (val - target).abs() < f64::EPSILON,
        }
    }
}
/// Parse a comparison like ">0.5", ">=60", "<7d" (durations converted to seconds).
///
/// Two-character operators are tried before their one-character prefixes
/// so ">=60" is not misread as ">" followed by "=60". Uses `strip_prefix`
/// instead of manual offset slicing.
fn parse_cmp(s: &str) -> Result<Cmp, String> {
    let (rest, ctor): (&str, fn(f64) -> Cmp) = if let Some(r) = s.strip_prefix(">=") {
        (r, Cmp::Gte)
    } else if let Some(r) = s.strip_prefix("<=") {
        (r, Cmp::Lte)
    } else if let Some(r) = s.strip_prefix('>') {
        (r, Cmp::Gt)
    } else if let Some(r) = s.strip_prefix('<') {
        (r, Cmp::Lt)
    } else if let Some(r) = s.strip_prefix('=') {
        (r, Cmp::Eq)
    } else {
        return Err(format!("expected comparison operator in '{}'", s));
    };
    let val = parse_duration_or_number(rest)?;
    Ok(ctor(val))
}
/// Parse "7d", "24h", "30m" as seconds, or plain numbers.
///
/// Suffixes: `d` = days, `h` = hours, `m` = minutes. Anything else is
/// parsed as a bare f64. Errors carry the offending numeric text.
fn parse_duration_or_number(s: &str) -> Result<f64, String> {
    // Each supported suffix with its seconds-per-unit multiplier.
    const UNITS: [(char, f64); 3] = [('d', 86400.0), ('h', 3600.0), ('m', 60.0)];
    for (suffix, multiplier) in UNITS {
        if let Some(digits) = s.strip_suffix(suffix) {
            let value: f64 = digits.parse().map_err(|_| format!("bad number: {}", digits))?;
            return Ok(value * multiplier);
        }
    }
    s.parse().map_err(|_| format!("bad number: {}", s))
}
/// Parse composite sort: "isolation*0.7+recency(linker)*0.3"
/// Each term is field or field(arg), optionally *weight (default 1.0).
fn parse_composite_sort(s: &str) -> Result<Vec<(ScoreField, f64)>, String> {
    // Parse one "+"-separated term into (field, weight).
    let parse_term = |raw: &str| -> Result<(ScoreField, f64), String> {
        let raw = raw.trim();
        // Split off the optional "*weight" suffix (rsplit so the weight is
        // always the last "*"-delimited piece).
        let (spec, weight) = match raw.rsplit_once('*') {
            Some((spec, w)) => (spec, w.parse::<f64>().map_err(|_| format!("bad weight: {}", w))?),
            None => (raw, 1.0),
        };
        // A "(" marks a parameterized field like recency(agent).
        let field = match spec.split_once('(') {
            Some((name, rest)) => {
                let arg = rest.strip_suffix(')').ok_or("missing ) in sort field")?;
                match name {
                    "recency" => ScoreField::Recency(arg.to_string()),
                    _ => return Err(format!("unknown parameterized sort field: {}", name)),
                }
            }
            None => match spec {
                "isolation" => ScoreField::Isolation,
                "degree" => ScoreField::Degree,
                "weight" => ScoreField::Weight,
                "content-len" => ScoreField::ContentLen,
                "priority" => ScoreField::Priority,
                _ => return Err(format!("unknown sort field: {}", spec)),
            },
        };
        Ok((field, weight))
    };
    let terms = s.split('+').map(parse_term).collect::<Result<Vec<_>, _>>()?;
    if terms.is_empty() {
        return Err("empty composite sort".into());
    }
    Ok(terms)
}
/// Compute a 0-1 score for a node on a single dimension.
///
/// `precomputed` carries per-result-set maxima and per-community isolation
/// so scoring stays cheap when called once per node inside a sort.
fn score_field(
    field: &ScoreField,
    key: &str,
    store: &Store,
    graph: &Graph,
    precomputed: &CompositeCache,
) -> f64 {
    match field {
        ScoreField::Isolation => {
            // Nodes without a community fall back to community 0; unknown
            // communities score as fully isolated (1.0).
            let comm = graph.communities().get(key).copied().unwrap_or(0);
            precomputed.isolation.get(&comm).copied().unwrap_or(1.0) as f64
        }
        ScoreField::Degree => {
            // Normalize by the result set's max degree; min(1.0) guards
            // against rounding past the cap.
            let d = graph.degree(key) as f64;
            let max = precomputed.max_degree.max(1.0);
            (d / max).min(1.0)
        }
        ScoreField::Weight => {
            // Raw weight, not normalized — assumed already 0-1 (TODO confirm).
            store.nodes.get(key).map(|n| n.weight as f64).unwrap_or(0.0)
        }
        ScoreField::ContentLen => {
            let len = store.nodes.get(key).map(|n| n.content.len()).unwrap_or(0) as f64;
            let max = precomputed.max_content_len.max(1.0);
            (len / max).min(1.0)
        }
        ScoreField::Priority => {
            let p = crate::neuro::consolidation_priority(store, key, graph, None);
            // Priority is already roughly 0-1 from the scoring function
            p.min(1.0)
        }
        ScoreField::Recency(agent) => {
            // `last == 0` is the store's "never visited" sentinel.
            let last = store.last_visited(key, agent);
            if last == 0 {
                1.0 // never visited = highest recency score
            } else {
                let age = (crate::store::now_epoch() - last) as f64;
                // Exponential saturation toward 1.0 with time since last
                // visit: ~0.03 at 1 hour, ~0.5 at 1 day, ~0.99 at 7 days.
                let hours = age / 3600.0;
                1.0 - (-0.03 * hours).exp()
            }
        }
    }
}
/// Cached values for composite scoring (computed once per sort).
struct CompositeCache {
    /// Per-community isolation score, keyed by community id.
    isolation: HashMap<u32, f32>,
    /// Largest graph degree among the sorted items (normalization base).
    max_degree: f64,
    /// Largest content length among the sorted items (normalization base).
    max_content_len: f64,
}
impl CompositeCache {
fn build(items: &[(String, f64)], store: &Store, graph: &Graph) -> Self {
let max_degree = items.iter()
.map(|(k, _)| graph.degree(k) as f64)
.fold(0.0f64, f64::max);
let max_content_len = items.iter()
.map(|(k, _)| store.nodes.get(k).map(|n| n.content.len()).unwrap_or(0) as f64)
.fold(0.0f64, f64::max);
Self {
isolation: graph.community_isolation(),
max_degree,
max_content_len,
}
}
}
/// Parse a NodeType from a label.
///
/// "episodic" and "session" are synonyms for the per-session type; the
/// remaining labels map one-to-one onto the episodic rollups and semantic
/// nodes.
fn parse_node_type(s: &str) -> Result<NodeType, String> {
    let ty = match s {
        "episodic" | "session" => Some(NodeType::EpisodicSession),
        "daily" => Some(NodeType::EpisodicDaily),
        "weekly" => Some(NodeType::EpisodicWeekly),
        "monthly" => Some(NodeType::EpisodicMonthly),
        "semantic" => Some(NodeType::Semantic),
        _ => None,
    };
    ty.ok_or_else(|| {
        format!("unknown node type: {} (use: episodic, semantic, daily, weekly, monthly)", s)
    })
}
impl Stage {
    /// Parse a single stage from a string.
    ///
    /// Algorithm names are tried first (bare words), then predicate syntax
    /// (contains ':'). No ambiguity since algorithms are bare words.
    pub fn parse(s: &str) -> Result<Self, String> {
        let s = s.trim();
        // A leading '!' negates the stage — only valid on filters,
        // enforced at the end once we know what kind of stage this is.
        let (negated, s) = if let Some(rest) = s.strip_prefix('!') {
            (true, rest)
        } else {
            (false, s)
        };
        // Generator: "all"
        if s == "all" {
            return Ok(Stage::Generator(Generator::All));
        }
        // Transform: "dominating-set"
        if s == "dominating-set" {
            return Ok(Stage::Transform(Transform::DominatingSet));
        }
        // Try algorithm parse first (bare words, no colon)
        if !s.contains(':')
            && let Ok(algo) = AlgoStage::parse(s) {
            return Ok(Stage::Algorithm(algo));
        }
        // Algorithm with params: "spread,max_hops=4" (contains comma but no colon)
        // NOTE(review): reaching here means the AlgoStage::parse above
        // already failed, so this re-parse always errors — its purpose is
        // to surface the specific algorithm error message instead of the
        // generic "unknown stage" below.
        if s.contains(',') && !s.contains(':') {
            return AlgoStage::parse(s).map(Stage::Algorithm);
        }
        // Predicate/transform syntax: "key:value"
        let (prefix, value) = s.split_once(':')
            .ok_or_else(|| format!("unknown stage: {}", s))?;
        let filter_or_transform = match prefix {
            "type" => Stage::Filter(Filter::Type(parse_node_type(value)?)),
            "key" => Stage::Filter(Filter::KeyGlob(value.to_string())),
            "weight" => Stage::Filter(Filter::Weight(parse_cmp(value)?)),
            "age" => Stage::Filter(Filter::Age(parse_cmp(value)?)),
            "content-len" => Stage::Filter(Filter::ContentLen(parse_cmp(value)?)),
            "provenance" => {
                Stage::Filter(Filter::Provenance(value.to_string()))
            }
            "not-visited" => {
                // Value is "AGENT,DURATION", e.g. "linker,7d".
                let (agent, dur) = value.split_once(',')
                    .ok_or("not-visited:AGENT,DURATION")?;
                let secs = parse_duration_or_number(dur)?;
                Stage::Filter(Filter::NotVisited {
                    agent: agent.to_string(),
                    duration: secs as i64,
                })
            }
            "visited" => Stage::Filter(Filter::Visited {
                agent: value.to_string(),
            }),
            "sort" => {
                // Check for composite sort: field*weight+field*weight+...
                let field = if value.contains('+') || value.contains('*') {
                    SortField::Composite(parse_composite_sort(value)?)
                } else {
                    match value {
                        "priority" => SortField::Priority,
                        "timestamp" => SortField::Timestamp,
                        "content-len" => SortField::ContentLen,
                        "degree" => SortField::Degree,
                        "weight" => SortField::Weight,
                        "isolation" => SortField::Isolation,
                        _ => return Err(format!("unknown sort field: {}", value)),
                    }
                };
                Stage::Transform(Transform::Sort(field))
            }
            "limit" => {
                let n: usize = value.parse()
                    .map_err(|_| format!("bad limit: {}", value))?;
                Stage::Transform(Transform::Limit(n))
            }
            "match" => {
                let terms: Vec<String> = value.split(',')
                    .map(|t| t.to_string())
                    .collect();
                Stage::Generator(Generator::Match(terms))
            }
            // Algorithm with colon in params? Try fallback.
            _ => return AlgoStage::parse(s).map(Stage::Algorithm)
                .map_err(|_| format!("unknown stage: {}", s)),
        };
        // Apply negation to filters
        if negated {
            match filter_or_transform {
                Stage::Filter(f) => Ok(Stage::Filter(Filter::Negated(Box::new(f)))),
                _ => Err("! prefix only works on filter stages".to_string()),
            }
        } else {
            Ok(filter_or_transform)
        }
    }

    /// Parse a pipe-separated pipeline string.
    /// Fails on the first stage that does not parse.
    pub fn parse_pipeline(s: &str) -> Result<Vec<Stage>, String> {
        s.split('|')
            .map(|part| Stage::parse(part.trim()))
            .collect()
    }
}
impl fmt::Display for Stage {
    /// Debug-oriented rendering, used by `run_query`'s stage banners.
    ///
    /// NOTE(review): `sort:{:?}` emits Debug variant names (e.g.
    /// "sort:Priority"), which does not round-trip through `Stage::parse`
    /// (it expects lowercase "sort:priority") — confirm Display is only
    /// used for debug output before relying on it for serialization.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Stage::Generator(Generator::All) => write!(f, "all"),
            Stage::Generator(Generator::Match(terms)) => write!(f, "match:{}", terms.join(",")),
            Stage::Filter(filt) => write!(f, "{}", filt),
            Stage::Transform(Transform::Sort(field)) => write!(f, "sort:{:?}", field),
            Stage::Transform(Transform::Limit(n)) => write!(f, "limit:{}", n),
            Stage::Transform(Transform::DominatingSet) => write!(f, "dominating-set"),
            Stage::Algorithm(a) => write!(f, "{}", a.algo),
        }
    }
}
impl fmt::Display for Filter {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Filter::Type(t) => write!(f, "type:{:?}", t),
Filter::KeyGlob(g) => write!(f, "key:{}", g),
Filter::Weight(c) => write!(f, "weight:{}", c),
Filter::Age(c) => write!(f, "age:{}", c),
Filter::ContentLen(c) => write!(f, "content-len:{}", c),
Filter::Provenance(p) => write!(f, "provenance:{}", p),
Filter::NotVisited { agent, duration } => write!(f, "not-visited:{},{}s", agent, duration),
Filter::Visited { agent } => write!(f, "visited:{}", agent),
Filter::Negated(inner) => write!(f, "!{}", inner),
}
}
}
impl fmt::Display for Cmp {
    /// Renders the operator symbol followed by its threshold value,
    /// matching the syntax `parse_cmp` accepts for plain numbers.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let (op, v) = match self {
            Cmp::Gt(v) => (">", v),
            Cmp::Gte(v) => (">=", v),
            Cmp::Lt(v) => ("<", v),
            Cmp::Lte(v) => ("<=", v),
            Cmp::Eq(v) => ("=", v),
        };
        write!(f, "{}{}", op, v)
    }
}
/// Simple glob matching (supports * and ?).
///
/// `*` matches any run of characters (including none), `?` matches exactly
/// one character; everything else must match literally. Operates on chars,
/// so multi-byte text is handled per character, not per byte.
fn glob_matches(pattern: &str, text: &str) -> bool {
    // Recursive matcher over char slices; `*` tries every possible split.
    fn step(pat: &[char], txt: &[char]) -> bool {
        match pat.split_first() {
            // Exhausted pattern matches only exhausted text.
            None => txt.is_empty(),
            // '*' consumes 0..=len characters of the text.
            Some((&'*', rest)) => (0..=txt.len()).any(|skip| step(rest, &txt[skip..])),
            Some((&pc, rest)) => match txt.split_first() {
                Some((&tc, tail)) if pc == '?' || pc == tc => step(rest, tail),
                _ => false,
            },
        }
    }
    let pat: Vec<char> = pattern.chars().collect();
    let txt: Vec<char> = text.chars().collect();
    step(&pat, &txt)
}
/// Run a unified query pipeline. Requires &Store for filter/transform stages.
///
/// If the pipeline starts with no generator, the input `seeds` are used.
/// Generators produce a fresh result set (ignoring seeds). Filters narrow
/// the current set. Transforms reorder/truncate. Algorithms do graph
/// exploration.
///
/// The final result set is truncated to `max_results`. The clock is
/// sampled once up front so every filter stage compares against the
/// same `now`.
pub fn run_query(
    stages: &[Stage],
    seeds: Vec<(String, f64)>,
    graph: &Graph,
    store: &Store,
    debug: bool,
    max_results: usize,
) -> Vec<(String, f64)> {
    let now = crate::store::now_epoch();
    let mut current = seeds;
    for stage in stages {
        if debug {
            println!("\n[query] === {} ({} items in) ===", stage, current.len());
        }
        current = match stage {
            // Generators replace the current set entirely.
            Stage::Generator(g) => run_generator(g, store),
            // Filters keep only items whose node passes the predicate.
            Stage::Filter(filt) => {
                current.into_iter()
                    .filter(|(key, _)| eval_filter(filt, key, store, now))
                    .collect()
            }
            Stage::Transform(xform) => run_transform(xform, current, store, graph),
            // Algorithms treat the current set as weighted seeds.
            Stage::Algorithm(algo_stage) => {
                match algo_stage.algo {
                    Algorithm::Spread => run_spread(&current, graph, store, algo_stage, debug),
                    Algorithm::Spectral => run_spectral(&current, graph, algo_stage, debug),
                    Algorithm::Manifold => run_manifold(&current, graph, algo_stage, debug),
                    Algorithm::Confluence => run_confluence(&current, graph, store, algo_stage, debug),
                    Algorithm::Geodesic => run_geodesic(&current, graph, algo_stage, debug),
                }
            }
        };
        if debug {
            println!("[query] → {} results", current.len());
            for (key, score) in current.iter().take(10) {
                println!(" [{:.4}] {}", score, key);
            }
            if current.len() > 10 {
                println!(" ... ({} more)", current.len() - 10);
            }
        }
    }
    current.truncate(max_results);
    current
}
/// Produce a fresh result set for a generator stage.
///
/// `All` yields every non-deleted node scored by its weight; `Match`
/// lowercases the terms, gives each weight 1.0, and delegates to
/// `match_seeds` (exact-key matching only).
fn run_generator(g: &Generator, store: &Store) -> Vec<(String, f64)> {
    match g {
        Generator::All => store
            .nodes
            .iter()
            .filter_map(|(key, node)| {
                if node.deleted {
                    None
                } else {
                    Some((key.clone(), node.weight as f64))
                }
            })
            .collect(),
        Generator::Match(terms) => {
            let weighted: BTreeMap<String, f64> = terms
                .iter()
                .map(|t| (t.to_lowercase(), 1.0))
                .collect();
            match_seeds(&weighted, store).0
        }
    }
}
/// Evaluate one filter predicate against the node behind `key`.
/// A key with no store entry fails every filter.
fn eval_filter(filt: &Filter, key: &str, store: &Store, now: i64) -> bool {
    let Some(node) = store.nodes.get(key) else {
        return false;
    };
    match filt {
        Filter::Type(want) => node.node_type == *want,
        Filter::KeyGlob(pat) => glob_matches(pat, key),
        Filter::Weight(cmp) => cmp.matches(node.weight as f64),
        // Age is seconds elapsed since the node's timestamp.
        Filter::Age(cmp) => cmp.matches((now - node.timestamp) as f64),
        Filter::ContentLen(cmp) => cmp.matches(node.content.len() as f64),
        Filter::Provenance(want) => node.provenance == *want,
        Filter::NotVisited { agent, duration } => {
            // last == 0 is the "never visited" sentinel.
            match store.last_visited(key, agent) {
                0 => true,
                last => (now - last) > *duration,
            }
        }
        Filter::Visited { agent } => store.last_visited(key, agent) > 0,
        // Negation recurses; a missing node still fails overall because
        // we returned false before reaching here.
        Filter::Negated(inner) => !eval_filter(inner, key, store, now),
    }
}
/// Apply one transform stage (sort, limit, or dominating-set) to `items`
/// and return the reshaped list. All sorts are descending on their key.
pub fn run_transform(
    xform: &Transform,
    mut items: Vec<(String, f64)>,
    store: &Store,
    graph: &Graph,
) -> Vec<(String, f64)> {
    match xform {
        Transform::Sort(field) => {
            match field {
                SortField::Weight => {
                    // "weight" sorts by the score already carried in the
                    // result tuple rather than re-reading the store.
                    items.sort_by(|a, b| b.1.total_cmp(&a.1));
                }
                SortField::Timestamp => {
                    // Missing nodes sort as timestamp 0 (oldest).
                    items.sort_by(|a, b| {
                        let ta = store.nodes.get(&a.0).map(|n| n.timestamp).unwrap_or(0);
                        let tb = store.nodes.get(&b.0).map(|n| n.timestamp).unwrap_or(0);
                        tb.cmp(&ta) // desc
                    });
                }
                SortField::ContentLen => {
                    items.sort_by(|a, b| {
                        let la = store.nodes.get(&a.0).map(|n| n.content.len()).unwrap_or(0);
                        let lb = store.nodes.get(&b.0).map(|n| n.content.len()).unwrap_or(0);
                        lb.cmp(&la) // desc
                    });
                }
                SortField::Degree => {
                    items.sort_by(|a, b| {
                        let da = graph.degree(&a.0);
                        let db = graph.degree(&b.0);
                        db.cmp(&da) // desc
                    });
                }
                SortField::Isolation => {
                    // Score nodes by their community's isolation.
                    // Most isolated communities first (highest internal edge ratio).
                    let iso = graph.community_isolation();
                    let comms = graph.communities();
                    items.sort_by(|a, b| {
                        let ca = comms.get(&a.0).copied().unwrap_or(0);
                        let cb = comms.get(&b.0).copied().unwrap_or(0);
                        let sa = iso.get(&ca).copied().unwrap_or(1.0);
                        let sb = iso.get(&cb).copied().unwrap_or(1.0);
                        sb.total_cmp(&sa) // most isolated first
                    });
                }
                SortField::Priority => {
                    // Pre-compute priorities to avoid O(n log n) calls
                    // inside the sort comparator.
                    let priorities: HashMap<String, f64> = items.iter()
                        .map(|(key, _)| {
                            let p = crate::neuro::consolidation_priority(
                                store, key, graph, None);
                            (key.clone(), p)
                        })
                        .collect();
                    items.sort_by(|a, b| {
                        let pa = priorities.get(&a.0).copied().unwrap_or(0.0);
                        let pb = priorities.get(&b.0).copied().unwrap_or(0.0);
                        pb.total_cmp(&pa) // desc
                    });
                }
                SortField::Composite(terms) => {
                    // Weighted sum of per-dimension scores; normalization
                    // maxima and community data are cached once per sort.
                    let cache = CompositeCache::build(&items, store, graph);
                    let scores: HashMap<String, f64> = items.iter()
                        .map(|(key, _)| {
                            let s: f64 = terms.iter()
                                .map(|(field, w)| score_field(field, key, store, graph, &cache) * w)
                                .sum();
                            (key.clone(), s)
                        })
                        .collect();
                    items.sort_by(|a, b| {
                        let sa = scores.get(&a.0).copied().unwrap_or(0.0);
                        let sb = scores.get(&b.0).copied().unwrap_or(0.0);
                        sb.total_cmp(&sa) // highest composite score first
                    });
                }
            }
            items
        }
        Transform::Limit(n) => {
            items.truncate(*n);
            items
        }
        Transform::DominatingSet => {
            // Greedy 3-covering dominating set: pick the node that covers
            // the most under-covered neighbors, repeat until every node
            // has been covered 3 times (by 3 different selected seeds).
            use std::collections::HashMap as HMap;
            let input_keys: std::collections::HashSet<String> = items.iter().map(|(k, _)| k.clone()).collect();
            let mut cover_count: HMap<String, usize> = items.iter().map(|(k, _)| (k.clone(), 0)).collect();
            let mut selected: Vec<(String, f64)> = Vec::new();
            let mut selected_set: std::collections::HashSet<String> = std::collections::HashSet::new();
            const REQUIRED_COVERAGE: usize = 3;
            loop {
                // Find the unselected node that covers the most under-covered nodes
                let best = items.iter()
                    .filter(|(k, _)| !selected_set.contains(k.as_str()))
                    .map(|(k, _)| {
                        let mut value = 0usize;
                        // Count self if under-covered
                        if cover_count.get(k).copied().unwrap_or(0) < REQUIRED_COVERAGE {
                            value += 1;
                        }
                        for (nbr, _) in graph.neighbors(k) {
                            // Only neighbors inside the input set count;
                            // graph neighbors outside `items` are ignored.
                            if input_keys.contains(nbr.as_str())
                                && cover_count.get(nbr.as_str()).copied().unwrap_or(0) < REQUIRED_COVERAGE {
                                value += 1;
                            }
                        }
                        (k.clone(), value)
                    })
                    .max_by_key(|(_, v)| *v);
                let Some((key, value)) = best else { break };
                if value == 0 { break; } // everything covered 3x
                // Mark coverage
                *cover_count.entry(key.clone()).or_default() += 1;
                for (nbr, _) in graph.neighbors(&key) {
                    // Only pre-seeded entries (nodes in the input set) are credited.
                    if let Some(c) = cover_count.get_mut(nbr.as_str()) {
                        *c += 1;
                    }
                }
                // Selected nodes keep their incoming score (1.0 fallback).
                let score = items.iter().find(|(k, _)| k == &key).map(|(_, s)| *s).unwrap_or(1.0);
                selected.push((key.clone(), score));
                selected_set.insert(key);
            }
            selected
        }
    }
}
/// Extract seeds from weighted terms by matching against node keys and content.
///
/// Three matching strategies, in priority order:
/// 1. Exact key match: term matches a node key exactly → full weight
/// 2. Key component match: term matches a word in a hyphenated/underscored key → 0.5× weight
/// 3. Content match: term appears in node content → 0.2× weight (capped at 50 nodes)
///
/// This convenience wrapper enables only strategy 1; use
/// `match_seeds_opts` to opt into component and content matching.
///
/// Returns (seeds, direct_hits) where direct_hits tracks which keys
/// were matched directly (vs found by an algorithm stage).
pub fn match_seeds(
    terms: &BTreeMap<String, f64>,
    store: &impl StoreView,
) -> (Vec<(String, f64)>, HashSet<String>) {
    match_seeds_opts(terms, store, false, false)
}
/// Core seed-matching routine backing `match_seeds`.
///
/// `component_match` enables strategy 2 (key-component hits at 0.5×);
/// `content_fallback` enables strategy 3 (content substring hits at 0.2×,
/// capped at 50 nodes per term). Node keys are lowercased before lookup;
/// callers are expected to pass lowercase terms (see `run_generator`) —
/// only the content strategy lowercases the term itself.
pub fn match_seeds_opts(
    terms: &BTreeMap<String, f64>,
    store: &impl StoreView,
    component_match: bool,
    content_fallback: bool,
) -> (Vec<(String, f64)>, HashSet<String>) {
    let mut seed_map: HashMap<String, f64> = HashMap::new();
    let mut direct_hits: HashSet<String> = HashSet::new();
    // Build key lookup: lowercase key → (original key, weight)
    let mut key_map: HashMap<String, (String, f64)> = HashMap::new();
    // Build component index: word → vec of (original key, weight)
    let mut component_map: HashMap<String, Vec<(String, f64)>> = HashMap::new();
    store.for_each_node(|key, _content, weight| {
        let lkey = key.to_lowercase();
        key_map.insert(lkey.clone(), (key.to_owned(), weight as f64));
        // Split key on hyphens, underscores, dots, hashes for component matching
        for component in lkey.split(['-', '_', '.', '#']) {
            // Skip fragments shorter than 3 chars (too noisy to index).
            if component.len() >= 3 {
                component_map.entry(component.to_owned())
                    .or_default()
                    .push((key.to_owned(), weight as f64));
            }
        }
    });
    for (term, &term_weight) in terms {
        // Strategy 1: exact key match
        if let Some((orig_key, node_weight)) = key_map.get(term) {
            let score = term_weight * node_weight;
            *seed_map.entry(orig_key.clone()).or_insert(0.0) += score;
            direct_hits.insert(orig_key.clone());
            continue;
        }
        // Strategy 2: key component match (0.5× weight) — only when explicitly requested
        if component_match
            && let Some(matches) = component_map.get(term.as_str()) {
            for (orig_key, node_weight) in matches {
                let score = term_weight * node_weight * 0.5;
                *seed_map.entry(orig_key.clone()).or_insert(0.0) += score;
                direct_hits.insert(orig_key.clone());
            }
            continue;
        }
        // Strategy 3: content match (0.2× weight) — only when explicitly requested
        if content_fallback {
            let term_lower = term.to_lowercase();
            if term_lower.len() >= 3 {
                let mut content_hits = 0;
                // NOTE: the 50-hit cap short-circuits the closure body, not
                // the scan itself — for_each_node still walks every node.
                store.for_each_node(|key, content, weight| {
                    if content_hits >= 50 { return; }
                    if content.to_lowercase().contains(&term_lower) {
                        let score = term_weight * weight as f64 * 0.2;
                        *seed_map.entry(key.to_owned()).or_insert(0.0) += score;
                        content_hits += 1;
                    }
                });
            }
        }
    }
    // HashMap iteration leaves the seed order unspecified; callers sort.
    let seeds: Vec<(String, f64)> = seed_map.into_iter().collect();
    (seeds, direct_hits)
}
/// Run a pipeline of algorithm stages.
///
/// Unlike `run_query`, this accepts only algorithm stages (no generators,
/// filters, or transforms) and works against any `StoreView`. The final
/// result set is truncated to `max_results`.
pub fn run_pipeline(
    stages: &[AlgoStage],
    seeds: Vec<(String, f64)>,
    graph: &Graph,
    store: &impl StoreView,
    debug: bool,
    max_results: usize,
) -> Vec<(String, f64)> {
    let mut current = seeds;
    for stage in stages {
        if debug {
            println!("\n[search] === {} ({} seeds in) ===", stage.algo, current.len());
        }
        current = match stage.algo {
            Algorithm::Spread => run_spread(&current, graph, store, stage, debug),
            Algorithm::Spectral => run_spectral(&current, graph, stage, debug),
            Algorithm::Manifold => run_manifold(&current, graph, stage, debug),
            Algorithm::Confluence => run_confluence(&current, graph, store, stage, debug),
            Algorithm::Geodesic => run_geodesic(&current, graph, stage, debug),
        };
        if debug {
            // Fixed: the format string previously glued the algorithm name to
            // the count ("spread37 results"); restore the arrow separator to
            // match run_query's "[query] → {} results" banner.
            println!("[search] {} → {} results", stage.algo, current.len());
            for (i, (key, score)) in current.iter().enumerate().take(15) {
                // Flag the entry sitting exactly at the truncation boundary.
                let cutoff = if i + 1 == max_results { " <-- cutoff" } else { "" };
                println!(" [{:.4}] {}{}", score, key, cutoff);
            }
            if current.len() > 15 {
                println!(" ... ({} more)", current.len() - 15);
            }
        }
    }
    current.truncate(max_results);
    current
}
/// Spreading activation: propagate scores through graph edges.
///
/// Tunable params: max_hops (default from store), edge_decay (default from store),
/// min_activation (default from store).
fn run_spread(
    seeds: &[(String, f64)],
    graph: &Graph,
    store: &impl StoreView,
    stage: &AlgoStage,
    _debug: bool,
) -> Vec<(String, f64)> {
    let store_params = store.params();
    let max_hops = stage.param_u32("max_hops", store_params.max_hops);
    let edge_decay = stage.param_f64("edge_decay", store_params.edge_decay);
    // Default threshold is a tenth of the store's min_activation —
    // presumably to explore more broadly here than the store's own
    // spreading does; TODO confirm intent.
    let min_activation = stage.param_f64("min_activation", store_params.min_activation * 0.1);
    spreading_activation(seeds, graph, store, max_hops, edge_decay, min_activation)
}
/// Spectral projection: find nearest neighbors in spectral embedding space.
///
/// Tunable params: k (default 20, number of neighbors to find).
/// Falls back to returning the seeds unchanged when no embedding is
/// available. Neighbors are scored 1/(1+dist) and appended after the
/// original seeds (which keep their scores).
fn run_spectral(
    seeds: &[(String, f64)],
    graph: &Graph,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let k = stage.param_usize("k", 20);
    let emb = match spectral::load_embedding() {
        Ok(e) => e,
        Err(e) => {
            if debug { println!(" no spectral embedding: {}", e); }
            return seeds.to_vec();
        }
    };
    // Borrowed view of the seeds for the spectral query.
    let weighted: Vec<(&str, f64)> = seeds.iter().map(|(key, w)| (key.as_str(), *w)).collect();
    let neighbors = spectral::nearest_to_seeds_weighted(&emb, &weighted, Some(graph), k);
    if debug {
        for (key, dist) in &neighbors {
            let score = 1.0 / (1.0 + dist);
            println!(" dist={:.6} score={:.4} {}", dist, score, key);
        }
    }
    // Merge: original seeds first, then spectral hits not already seeded.
    let known: HashSet<&str> = seeds.iter().map(|(key, _)| key.as_str()).collect();
    let mut merged = seeds.to_vec();
    merged.extend(
        neighbors
            .into_iter()
            .filter(|(key, _)| !known.contains(key.as_str()))
            .map(|(key, dist)| {
                let score = 1.0 / (1.0 + dist);
                (key, score)
            }),
    );
    merged
}
/// Confluence: multi-source reachability scoring.
///
/// Unlike spreading activation (which takes max activation from any source),
/// confluence rewards nodes reachable from *multiple* seeds. For each candidate
/// node within k hops, score = sum of (seed_weight * edge_decay^distance) across
/// all seeds that can reach it. Nodes at the intersection of multiple seeds'
/// neighborhoods score highest.
///
/// This naturally handles mixed seeds: unrelated seeds activate disjoint
/// neighborhoods that don't overlap, so their results separate naturally.
///
/// Tunable params: max_hops (default 3), edge_decay (default 0.5),
/// min_sources (default 2, minimum number of distinct seeds that must reach a node).
fn run_confluence(
    seeds: &[(String, f64)],
    graph: &Graph,
    store: &impl StoreView,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let max_hops = stage.param_u32("max_hops", 3);
    let edge_decay = stage.param_f64("edge_decay", 0.5);
    let min_sources = stage.param_usize("min_sources", 2);
    // For each seed, BFS outward collecting (node → activation) at each distance
    // Track which seeds contributed to each node's score
    let mut node_scores: HashMap<String, f64> = HashMap::new();
    let mut node_sources: HashMap<String, HashSet<usize>> = HashMap::new();
    for (seed_idx, (seed_key, seed_weight)) in seeds.iter().enumerate() {
        let mut visited: HashMap<String, f64> = HashMap::new();
        let mut queue: VecDeque<(String, u32)> = VecDeque::new();
        visited.insert(seed_key.clone(), *seed_weight);
        queue.push_back((seed_key.clone(), 0));
        while let Some((key, depth)) = queue.pop_front() {
            if depth >= max_hops { continue; }
            let act = visited[&key];
            for (neighbor, strength) in graph.neighbors(&key) {
                // Propagation is attenuated by edge decay, the neighbor's
                // weight, and the edge strength; tiny activations are pruned.
                let neighbor_weight = store.node_weight(neighbor.as_str());
                let propagated = act * edge_decay * neighbor_weight * strength as f64;
                if propagated < 0.001 { continue; }
                // Keep the max activation per node; re-enqueue on improvement.
                if !visited.contains_key(neighbor.as_str()) || visited[neighbor.as_str()] < propagated {
                    visited.insert(neighbor.clone(), propagated);
                    queue.push_back((neighbor.clone(), depth + 1));
                }
            }
        }
        // Accumulate into global scores (additive across seeds)
        for (key, act) in visited {
            *node_scores.entry(key.clone()).or_insert(0.0) += act;
            node_sources.entry(key).or_default().insert(seed_idx);
        }
    }
    // Filter to nodes reached by min_sources distinct seeds.
    // Note: a seed counts itself as one source, so with min_sources=2 a
    // seed survives only if some other seed's neighborhood reaches it.
    let mut results: Vec<(String, f64)> = node_scores.into_iter()
        .filter(|(key, _)| {
            node_sources.get(key).map(|s| s.len()).unwrap_or(0) >= min_sources
        })
        .collect();
    if debug {
        // Show source counts (before the sort below, so order is arbitrary).
        for (key, score) in results.iter().take(15) {
            let sources = node_sources.get(key).map(|s| s.len()).unwrap_or(0);
            println!(" [{:.4}] {} (from {} seeds)", score, key, sources);
        }
    }
    results.sort_by(|a, b| b.1.total_cmp(&a.1));
    results
}
/// Geodesic: straightest paths between seed pairs in spectral space.
///
/// For each pair of seeds, walk the graph from one to the other, at each
/// step choosing the neighbor whose spectral direction most aligns with
/// the target direction. Nodes along these geodesic paths score higher
/// the more paths pass through them and the straighter those paths are.
///
/// Tunable params: max_path (default 6), k (default 20 results).
fn run_geodesic(
    seeds: &[(String, f64)],
    graph: &Graph,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let max_path = stage.param_usize("max_path", 6);
    let k = stage.param_usize("k", 20);
    let emb = match spectral::load_embedding() {
        Ok(e) => e,
        Err(e) => {
            // No embedding on disk: pass the seeds through unchanged.
            if debug { println!(" no spectral embedding: {}", e); }
            return seeds.to_vec();
        }
    };
    // Filter seeds to those with valid spectral coords
    // (all-zero coordinate vectors are treated as missing).
    let valid_seeds: Vec<(&str, f64, &Vec<f64>)> = seeds.iter()
        .filter_map(|(key, weight)| {
            emb.coords.get(key.as_str())
                .filter(|c| c.iter().any(|&v| v.abs() > 1e-12))
                .map(|c| (key.as_str(), *weight, c))
        })
        .collect();
    if valid_seeds.len() < 2 {
        // Pairwise walks need at least two embedded endpoints.
        if debug { println!(" need ≥2 seeds with spectral coords, have {}", valid_seeds.len()); }
        return seeds.to_vec();
    }
    // For each pair of seeds, find the geodesic path
    let mut path_counts: HashMap<String, f64> = HashMap::new();
    let seed_set: HashSet<&str> = seeds.iter().map(|(k, _)| k.as_str()).collect();
    for i in 0..valid_seeds.len() {
        for j in (i + 1)..valid_seeds.len() {
            let (key_a, weight_a, coords_a) = &valid_seeds[i];
            let (key_b, weight_b, coords_b) = &valid_seeds[j];
            // Heavier seed pairs contribute more to the path scores.
            let pair_weight = weight_a * weight_b;
            // Walk from A toward B
            let path_ab = geodesic_walk(
                key_a, coords_a, coords_b, graph, &emb, max_path,
            );
            // Walk from B toward A
            let path_ba = geodesic_walk(
                key_b, coords_b, coords_a, graph, &emb, max_path,
            );
            // Score nodes on both paths (nodes found from both directions score double)
            for (node, alignment) in path_ab.iter().chain(path_ba.iter()) {
                if !seed_set.contains(node.as_str()) {
                    *path_counts.entry(node.clone()).or_insert(0.0) += pair_weight * alignment;
                }
            }
        }
    }
    if debug && !path_counts.is_empty() {
        println!(" {} pairs examined, {} distinct nodes on paths",
            valid_seeds.len() * (valid_seeds.len() - 1) / 2,
            path_counts.len());
    }
    // Merge with original seeds: keep every seed, then append the k
    // best-scoring path nodes that are not seeds themselves.
    let mut results = seeds.to_vec();
    let mut path_results: Vec<(String, f64)> = path_counts.into_iter().collect();
    path_results.sort_by(|a, b| b.1.total_cmp(&a.1));
    path_results.truncate(k);
    for (key, score) in path_results {
        if !seed_set.contains(key.as_str()) {
            results.push((key, score));
        }
    }
    results.sort_by(|a, b| b.1.total_cmp(&a.1));
    results
}
/// Walk from `start` toward `target_coords` in spectral space, choosing
/// the neighbor at each step whose direction most aligns with the target.
/// Returns (node_key, alignment_score) for each intermediate node, in
/// the order visited. The walk ends after `max_steps` hops, on arrival
/// (zero remaining distance), or when no unvisited neighbor points
/// "forward" (positive cosine with the desired direction).
fn geodesic_walk(
    start: &str,
    start_coords: &[f64],
    target_coords: &[f64],
    graph: &Graph,
    emb: &spectral::SpectralEmbedding,
    max_steps: usize,
) -> Vec<(String, f64)> {
    // Small vector helpers, local to the walk.
    let norm = |v: &[f64]| v.iter().map(|x| x * x).sum::<f64>().sqrt();
    let dot = |a: &[f64], b: &[f64]| a.iter().zip(b.iter()).map(|(x, y)| x * y).sum::<f64>();
    let mut trail: Vec<(String, f64)> = Vec::new();
    let mut here = start.to_string();
    let mut here_coords = start_coords.to_vec();
    let mut seen: HashSet<String> = HashSet::new();
    seen.insert(here.clone());
    for _ in 0..max_steps {
        // Desired travel direction: from the current node toward the target.
        let want: Vec<f64> = target_coords.iter()
            .zip(here_coords.iter())
            .map(|(t, c)| t - c)
            .collect();
        let want_norm = norm(&want);
        if want_norm < 1e-12 { break; } // arrived at the target
        // Pick the unvisited neighbor (with usable spectral coords) whose
        // step has the largest positive cosine with `want`. Strict `>`
        // means the first-encountered candidate wins exact ties.
        let mut winner: Option<(String, Vec<f64>)> = None;
        let mut winner_align = 0.0f64;
        for (cand, _strength) in graph.neighbors(&here) {
            if seen.contains(cand.as_str()) { continue; }
            let cand_coords = match emb.coords.get(cand.as_str()) {
                Some(c) if c.iter().any(|&v| v.abs() > 1e-12) => c,
                _ => continue,
            };
            // Direction of the candidate hop.
            let step: Vec<f64> = cand_coords.iter()
                .zip(here_coords.iter())
                .map(|(n, c)| n - c)
                .collect();
            let step_norm = norm(&step);
            if step_norm < 1e-12 { continue; }
            // Cosine similarity between desired direction and step direction.
            let align = dot(&want, &step) / (want_norm * step_norm);
            if align > winner_align {
                winner = Some((cand.clone(), cand_coords.clone()));
                winner_align = align;
            }
        }
        let (next_key, next_coords) = match winner {
            Some(w) => w,
            None => break, // no forward-facing neighbors
        };
        trail.push((next_key.clone(), winner_align));
        seen.insert(next_key.clone());
        here = next_key;
        here_coords = next_coords;
    }
    trail
}
/// Manifold: extrapolation along the direction defined by seeds.
///
/// Instead of finding what's *near* the seeds in spectral space (proximity),
/// find what's in the *direction* the seeds define. Given a weighted centroid
/// of seeds and the principal direction they span, find nodes that continue
/// along that direction.
///
/// Falls back to returning `seeds` unchanged when no spectral embedding
/// can be loaded or no seed has usable (non-zero) coordinates. With a
/// single seed (or coincident seeds) there is no direction, so scoring
/// degrades to inverse distance from the centroid.
///
/// Tunable params: k (default 20 results).
fn run_manifold(
    seeds: &[(String, f64)],
    graph: &Graph,
    stage: &AlgoStage,
    debug: bool,
) -> Vec<(String, f64)> {
    let k = stage.param_usize("k", 20);
    let emb = match spectral::load_embedding() {
        Ok(e) => e,
        Err(e) => {
            if debug { println!("  no spectral embedding: {}", e); }
            return seeds.to_vec();
        }
    };
    // Collect seeds with valid spectral coordinates
    // (all-zero vectors count as missing).
    let seed_data: Vec<(&str, f64, &Vec<f64>)> = seeds.iter()
        .filter_map(|(key, weight)| {
            emb.coords.get(key.as_str())
                .filter(|c| c.iter().any(|&v| v.abs() > 1e-12))
                .map(|c| (key.as_str(), *weight, c))
        })
        .collect();
    if seed_data.is_empty() {
        if debug { println!("  no seeds with spectral coords"); }
        return seeds.to_vec();
    }
    let dims = emb.dims;
    // Compute weighted centroid of seeds.
    let mut centroid = vec![0.0f64; dims];
    let mut total_weight = 0.0;
    for (_, weight, coords) in &seed_data {
        for (i, &c) in coords.iter().enumerate() {
            centroid[i] += c * weight;
        }
        total_weight += weight;
    }
    if total_weight > 0.0 {
        for c in &mut centroid {
            *c /= total_weight;
        }
    }
    // Compute principal direction via power iteration on seed covariance.
    // Initialize with the two most separated seeds (largest spectral distance);
    // a good starting vector helps the few iterations below converge.
    let mut direction = vec![0.0f64; dims];
    if seed_data.len() >= 2 {
        // Find the two seeds furthest apart in spectral space
        let mut best_dist = 0.0f64;
        for i in 0..seed_data.len() {
            for j in (i + 1)..seed_data.len() {
                let dist: f64 = seed_data[i].2.iter().zip(seed_data[j].2.iter())
                    .map(|(a, b)| (a - b).powi(2)).sum::<f64>().sqrt();
                if dist > best_dist {
                    best_dist = dist;
                    for d in 0..dims {
                        direction[d] = seed_data[j].2[d] - seed_data[i].2[d];
                    }
                }
            }
        }
        // Power iteration: 3 rounds on the weighted covariance matrix.
        // Each round applies sum_i w_i * (dev_i ⊗ dev_i) to `direction`,
        // pulling it toward the dominant axis of seed spread.
        for _ in 0..3 {
            let mut new_dir = vec![0.0f64; dims];
            for (_, weight, coords) in &seed_data {
                let dev: Vec<f64> = coords.iter().zip(centroid.iter()).map(|(c, m)| c - m).collect();
                let dot: f64 = dev.iter().zip(direction.iter()).map(|(d, v)| d * v).sum();
                for d in 0..dims {
                    new_dir[d] += weight * dot * dev[d];
                }
            }
            // Normalize
            let norm = new_dir.iter().map(|d| d * d).sum::<f64>().sqrt();
            if norm > 1e-12 {
                for d in &mut new_dir { *d /= norm; }
            }
            direction = new_dir;
        }
    }
    let dir_norm = direction.iter().map(|d| d * d).sum::<f64>().sqrt();
    let seed_set: HashSet<&str> = seeds.iter().map(|(k, _)| k.as_str()).collect();
    // Score each non-seed node by projection onto the direction from centroid.
    let mut candidates: Vec<(String, f64)> = emb.coords.iter()
        .filter(|(key, coords)| {
            !seed_set.contains(key.as_str())
                && coords.iter().any(|&v| v.abs() > 1e-12)
        })
        .map(|(key, coords)| {
            let deviation: Vec<f64> = coords.iter().zip(centroid.iter())
                .map(|(c, m)| c - m)
                .collect();
            let score = if dir_norm > 1e-12 {
                // Project onto direction: how far along the principal axis
                let projection: f64 = deviation.iter().zip(direction.iter())
                    .map(|(d, v)| d * v)
                    .sum::<f64>() / dir_norm;
                // Distance from the axis (perpendicular component)
                let proj_vec: Vec<f64> = direction.iter()
                    .map(|&d| d * projection / dir_norm)
                    .collect();
                let perp_dist: f64 = deviation.iter().zip(proj_vec.iter())
                    .map(|(d, p)| (d - p).powi(2))
                    .sum::<f64>()
                    .sqrt();
                // Score: prefer nodes far along the direction but close to the axis
                // Use absolute projection (both directions from centroid are interesting)
                let along = projection.abs();
                if perp_dist < 1e-12 {
                    along
                } else {
                    along / (1.0 + perp_dist)
                }
            } else {
                // No direction (single seed or all seeds coincide): use distance from centroid
                let dist: f64 = deviation.iter().map(|d| d * d).sum::<f64>().sqrt();
                1.0 / (1.0 + dist)
            };
            // Bonus for being connected to seeds in the graph:
            // 0.1 per unit of edge strength into the seed set.
            let graph_bonus: f64 = graph.neighbors(key).iter()
                .filter(|(n, _)| seed_set.contains(n.as_str()))
                .map(|(_, s)| *s as f64 * 0.1)
                .sum();
            (key.clone(), score + graph_bonus)
        })
        .collect();
    candidates.sort_by(|a, b| b.1.total_cmp(&a.1));
    candidates.truncate(k);
    if debug {
        for (key, score) in candidates.iter().take(15) {
            println!("  [{:.4}] {}", score, key);
        }
    }
    // Merge with original seeds (candidates already exclude seeds),
    // then sort the combined list by score descending.
    let mut results = seeds.to_vec();
    for (key, score) in candidates {
        results.push((key, score));
    }
    results.sort_by(|a, b| b.1.total_cmp(&a.1));
    results
}
/// Simultaneous wavefront spreading activation.
///
/// Every seed radiates at once. Per hop, incoming energy from all
/// sources sums at each node, and that combined map is what propagates
/// on the following hop. Overlapping wavefronts therefore reinforce
/// each other and radiate more strongly — an interference pattern.
fn spreading_activation(
    seeds: &[(String, f64)],
    graph: &Graph,
    store: &impl StoreView,
    max_hops: u32,
    edge_decay: f64,
    min_activation: f64,
) -> Vec<(String, f64)> {
    // Running total per node; this is what gets ranked at the end.
    let mut total: HashMap<String, f64> = HashMap::new();
    // The currently-active wave, seeded from every input simultaneously.
    let mut wave: HashMap<String, f64> = HashMap::new();
    for (key, energy) in seeds {
        *wave.entry(key.clone()).or_insert(0.0) += energy;
        *total.entry(key.clone()).or_insert(0.0) += energy;
    }
    // Propagate hop by hop — all sources at once. Traversal is gated
    // only by edge_decay × edge strength (and the min_activation floor);
    // node weight never blocks propagation, it is applied at the end
    // purely for ranking.
    for _ in 0..max_hops {
        let mut next_wave: HashMap<String, f64> = HashMap::new();
        for (key, energy) in &wave {
            for (neighbor, strength) in graph.neighbors(key) {
                let passed = energy * edge_decay * strength as f64;
                if passed < min_activation { continue; }
                *next_wave.entry(neighbor.clone()).or_insert(0.0) += passed;
            }
        }
        if next_wave.is_empty() { break; } // wave died out early
        // Fold this hop into the running total, then advance the frontier.
        for (key, energy) in &next_wave {
            *total.entry(key.clone()).or_insert(0.0) += energy;
        }
        wave = next_wave;
    }
    // Node weight modulates ranking only, not traversal.
    let mut ranked: Vec<_> = total.into_iter()
        .map(|(key, energy)| {
            let node_weight = store.node_weight(&key);
            (key, energy * node_weight)
        })
        .collect();
    ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
    ranked
}
/// Search with weighted terms: exact key matching + spectral projection.
///
/// Terms are matched against node keys. Matching nodes become seeds,
/// scored by term_weight × node_weight. Seeds are then projected into
/// spectral space to find nearby nodes, with link weights modulating distance.
///
/// Convenience wrapper: no debug output, returns at most 5 results.
/// Use [`search_weighted_debug`] for diagnostics or a different count.
pub fn search_weighted(
    terms: &BTreeMap<String, f64>,
    store: &impl StoreView,
) -> Vec<SearchResult> {
    search_weighted_inner(terms, store, false, 5)
}
/// Like search_weighted but with debug output and configurable result count.
///
/// Prints the seed list and per-stage diagnostics to stdout, and returns
/// up to `max_results` results instead of the default 5.
pub fn search_weighted_debug(
    terms: &BTreeMap<String, f64>,
    store: &impl StoreView,
    max_results: usize,
) -> Vec<SearchResult> {
    search_weighted_inner(terms, store, true, max_results)
}
/// Shared implementation behind [`search_weighted`] and
/// [`search_weighted_debug`]: seed matching, the default
/// spectral→spread pipeline, and conversion to `SearchResult`s.
fn search_weighted_inner(
    terms: &BTreeMap<String, f64>,
    store: &impl StoreView,
    debug: bool,
    max_results: usize,
) -> Vec<SearchResult> {
    let graph = crate::graph::build_graph_fast(store);
    let (seeds, direct_hits) = match_seeds(terms, store);
    // No seeds means nothing to expand from.
    if seeds.is_empty() {
        return Vec::new();
    }
    if debug {
        println!("\n[search] === SEEDS ({}) ===", seeds.len());
        let mut by_score = seeds.clone();
        by_score.sort_by(|a, b| b.1.total_cmp(&a.1));
        for (key, score) in by_score.iter().take(20) {
            println!("  {:.4} {}", score, key);
        }
    }
    // Default pipeline: spectral → spread (legacy behavior)
    let stages = vec![
        AlgoStage { algo: Algorithm::Spectral, params: HashMap::new() },
        AlgoStage { algo: Algorithm::Spread, params: HashMap::new() },
    ];
    let ranked = run_pipeline(&stages, seeds, &graph, store, debug, max_results);
    ranked.into_iter()
        .take(max_results)
        .map(|(key, activation)| SearchResult {
            // Evaluate membership before `key` is moved into the struct.
            is_direct: direct_hits.contains(&key),
            key,
            activation,
            snippet: None,
        })
        .collect()
}
/// Search with equal-weight terms (for interactive use).
///
/// Each whitespace-separated token is lowercased and given weight 1.0;
/// repeated tokens collapse to a single map entry.
pub fn search(query: &str, store: &impl StoreView) -> Vec<SearchResult> {
    let mut terms: BTreeMap<String, f64> = BTreeMap::new();
    for token in query.split_whitespace() {
        terms.insert(token.to_lowercase(), 1.0);
    }
    search_weighted(&terms, store)
}
/// Extract meaningful search terms from natural language.
///
/// Lowercases `text`, splits on every non-alphanumeric character,
/// discards tokens of 2 characters or fewer, and strips common English
/// stop words. Returns up to `max_terms` surviving words joined with
/// single spaces; empty string when nothing survives or `max_terms`
/// is 0.
pub fn extract_query_terms(text: &str, max_terms: usize) -> String {
    const STOP_WORDS: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "do", "does", "did",
        "have", "has", "had", "will", "would", "could", "should", "can",
        "may", "might", "shall", "been", "being", "to", "of", "in", "for",
        "on", "with", "at", "by", "from", "as", "but", "or", "and", "not",
        "no", "if", "then", "than", "that", "this", "it", "its", "my",
        "your", "our", "we", "you", "i", "me", "he", "she", "they", "them",
        "what", "how", "why", "when", "where", "about", "just", "let",
        "want", "tell", "show", "think", "know", "see", "look", "make",
        "get", "go", "some", "any", "all", "very", "really", "also", "too",
        "so", "up", "out", "here", "there",
    ];
    text.to_lowercase()
        .split(|c: char| !c.is_alphanumeric())
        // `len() > 2` already excludes empty tokens, so the previous
        // separate `!w.is_empty()` check was redundant.
        .filter(|w| w.len() > 2 && !STOP_WORDS.contains(w))
        .take(max_terms)
        .collect::<Vec<_>>()
        .join(" ")
}
/// Format search results as text lines (for hook consumption).
///
/// Direct hits start at column 0; indirect hits are indented one space.
/// A result's snippet, when present, follows on its own indented line.
/// NOTE(review): the "[{:.2}/{:.2}]" field prints `activation` twice —
/// looks like a leftover from a raw/weighted score pair. Confirm intent
/// before changing, since hooks parse this exact layout.
pub fn format_results(results: &[SearchResult]) -> String {
    let mut out = String::new();
    for (rank, hit) in results.iter().enumerate() {
        let indent = if hit.is_direct { "" } else { " " };
        out.push_str(&format!("{}{:2}. [{:.2}/{:.2}] {}",
            indent, rank + 1, hit.activation, hit.activation, hit.key));
        out.push('\n');
        if let Some(snippet) = hit.snippet.as_deref() {
            out.push_str(&format!("    {}\n", snippet));
        }
    }
    out
}