search: unified query pipeline with filters, transforms, generators

Extend the pipeline with four stage types composing left-to-right:
  Generators: all, match:TERM
  Filters: type:, key:, weight:, age:, content-len:, provenance:,
           not-visited:, visited: (plus ! negation)
  Transforms: sort:(priority|timestamp|content-len|degree|weight), limit:N
  Algorithms: spread, spectral, confluence, geodesic, manifold (unchanged)

Also adds duration syntax (7d, 24h, 30m) and glob matching on keys.
The CLI auto-detects filter/transform/generator stages and loads the full
Store; algorithm-only pipelines keep the fast MmapView path.

Co-Authored-By: ProofOfConcept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-03-10 15:22:12 -04:00
parent c6bb7c3910
commit e736471d99
2 changed files with 591 additions and 70 deletions

View file

@ -598,81 +598,129 @@ fn cmd_search(terms: &[String], pipeline_args: &[String], expand: bool, full: bo
use store::StoreView;
use std::collections::BTreeMap;
if terms.is_empty() {
return Err("search requires at least one term".into());
// Parse pipeline stages (unified: algorithms, filters, transforms, generators)
let stages: Vec<search::Stage> = if pipeline_args.is_empty() {
vec![search::Stage::Algorithm(search::AlgoStage::parse("spread").unwrap())]
} else {
pipeline_args.iter()
.map(|a| search::Stage::parse(a))
.collect::<Result<Vec<_>, _>>()?
};
// Check if pipeline needs full Store (has filters/transforms/generators)
let needs_store = stages.iter().any(|s| !matches!(s, search::Stage::Algorithm(_)));
// Check if pipeline starts with a generator (doesn't need seed terms)
let has_generator = stages.first().map(|s| matches!(s, search::Stage::Generator(_))).unwrap_or(false);
if terms.is_empty() && !has_generator {
return Err("search requires terms or a generator stage (e.g. 'all')".into());
}
let query: String = terms.join(" ");
// Parse pipeline (default: spread)
let pipeline: Vec<search::AlgoStage> = if pipeline_args.is_empty() {
vec![search::AlgoStage::parse("spread").unwrap()]
} else {
pipeline_args.iter()
.map(|a| search::AlgoStage::parse(a))
.collect::<Result<Vec<_>, _>>()?
};
if debug {
let names: Vec<String> = pipeline.iter().map(|s| format!("{}", s.algo)).collect();
let names: Vec<String> = stages.iter().map(|s| format!("{}", s)).collect();
println!("[search] pipeline: {}", names.join(""));
}
let view = store::AnyView::load()?;
let graph = graph::build_graph_fast(&view);
// Build equal-weight terms from query
let terms: BTreeMap<String, f64> = query.split_whitespace()
.map(|t| (t.to_lowercase(), 1.0))
.collect();
let (seeds, direct_hits) = search::match_seeds(&terms, &view);
if seeds.is_empty() {
eprintln!("No results for '{}'", query);
return Ok(());
}
if debug {
println!("[search] {} seeds from query '{}'", seeds.len(), query);
for (key, score) in &seeds {
println!(" {:.4} {}", score, key);
}
}
let max_results = if expand { 15 } else { 5 };
let raw = search::run_pipeline(&pipeline, seeds, &graph, &view, debug, max_results);
let results: Vec<search::SearchResult> = raw.into_iter()
.map(|(key, activation)| {
let is_direct = direct_hits.contains(&key);
search::SearchResult { key, activation, is_direct, snippet: None }
})
.collect();
if needs_store {
// Full Store path — needed for filter/transform/generator stages
let store = store::Store::load()?;
let graph = store.build_graph();
if results.is_empty() {
eprintln!("No results for '{}'", query);
return Ok(());
}
let seeds = if has_generator {
vec![] // generator will produce its own result set
} else {
let terms_map: BTreeMap<String, f64> = query.split_whitespace()
.map(|t| (t.to_lowercase(), 1.0))
.collect();
let (seeds, _) = search::match_seeds(&terms_map, &store);
seeds
};
// Log retrieval
store::Store::log_retrieval_static(&query,
&results.iter().map(|r| r.key.clone()).collect::<Vec<_>>());
let raw = search::run_query(&stages, seeds, &graph, &store, debug, max_results);
let bump_keys: Vec<&str> = results.iter().take(max_results).map(|r| r.key.as_str()).collect();
let _ = lookups::bump_many(&bump_keys);
if raw.is_empty() {
eprintln!("No results");
return Ok(());
}
for (i, r) in results.iter().enumerate().take(max_results) {
let marker = if r.is_direct { "" } else { " " };
let weight = view.node_weight(&r.key);
println!("{}{:2}. [{:.2}/{:.2}] {}", marker, i + 1, r.activation, weight, r.key);
if full {
if let Some(content) = view.node_content(&r.key) {
println!();
for line in content.lines() {
println!(" {}", line);
for (i, (key, score)) in raw.iter().enumerate().take(max_results) {
let weight = store.nodes.get(key).map(|n| n.weight).unwrap_or(0.0);
println!("{:2}. [{:.2}/{:.2}] {}", i + 1, score, weight, key);
if full {
if let Some(node) = store.nodes.get(key) {
println!();
for line in node.content.lines() {
println!(" {}", line);
}
println!();
}
}
}
} else {
// Fast MmapView path — algorithm-only pipeline
let view = store::AnyView::load()?;
let graph = graph::build_graph_fast(&view);
let terms_map: BTreeMap<String, f64> = query.split_whitespace()
.map(|t| (t.to_lowercase(), 1.0))
.collect();
let (seeds, direct_hits) = search::match_seeds(&terms_map, &view);
if seeds.is_empty() {
eprintln!("No results for '{}'", query);
return Ok(());
}
if debug {
println!("[search] {} seeds from query '{}'", seeds.len(), query);
}
// Extract AlgoStages from the unified stages
let algo_stages: Vec<&search::AlgoStage> = stages.iter()
.filter_map(|s| match s {
search::Stage::Algorithm(a) => Some(a),
_ => None,
})
.collect();
let algo_owned: Vec<search::AlgoStage> = algo_stages.into_iter().cloned().collect();
let raw = search::run_pipeline(&algo_owned, seeds, &graph, &view, debug, max_results);
let results: Vec<search::SearchResult> = raw.into_iter()
.map(|(key, activation)| {
let is_direct = direct_hits.contains(&key);
search::SearchResult { key, activation, is_direct, snippet: None }
})
.collect();
if results.is_empty() {
eprintln!("No results for '{}'", query);
return Ok(());
}
// Log retrieval
store::Store::log_retrieval_static(&query,
&results.iter().map(|r| r.key.clone()).collect::<Vec<_>>());
let bump_keys: Vec<&str> = results.iter().take(max_results).map(|r| r.key.as_str()).collect();
let _ = lookups::bump_many(&bump_keys);
for (i, r) in results.iter().enumerate().take(max_results) {
let marker = if r.is_direct { "" } else { " " };
let weight = view.node_weight(&r.key);
println!("{}{:2}. [{:.2}/{:.2}] {}", marker, i + 1, r.activation, weight, r.key);
if full {
if let Some(content) = view.node_content(&r.key) {
println!();
for line in content.lines() {
println!(" {}", line);
}
println!();
}
println!();
}
}
}