search: composable algorithm pipeline

Break search into composable stages that chain left-to-right:
each stage takes seeds Vec<(String, f64)> and returns modified seeds.

Available algorithms:
  spread              — spreading activation through graph edges
  spectral            — nearest neighbors in spectral embedding
  manifold            — (placeholder) extrapolation along seed direction

Stages accept inline params: spread,max_hops=4,edge_decay=0.5

memory-search gets --hook, --debug, --seen modes plus positional
pipeline args. poc-memory search gets -p/--pipeline flags.

Also: fix spectral decompose() to skip zero eigenvalues from
disconnected components, filter degenerate zero-coord nodes from
spectral projection, POC_AGENT bail-out for daemon agents, all
debug output to stdout.

Co-Authored-By: ProofOfConcept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-03-09 01:19:04 -04:00
parent 0a35a17fad
commit c1664bf76b
4 changed files with 723 additions and 151 deletions

View file

@ -1,24 +1,76 @@
// memory-search: combined hook for session context loading + ambient memory retrieval
//
// On first prompt per session: loads full memory context (identity, journal, etc.)
// On subsequent prompts: searches memory for relevant entries
// On post-compaction: reloads full context
//
// Reads JSON from stdin (Claude Code UserPromptSubmit hook format),
// outputs results for injection into the conversation.
// Modes:
// --hook Run as Claude Code UserPromptSubmit hook (reads stdin, injects into conversation)
// --debug Replay last stashed input, dump every stage to stdout
// --seen Show the seen set for current session
// (default) Replay last stashed input with debug output (same as --debug)
use poc_memory::search;
use clap::Parser;
use poc_memory::search::{self, AlgoStage};
use poc_memory::store;
use std::collections::HashSet;
use std::collections::{BTreeMap, HashSet};
use std::fs;
use std::io::{self, Read, Write};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, SystemTime};
// Command-line arguments for the `memory-search` binary, parsed via clap's
// derive API.
//
// NOTE: the `///` comments on the fields below are picked up by clap as the
// per-argument help text, so editing them changes the `--help` output.
//
// `hook`, `debug` and `seen` are boolean flags (`debug` also gets a short
// form); `max_results` defaults to 5; `pipeline` has no `#[arg]` attribute,
// so it collects the remaining positional arguments.
#[derive(Parser)]
#[command(name = "memory-search")]
struct Args {
/// Run as Claude Code hook (reads stdin, outputs for injection)
#[arg(long)]
hook: bool,
/// Debug mode: replay last stashed input, dump every stage
#[arg(short, long)]
debug: bool,
/// Show the seen set and returned memories for this session
#[arg(long)]
seen: bool,
/// Max results to return
#[arg(long, default_value = "5")]
max_results: usize,
/// Algorithm pipeline stages: e.g. spread spectral,k=20 spread,max_hops=4
/// Default: spread.
// Each entry is handed to `AlgoStage::parse` by `main`; an empty list falls
// back to a single "spread" stage there.
pipeline: Vec<String>,
}
const STASH_PATH: &str = "/tmp/claude-memory-search/last-input.json";
fn main() {
let mut input = String::new();
io::stdin().read_to_string(&mut input).unwrap_or_default();
// Daemon agent calls set POC_AGENT=1 — skip memory search.
if std::env::var("POC_AGENT").is_ok() {
return;
}
let args = Args::parse();
if args.seen {
show_seen();
return;
}
let input = if args.hook {
// Hook mode: read from stdin, stash for later debug runs
let mut buf = String::new();
io::stdin().read_to_string(&mut buf).unwrap_or_default();
fs::create_dir_all("/tmp/claude-memory-search").ok();
fs::write(STASH_PATH, &buf).ok();
buf
} else {
// All other modes: replay stashed input
fs::read_to_string(STASH_PATH).unwrap_or_else(|_| {
eprintln!("No stashed input at {}", STASH_PATH);
std::process::exit(1);
})
};
let debug = args.debug || !args.hook;
let json: serde_json::Value = match serde_json::from_str(&input) {
Ok(v) => v,
@ -42,6 +94,16 @@ fn main() {
let cookie_path = state_dir.join(format!("cookie-{}", session_id));
let is_first = !cookie_path.exists();
if is_first || is_compaction {
// Reset seen set to keys that load-context will inject
let seen_path = state_dir.join(format!("seen-{}", session_id));
fs::remove_file(&seen_path).ok();
}
if debug {
println!("[memory-search] session={} is_first={} is_compaction={}", session_id, is_first, is_compaction);
}
if is_first || is_compaction {
// Create/touch the cookie
let cookie = if is_first {
@ -52,52 +114,135 @@ fn main() {
fs::read_to_string(&cookie_path).unwrap_or_default().trim().to_string()
};
// Load full memory context
if debug { println!("[memory-search] loading full context"); }
// Load full memory context and pre-populate seen set with injected keys
if let Ok(output) = Command::new("poc-memory").args(["load-context"]).output() {
if output.status.success() {
let ctx = String::from_utf8_lossy(&output.stdout);
if !ctx.trim().is_empty() {
print!("{}", ctx);
// Extract keys from "--- KEY (group) ---" lines
for line in ctx.lines() {
if line.starts_with("--- ") && line.ends_with(" ---") {
let inner = &line[4..line.len() - 4];
if let Some(paren) = inner.rfind(" (") {
let key = inner[..paren].trim();
mark_seen(&state_dir, session_id, key);
}
}
}
if debug { println!("[memory-search] context loaded: {} bytes", ctx.len()); }
if args.hook {
print!("{}", ctx);
}
}
}
}
// On first prompt, also bump lookup counter for the cookie
let _ = cookie; // used for tagging below
}
// Always do ambient search (skip on very short or system prompts)
let word_count = prompt.split_whitespace().count();
if word_count < 3 {
return;
let _ = cookie;
}
// Skip system/AFK prompts
for prefix in &["is AFK", "You're on your own", "IRC mention"] {
if prompt.starts_with(prefix) {
return;
}
}
let query = search::extract_query_terms(prompt, 3);
if query.is_empty() {
return;
}
let store = match store::Store::load() {
Ok(s) => s,
Err(_) => return,
};
let results = search::search(&query, &store);
if results.is_empty() {
// Search for node keys in last ~150k tokens of transcript
let transcript_path = json["transcript_path"].as_str().unwrap_or("");
if debug { println!("[memory-search] transcript: {}", transcript_path); }
let terms = extract_weighted_terms(transcript_path, 150_000, &store);
if debug {
println!("[memory-search] {} node keys found in transcript", terms.len());
let mut by_weight: Vec<_> = terms.iter().collect();
by_weight.sort_by(|a, b| b.1.total_cmp(a.1));
for (term, weight) in by_weight.iter().take(20) {
println!(" {:.3} {}", weight, term);
}
}
if terms.is_empty() {
if debug { println!("[memory-search] no node keys found, done"); }
return;
}
// Parse algorithm pipeline
let pipeline: Vec<AlgoStage> = if args.pipeline.is_empty() {
// Default: just spreading activation
vec![AlgoStage::parse("spread").unwrap()]
} else {
let mut stages = Vec::new();
for arg in &args.pipeline {
match AlgoStage::parse(arg) {
Ok(s) => stages.push(s),
Err(e) => {
eprintln!("error: {}", e);
std::process::exit(1);
}
}
}
stages
};
if debug {
let names: Vec<String> = pipeline.iter().map(|s| format!("{}", s.algo)).collect();
println!("[memory-search] pipeline: {}", names.join(""));
}
// Extract seeds from terms
let graph = poc_memory::graph::build_graph_fast(&store);
let (seeds, direct_hits) = search::match_seeds(&terms, &store);
if seeds.is_empty() {
if debug { println!("[memory-search] no seeds matched, done"); }
return;
}
if debug {
println!("[memory-search] {} seeds", seeds.len());
let mut sorted = seeds.clone();
sorted.sort_by(|a, b| b.1.total_cmp(&a.1));
for (key, score) in sorted.iter().take(20) {
println!(" {:.4} {}", score, key);
}
}
let max_results = if debug { args.max_results.max(25) } else { args.max_results };
let raw_results = search::run_pipeline(&pipeline, seeds, &graph, &store, debug, max_results);
let results: Vec<search::SearchResult> = raw_results.into_iter()
.map(|(key, activation)| {
let is_direct = direct_hits.contains(&key);
search::SearchResult { key, activation, is_direct, snippet: None }
}).collect();
if debug {
println!("[memory-search] {} search results", results.len());
for r in results.iter().take(10) {
let marker = if r.is_direct { "" } else { " " };
println!(" {} [{:.4}] {}", marker, r.activation, r.key);
}
}
if results.is_empty() {
if debug { println!("[memory-search] no results, done"); }
return;
}
let seen = load_seen(&state_dir, session_id);
if debug { println!("[memory-search] {} keys in seen set", seen.len()); }
// Format results like poc-memory search output
let search_output = search::format_results(&results);
let cookie = fs::read_to_string(&cookie_path).unwrap_or_default().trim().to_string();
let seen = load_seen(&state_dir, session_id);
let mut result_output = String::new();
let mut count = 0;
@ -112,6 +257,7 @@ fn main() {
if let Some(key) = extract_key_from_line(trimmed) {
if seen.contains(&key) { continue; }
mark_seen(&state_dir, session_id, &key);
mark_returned(&state_dir, session_id, &key);
result_output.push_str(line);
result_output.push('\n');
count += 1;
@ -121,9 +267,14 @@ fn main() {
}
}
if count == 0 { return; }
if count == 0 {
if debug { println!("[memory-search] all results already seen"); }
return;
}
println!("Recalled memories [{}]:", cookie);
if args.hook {
println!("Recalled memories [{}]:", cookie);
}
print!("{}", result_output);
// Clean up stale state files (opportunistic)
@ -131,6 +282,82 @@ fn main() {
}
/// Reverse-scan the transcript JSONL, extracting text from user/assistant
/// messages until we accumulate `max_tokens` tokens of text content.
/// Then search for all node keys as substrings, weighted by position.
///
/// * `path` — transcript file; an empty or unreadable path yields an empty map.
/// * `max_tokens` — text budget, estimated as one token per 4 bytes of text.
/// * `store` — every key in `store.nodes` is searched for (case-insensitively).
///
/// Returns a map from the lowercased key to the sum, over all non-overlapping
/// occurrences, of `(byte_offset + 1) / text_len`. Occurrences nearer the end
/// of the collected text (i.e. more recent messages) therefore score higher.
fn extract_weighted_terms(
    path: &str,
    max_tokens: usize,
    store: &poc_memory::store::Store,
) -> BTreeMap<String, f64> {
    if path.is_empty() {
        return BTreeMap::new();
    }

    let content = match fs::read_to_string(path) {
        Ok(c) => c,
        Err(_) => return BTreeMap::new(),
    };

    // Collect text from messages, scanning backwards, until token budget hit
    let mut message_texts: Vec<String> = Vec::new();
    let mut token_count = 0;

    for line in content.lines().rev() {
        if token_count >= max_tokens {
            break;
        }

        // Lines that are not valid JSON are skipped rather than treated as errors.
        let obj: serde_json::Value = match serde_json::from_str(line) {
            Ok(v) => v,
            Err(_) => continue,
        };

        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if msg_type != "user" && msg_type != "assistant" {
            continue;
        }

        let mut msg_text = String::new();
        // The content may live under "message" or directly on the record.
        let msg = obj.get("message").unwrap_or(&obj);
        match msg.get("content") {
            Some(serde_json::Value::String(s)) => {
                msg_text.push_str(s);
            }
            Some(serde_json::Value::Array(arr)) => {
                // Only blocks tagged type == "text" contribute.
                for block in arr {
                    if block.get("type").and_then(|v| v.as_str()) == Some("text") {
                        if let Some(t) = block.get("text").and_then(|v| v.as_str()) {
                            msg_text.push(' ');
                            msg_text.push_str(t);
                        }
                    }
                }
            }
            _ => {}
        }

        // Rough token estimate: 4 bytes per token.
        token_count += msg_text.len() / 4;
        message_texts.push(msg_text);
    }

    // Reverse so oldest is first (position weighting: later = more recent = higher)
    message_texts.reverse();
    let all_text = message_texts.join(" ").to_lowercase();
    let text_len = all_text.len();
    if text_len == 0 {
        return BTreeMap::new();
    }

    // Search for each node key as a substring (casefolded), accumulate position-weighted score
    let mut terms: BTreeMap<String, f64> = BTreeMap::new();
    for (key, _node) in &store.nodes {
        let key_folded = key.to_lowercase();

        // An empty pattern matches at every offset without consuming any
        // input, so scanning for it would never make progress. Skip it.
        if key_folded.is_empty() {
            continue;
        }

        // `match_indices` yields non-overlapping matches from left to right.
        let mut hits = 0usize;
        let mut score = 0.0f64;
        for (abs_pos, _) in all_text.match_indices(key_folded.as_str()) {
            score += (abs_pos + 1) as f64 / text_len as f64;
            hits += 1;
        }

        // Only keys that actually occur get an entry; keys that fold to the
        // same lowercase string share (and add to) one entry.
        if hits > 0 {
            *terms.entry(key_folded).or_insert(0.0) += score;
        }
    }

    terms
}
fn extract_key_from_line(line: &str) -> Option<String> {
let after_bracket = line.find("] ")?;
let rest = &line[after_bracket + 2..];
@ -167,6 +394,70 @@ fn mark_seen(dir: &Path, session_id: &str, key: &str) {
}
}
/// Append `key` as one line to the `returned-<session_id>` log inside `dir`.
///
/// The log file is created if it does not exist yet. This is best-effort:
/// failures to open or write the file are silently ignored.
fn mark_returned(dir: &Path, session_id: &str, key: &str) {
    let log_path = dir.join(format!("returned-{}", session_id));
    let _ = fs::OpenOptions::new()
        .append(true)
        .create(true)
        .open(log_path)
        .and_then(|mut log| writeln!(log, "{}", key));
}
/// Read the `returned-<session_id>` log from `dir` and return its non-empty
/// lines in file order.
///
/// A missing or unreadable log yields an empty vector.
fn load_returned(dir: &Path, session_id: &str) -> Vec<String> {
    let log_path = dir.join(format!("returned-{}", session_id));
    match fs::read_to_string(&log_path) {
        Ok(contents) => contents
            .lines()
            .filter(|entry| !entry.is_empty())
            .map(String::from)
            .collect(),
        Err(_) => Vec::new(),
    }
}
/// Print a summary of the state recorded for the most recent session.
///
/// The session id is taken from the stashed hook input at `STASH_PATH`.
/// Prints the session id, its cookie (if one exists), every key logged as
/// returned by search, and the size of the seen set. Problems reading or
/// parsing the stash are reported on stderr and the function returns early.
fn show_seen() {
    let state_dir = PathBuf::from("/tmp/claude-memory-search");

    // Read stashed input for session_id
    let input = match fs::read_to_string(STASH_PATH) {
        Ok(s) => s,
        Err(_) => {
            eprintln!("No stashed input at {}", STASH_PATH);
            return;
        }
    };
    let json: serde_json::Value = match serde_json::from_str(&input) {
        Ok(v) => v,
        Err(_) => {
            eprintln!("Failed to parse stashed input");
            return;
        }
    };

    let session_id = json["session_id"].as_str().unwrap_or("");
    if session_id.is_empty() {
        eprintln!("No session_id in stashed input");
        return;
    }
    println!("Session: {}", session_id);

    let cookie_path = state_dir.join(format!("cookie-{}", session_id));
    if let Ok(cookie) = fs::read_to_string(&cookie_path) {
        println!("Cookie: {}", cookie.trim());
    }

    let returned = load_returned(&state_dir, session_id);
    if !returned.is_empty() {
        println!("\nReturned by search ({}):", returned.len());
        for key in &returned {
            println!(" {}", key);
        }
    }

    let seen = load_seen(&state_dir, session_id);
    // The returned log is append-only, while the seen file can be deleted and
    // rebuilt during a session, so `returned` may be longer than `seen`.
    // Saturate instead of underflowing (which panics in debug builds and
    // wraps to a huge number in release builds).
    let pre_seeded = seen.len().saturating_sub(returned.len());
    println!("\nSeen set ({} total, {} pre-seeded):", seen.len(), pre_seeded);
}
fn cleanup_stale_files(dir: &Path, max_age: Duration) {
let entries = match fs::read_dir(dir) {
Ok(e) => e,