poc-memory v0.4.0: graph-structured memory with consolidation pipeline

Rust core:
- Cap'n Proto append-only storage (nodes + relations)
- Graph algorithms: clustering coefficient, community detection,
  schema fit, small-world metrics, interference detection
- BM25 text similarity with Porter stemming
- Spaced repetition replay queue
- Commands: search, init, health, status, graph, categorize,
  link-add, link-impact, decay, consolidate-session, etc.

Python scripts:
- Episodic digest pipeline: daily/weekly/monthly-digest.py
- retroactive-digest.py for backfilling
- consolidation-agents.py: 3 parallel Sonnet agents
- apply-consolidation.py: structured action extraction + apply
- digest-link-parser.py: extract ~400 explicit links from digests
- content-promotion-agent.py: promote episodic obs to semantic files
- bulk-categorize.py: categorize all nodes via single Sonnet call
- consolidation-loop.py: multi-round automated consolidation

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-02-28 22:17:00 -05:00
commit 23fac4e5fe
35 changed files with 9388 additions and 0 deletions

135
src/similarity.rs Normal file
View file

@ -0,0 +1,135 @@
// Text similarity: Porter-style suffix stemming + cosine similarity over
// term frequencies. (No BM25 weighting is implemented in this file.)
//
// Used for interference detection (similar content, different communities)
// and schema fit scoring. Intentionally simple — ~100 lines, no
// external dependencies.
use std::collections::HashMap;
/// Minimal Porter stemmer — handles the most common English suffixes.
/// Not linguistically complete but good enough for similarity matching.
pub fn stem(word: &str) -> String {
let w = word.to_lowercase();
if w.len() <= 3 { return w; }
let w = strip_suffix(&w, "ation", "ate");
let w = strip_suffix(&w, "ness", "");
let w = strip_suffix(&w, "ment", "");
let w = strip_suffix(&w, "ting", "t");
let w = strip_suffix(&w, "ling", "l");
let w = strip_suffix(&w, "ring", "r");
let w = strip_suffix(&w, "ning", "n");
let w = strip_suffix(&w, "ding", "d");
let w = strip_suffix(&w, "ping", "p");
let w = strip_suffix(&w, "ging", "g");
let w = strip_suffix(&w, "ying", "y");
let w = strip_suffix(&w, "ied", "y");
let w = strip_suffix(&w, "ies", "y");
let w = strip_suffix(&w, "ing", "");
let w = strip_suffix(&w, "ed", "");
let w = strip_suffix(&w, "ly", "");
let w = strip_suffix(&w, "er", "");
let w = strip_suffix(&w, "al", "");
strip_suffix(&w, "s", "")
}
/// Replace a trailing `suffix` of `word` with `replacement`.
///
/// The rewrite only happens when `word` ends with `suffix` and is more than
/// two bytes longer than it, so very short bases are left alone. In every
/// other case an owned copy of `word` is returned unchanged.
fn strip_suffix(word: &str, suffix: &str, replacement: &str) -> String {
    let long_enough = word.len() > suffix.len() + 2;
    match word.strip_suffix(suffix) {
        Some(base) if long_enough => [base, replacement].concat(),
        _ => word.to_owned(),
    }
}
/// Tokenize and stem a text into a term frequency map
pub fn term_frequencies(text: &str) -> HashMap<String, u32> {
let mut tf = HashMap::new();
for word in text.split(|c: char| !c.is_alphanumeric()) {
if word.len() > 2 {
let stemmed = stem(word);
*tf.entry(stemmed).or_default() += 1;
}
}
tf
}
/// Cosine similarity between two documents using stemmed term frequencies.
/// Returns 0.0 for disjoint vocabularies, 1.0 for identical content.
pub fn cosine_similarity(doc_a: &str, doc_b: &str) -> f32 {
let tf_a = term_frequencies(doc_a);
let tf_b = term_frequencies(doc_b);
if tf_a.is_empty() || tf_b.is_empty() {
return 0.0;
}
// Dot product
let mut dot = 0.0f64;
for (term, &freq_a) in &tf_a {
if let Some(&freq_b) = tf_b.get(term) {
dot += freq_a as f64 * freq_b as f64;
}
}
// Magnitudes
let mag_a: f64 = tf_a.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
let mag_b: f64 = tf_b.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
if mag_a < 1e-10 || mag_b < 1e-10 {
return 0.0;
}
(dot / (mag_a * mag_b)) as f32
}
/// Compute pairwise similarity for a set of documents.
/// Returns pairs with similarity above threshold.
pub fn pairwise_similar(
docs: &[(String, String)], // (key, content)
threshold: f32,
) -> Vec<(String, String, f32)> {
let mut results = Vec::new();
for i in 0..docs.len() {
for j in (i + 1)..docs.len() {
let sim = cosine_similarity(&docs[i].1, &docs[j].1);
if sim >= threshold {
results.push((docs[i].0.clone(), docs[j].0.clone(), sim));
}
}
}
results.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap());
results
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_stem() {
        assert_eq!(stem("running"), "runn"); // -ning → n
        assert_eq!(stem("talking"), "talk"); // no -king rule; the generic -ing rule applies
        assert_eq!(stem("slowly"), "slow"); // -ly
        // Words of three bytes or fewer are only lowercased.
        assert_eq!(stem("The"), "the");
        assert_eq!(stem("Cats"), "cat"); // lowercased, then plural -s
        // The stemmer is minimal — it doesn't need to be perfect,
        // just consistent enough that related words collide.
        // Rules run once each in a fixed order with plural -s last: the -ation
        // rule is tried while the word still ends in "s", so it does not match,
        // and only the trailing -s is removed.
        assert_eq!(stem("observations"), "observation");
    }

    #[test]
    fn test_cosine_identical() {
        let text = "the quick brown fox jumps over the lazy dog";
        let sim = cosine_similarity(text, text);
        assert!((sim - 1.0).abs() < 0.01, "identical docs should have sim ~1.0, got {}", sim);
    }

    #[test]
    fn test_cosine_different() {
        let a = "kernel filesystem transaction restart handling";
        let b = "cooking recipe chocolate cake baking temperature";
        let sim = cosine_similarity(a, b);
        assert!(sim < 0.1, "unrelated docs should have low sim, got {}", sim);
    }

    #[test]
    fn test_cosine_empty() {
        // No tokens at all.
        assert_eq!(cosine_similarity("", "kernel filesystem"), 0.0);
        // Only tokens of two bytes or fewer, which the tokenizer drops.
        assert_eq!(cosine_similarity("a an of", "kernel filesystem"), 0.0);
    }

    #[test]
    fn test_pairwise_similar_filters_and_sorts() {
        let docs = vec![
            ("a".to_string(), "kernel filesystem transaction restart".to_string()),
            ("b".to_string(), "kernel filesystem transaction restart".to_string()),
            ("c".to_string(), "chocolate cake recipe".to_string()),
            ("d".to_string(), "kernel filesystem cooking recipe".to_string()),
        ];
        let pairs = pairwise_similar(&docs, 0.4);

        // a/b are identical (1.0); a/d and b/d share half their terms (0.5);
        // every pair involving c falls below the threshold.
        assert_eq!(pairs.len(), 3);
        assert_eq!((pairs[0].0.as_str(), pairs[0].1.as_str()), ("a", "b"));
        assert!(pairs.iter().all(|p| p.2 >= 0.4));
        assert!(pairs.windows(2).all(|w| w[0].2 >= w[1].2), "results must be sorted descending");

        assert!(pairwise_similar(&[], 0.5).is_empty());
    }
}