split into workspace: poc-memory and poc-daemon subcrates

poc-daemon (notification routing, idle timer, IRC, Telegram) was already fully self-contained with no imports from the poc-memory library. Now it's a proper separate crate with its own Cargo.toml and capnp schema. poc-memory retains the store, graph, search, neuro, knowledge, and the jobkit-based memory maintenance daemon (daemon.rs). Co-Authored-By: ProofOfConcept <poc@bcachefs.org>
2026-03-08 20:42:40 -04:00 · 2026-03-08 20:42:40 -04:00 · fc48ac7c7f
commit fc48ac7c7f
parent 488fd5a0aa
53 changed files with 108 additions and 76 deletions
--- a/poc-memory/src/similarity.rs
+++ b/poc-memory/src/similarity.rs
@ -0,0 +1,135 @@
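+// Text similarity: Porter-style suffix stripping + cosine similarity
+// over raw term frequencies (no IDF weighting or length normalization,
+// so this is not BM25).
+//
+// Used for interference detection (similar content, different communities)
+// and schema fit scoring. Intentionally simple — ~100 lines, no
+// external dependencies.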
+
+use std::collections::HashMap;
+
+/// Reduce `word` to a crude stem by lowercasing it and trimming common
+/// English suffixes.
+///
+/// This is a Porter-inspired approximation, not the full algorithm: the
+/// only goal is that related word forms usually end up as the same token.
+/// Words of three bytes or fewer (after lowercasing) are returned as-is.
+pub fn stem(word: &str) -> String {
+    // (suffix, replacement) rules. Each rule is tried exactly once, in this
+    // order, against the output of the previous rule, so several rules may
+    // fire on one word and reordering the table changes the results.
+    const RULES: &[(&str, &str)] = &[
+        ("ation", "ate"),
+        ("ness", ""),
+        ("ment", ""),
+        ("ting", "t"),
+        ("ling", "l"),
+        ("ring", "r"),
+        ("ning", "n"),
+        ("ding", "d"),
+        ("ping", "p"),
+        ("ging", "g"),
+        ("ying", "y"),
+        ("ied", "y"),
+        ("ies", "y"),
+        ("ing", ""),
+        ("ed", ""),
+        ("ly", ""),
+        ("er", ""),
+        ("al", ""),
+        ("s", ""),
+    ];
+
+    let lowered = word.to_lowercase();
+    if lowered.len() <= 3 {
+        return lowered;
+    }
+
+    RULES
+        .iter()
+        .fold(lowered, |current, &(suffix, replacement)| {
+            strip_suffix(&current, suffix, replacement)
+        })
+}
+
+/// Swap a trailing `suffix` of `word` for `replacement`.
+///
+/// The rule only applies when more than two bytes of `word` would remain
+/// in front of the suffix. Otherwise, or when `word` does not end with
+/// `suffix`, the word is returned unchanged.
+fn strip_suffix(word: &str, suffix: &str, replacement: &str) -> String {
+    match word.strip_suffix(suffix) {
+        Some(base) if base.len() > 2 => [base, replacement].concat(),
+        _ => word.to_string(),
+    }
+}
+
+/// Build a stemmed term-frequency table for `text`.
+///
+/// The text is split on every non-alphanumeric character. Pieces of two
+/// bytes or fewer are discarded, and each remaining piece is passed
+/// through [`stem`] before being counted.
+pub fn term_frequencies(text: &str) -> HashMap<String, u32> {
+    let mut counts: HashMap<String, u32> = HashMap::new();
+
+    let terms = text
+        .split(|ch: char| !ch.is_alphanumeric())
+        .filter(|piece| piece.len() > 2)
+        .map(stem);
+
+    for term in terms {
+        *counts.entry(term).or_insert(0) += 1;
+    }
+
+    counts
+}
+
+/// Cosine similarity between two documents using stemmed term frequencies.
+///
+/// Both documents are tokenized and stemmed with [`term_frequencies`] and
+/// their raw term counts are compared as sparse vectors.
+///
+/// Returns 0.0 for disjoint vocabularies and 1.0 (up to rounding) for
+/// identical content. Also returns 0.0 when either document yields no
+/// terms at all, including the case where both are empty.
+pub fn cosine_similarity(doc_a: &str, doc_b: &str) -> f32 {
+    tf_cosine(&term_frequencies(doc_a), &term_frequencies(doc_b))
+}
+
+/// Cosine of the angle between two term-frequency vectors.
+///
+/// Returns 0.0 when either vector is empty or has (near-)zero magnitude.
+fn tf_cosine(tf_a: &HashMap<String, u32>, tf_b: &HashMap<String, u32>) -> f32 {
+    if tf_a.is_empty() || tf_b.is_empty() {
+        return 0.0;
+    }
+
+    // Only terms present in both maps contribute to the dot product, so
+    // walk the smaller map and probe the larger one.
+    let (small, large) = if tf_a.len() <= tf_b.len() {
+        (tf_a, tf_b)
+    } else {
+        (tf_b, tf_a)
+    };
+    let dot: f64 = small
+        .iter()
+        .filter_map(|(term, &n)| large.get(term).map(|&m| f64::from(n) * f64::from(m)))
+        .sum();
+
+    let norm = |tf: &HashMap<String, u32>| -> f64 {
+        tf.values().map(|&n| f64::from(n).powi(2)).sum::<f64>().sqrt()
+    };
+    let mag_a = norm(tf_a);
+    let mag_b = norm(tf_b);
+
+    // Guard against division by zero for vectors whose counts are all zero.
+    if mag_a < 1e-10 || mag_b < 1e-10 {
+        return 0.0;
+    }
+
+    (dot / (mag_a * mag_b)) as f32
+}
+
+/// Compute pairwise similarity for a set of documents.
+///
+/// Every unordered pair of entries in `docs` is scored with the same
+/// measure as [`cosine_similarity`]. Pairs whose score is at or above
+/// `threshold` are returned as `(key_a, key_b, similarity)`, sorted by
+/// descending similarity.
+pub fn pairwise_similar(
+    docs: &[(String, String)],  // (key, content)
+    threshold: f32,
+) -> Vec<(String, String, f32)> {
+    // Tokenize and stem each document once up front. Doing it inside the
+    // pair loop would redo that work for every pair a document appears in.
+    let prepared: Vec<(&str, HashMap<String, u32>)> = docs
+        .iter()
+        .map(|(key, content)| (key.as_str(), term_frequencies(content)))
+        .collect();
+
+    let mut results = Vec::new();
+
+    for (i, (key_a, tf_a)) in prepared.iter().enumerate() {
+        for (key_b, tf_b) in &prepared[i + 1..] {
+            let sim = tf_cosine(tf_a, tf_b);
+            if sim >= threshold {
+                results.push(((*key_a).to_owned(), (*key_b).to_owned(), sim));
+            }
+        }
+    }
+
+    results.sort_by(|a, b| b.2.total_cmp(&a.2));
+    results
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Pins the current output of the stemmer for a few representative words.
+    #[test]
+    fn test_stem() {
+        assert_eq!(stem("running"), "runn"); // -ning → n
+        assert_eq!(stem("talking"), "talk"); // no consonant-specific rule applies; generic -ing
+        assert_eq!(stem("slowly"), "slow"); // -ly
+        // The stemmer is minimal — it doesn't need to be perfect,
+        // just consistent enough that related words collide.
+        // The plural -s is stripped last, after the -ation rule has already
+        // been tried against "observations", so -ation stays.
+        assert_eq!(stem("observations"), "observation");
+    }
+
+    /// A document compared with itself scores (approximately) 1.0.
+    #[test]
+    fn test_cosine_identical() {
+        let text = "the quick brown fox jumps over the lazy dog";
+        let sim = cosine_similarity(text, text);
+        assert!((sim - 1.0).abs() < 0.01, "identical docs should have sim ~1.0, got {}", sim);
+    }
+
+    /// Documents with no shared stems score near zero.
+    #[test]
+    fn test_cosine_different() {
+        let a = "kernel filesystem transaction restart handling";
+        let b = "cooking recipe chocolate cake baking temperature";
+        let sim = cosine_similarity(a, b);
+        assert!(sim < 0.1, "unrelated docs should have low sim, got {}", sim);
+    }
+
+    /// A document with no terms scores 0.0 against anything, itself included.
+    #[test]
+    fn test_cosine_empty() {
+        assert_eq!(cosine_similarity("", "kernel filesystem"), 0.0);
+        assert_eq!(cosine_similarity("", ""), 0.0);
+    }
+
+    /// Only pairs at or above the threshold are reported, keyed by document.
+    #[test]
+    fn test_pairwise_similar() {
+        let docs = vec![
+            ("a".to_string(), "kernel filesystem transaction restart".to_string()),
+            ("b".to_string(), "kernel filesystem transaction restart".to_string()),
+            ("c".to_string(), "cooking recipe chocolate cake".to_string()),
+        ];
+        let pairs = pairwise_similar(&docs, 0.5);
+        assert_eq!(pairs.len(), 1);
+        assert_eq!(pairs[0].0, "a");
+        assert_eq!(pairs[0].1, "b");
+        assert!((pairs[0].2 - 1.0).abs() < 0.01, "expected sim ~1.0, got {}", pairs[0].2);
+    }
+}