// Text similarity: lightweight suffix stemming + term-frequency cosine similarity
//
// Used for interference detection (similar content, different communities)
// and schema fit scoring. Intentionally simple — ~100 lines, no
// external dependencies.

use std::collections::HashMap;

/// Ordered `(suffix, replacement)` rewrite rules applied by [`stem`].
///
/// Order matters: the specific consonant + "ing" rules must run before the
/// generic "ing" rule, and "ies" must run before the bare "s" rule.
/// Every replacement is no longer than its suffix, so applying a rule never
/// grows the word.
const SUFFIX_RULES: &[(&str, &str)] = &[
    ("ation", "ate"),
    ("ness", ""),
    ("ment", ""),
    ("ting", "t"),
    ("ling", "l"),
    ("ring", "r"),
    ("ning", "n"),
    ("ding", "d"),
    ("ping", "p"),
    ("ging", "g"),
    ("ying", "y"),
    ("ied", "y"),
    ("ies", "y"),
    ("ing", ""),
    ("ed", ""),
    ("ly", ""),
    ("er", ""),
    ("al", ""),
    ("s", ""),
];

/// Minimal Porter-style stemmer — handles the most common English suffixes.
/// Not linguistically complete but good enough for similarity matching.
/// Single allocation: works on one String buffer throughout.
///
/// The word is lowercased first; words of three bytes or fewer are returned
/// as-is. Otherwise every rule in [`SUFFIX_RULES`] is tried once, in order,
/// against the current state of the buffer, so several rules may fire on the
/// same word.
///
/// Known limitation: because the plural "s" rule runs last, a plural does not
/// get the treatment of its singular ("observations" → "observation", while
/// "observation" → "observate").
///
/// If this is still a hot spot, replace the sequential suffix checks
/// with a reversed-suffix trie: single pass from the end of the word
/// matches the longest applicable suffix in O(suffix_len) instead of
/// O(n_rules).
pub fn stem(word: &str) -> String {
    let mut w = word.to_lowercase();
    if w.len() <= 3 {
        return w;
    }

    for &(suffix, replacement) in SUFFIX_RULES {
        strip_suffix_inplace(&mut w, suffix, replacement);
    }

    w
}

/// Replace a trailing `suffix` of `word` with `replacement`, in place.
///
/// The rewrite only happens when `word` ends with `suffix` and is more than
/// two bytes longer than it, so very short stems are left alone.
fn strip_suffix_inplace(word: &mut String, suffix: &str, replacement: &str) {
    if word.len() > suffix.len() + 2 && word.ends_with(suffix) {
        // `ends_with` matched, so this offset is a valid char boundary.
        word.truncate(word.len() - suffix.len());
        word.push_str(replacement);
    }
}

/// Tokenize and stem a text into a term frequency map.
///
/// Tokens are maximal runs of alphanumeric characters; tokens of two bytes or
/// fewer are dropped. Keys are the stemmed (lowercased) tokens, values are
/// occurrence counts.
pub fn term_frequencies(text: &str) -> HashMap<String, u32> {
    let mut tf: HashMap<String, u32> = HashMap::new();
    for word in text.split(|c: char| !c.is_alphanumeric()) {
        if word.len() > 2 {
            *tf.entry(stem(word)).or_default() += 1;
        }
    }
    tf
}

/// A document reduced to its stemmed term counts plus the Euclidean norm of
/// that count vector, so both can be computed once and reused across
/// comparisons.
struct TermVector {
    tf: HashMap<String, u32>,
    norm: f64,
}

impl TermVector {
    /// Tokenize, stem and count `text`, and cache the vector's norm.
    fn new(text: &str) -> Self {
        let tf = term_frequencies(text);
        let norm = tf
            .values()
            .map(|&f| f64::from(f).powi(2))
            .sum::<f64>()
            .sqrt();
        Self { tf, norm }
    }

    /// Cosine of the angle between two term vectors; 0.0 if either is empty.
    fn cosine(&self, other: &Self) -> f32 {
        if self.tf.is_empty() || other.tf.is_empty() {
            return 0.0;
        }
        if self.norm < 1e-10 || other.norm < 1e-10 {
            return 0.0;
        }

        // Only shared terms contribute, so walk the smaller map and probe
        // the larger one.
        let (small, large) = if self.tf.len() <= other.tf.len() {
            (&self.tf, &other.tf)
        } else {
            (&other.tf, &self.tf)
        };
        let dot: f64 = small
            .iter()
            .filter_map(|(term, &fa)| large.get(term).map(|&fb| f64::from(fa) * f64::from(fb)))
            .sum();

        (dot / (self.norm * other.norm)) as f32
    }
}

/// Cosine similarity between two documents using stemmed term frequencies.
/// Returns 0.0 for disjoint vocabularies, 1.0 for identical content.
/// Also returns 0.0 when either document yields no terms.
pub fn cosine_similarity(doc_a: &str, doc_b: &str) -> f32 {
    TermVector::new(doc_a).cosine(&TermVector::new(doc_b))
}

/// Compute pairwise similarity for a set of documents.
///
/// `docs` is a slice of `(key, content)` pairs. Returns
/// `(key_a, key_b, similarity)` for every unordered pair whose similarity is
/// at least `threshold`, sorted by descending similarity (ties keep input
/// order).
///
/// Each document is tokenized and stemmed exactly once; only the pairwise
/// dot products are quadratic in the number of documents.
pub fn pairwise_similar(
    docs: &[(String, String)], // (key, content)
    threshold: f32,
) -> Vec<(String, String, f32)> {
    let entries: Vec<(&str, TermVector)> = docs
        .iter()
        .map(|(key, content)| (key.as_str(), TermVector::new(content)))
        .collect();

    let mut results = Vec::new();
    for (i, (key_a, vec_a)) in entries.iter().enumerate() {
        for (key_b, vec_b) in &entries[i + 1..] {
            let sim = vec_a.cosine(vec_b);
            if sim >= threshold {
                results.push(((*key_a).to_owned(), (*key_b).to_owned(), sim));
            }
        }
    }

    results.sort_by(|a, b| b.2.total_cmp(&a.2));
    results
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_stem() {
        assert_eq!(stem("running"), "runn"); // -ning → n
        assert_eq!(stem("talking"), "talk"); // no consonant rule matches; generic -ing
        assert_eq!(stem("slowly"), "slow"); // -ly
        // The stemmer is minimal — it doesn't need to be perfect,
        // just consistent enough that related words collide.
        // -s is stripped by the last rule; the -ation rule has already been
        // tried (and did not match "…ations") by then, so -ation stays.
        assert_eq!(stem("observations"), "observation");
    }

    #[test]
    fn test_cosine_identical() {
        let text = "the quick brown fox jumps over the lazy dog";
        let sim = cosine_similarity(text, text);
        assert!(
            (sim - 1.0).abs() < 0.01,
            "identical docs should have sim ~1.0, got {}",
            sim
        );
    }

    #[test]
    fn test_cosine_different() {
        let a = "kernel filesystem transaction restart handling";
        let b = "cooking recipe chocolate cake baking temperature";
        let sim = cosine_similarity(a, b);
        assert!(sim < 0.1, "unrelated docs should have low sim, got {}", sim);
    }

    #[test]
    fn test_pairwise_matches_cosine() {
        let docs = vec![
            ("a".to_string(), "alpha beta".to_string()),
            ("b".to_string(), "alpha gamma".to_string()),
            ("c".to_string(), "delta epsilon".to_string()),
        ];
        let pairs = pairwise_similar(&docs, 0.1);
        assert_eq!(pairs.len(), 1);
        assert_eq!((pairs[0].0.as_str(), pairs[0].1.as_str()), ("a", "b"));
        assert_eq!(pairs[0].2, cosine_similarity(&docs[0].1, &docs[1].1));
    }
}