- Agent identity injection: prepend core-personality to all agent prompts so agents dream as me, not as generic graph workers. Include instructions to walk the graph and connect new nodes to core concepts. - Parallel agent scheduling: sequential within type, parallel across types. Different agent types (linker, organize, replay) run concurrently. - Linker prompt: graph walking instead of keyword search for connections. "Explore the local topology and walk the graph until you find the best connections." - memory-search fixes: format_results no longer truncates to 5 results, pipeline default raised to 50, returned file cleared on compaction, --seen and --seen-full merged, compaction timestamp in --seen output, max_entries=3 per prompt for steady memory drip. - Stemmer optimization: strip_suffix now works in-place on a single String buffer instead of allocating 18 new Strings per word. Note for future: reversed-suffix trie for O(suffix_len) instead of O(n_rules). - Transcript: add compaction_timestamp() for --seen display. - Agent budget configurable (default 4000 from config). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
140 lines
4.7 KiB
Rust
140 lines
4.7 KiB
Rust
// Text similarity: Porter stemming + BM25
|
|
//
|
|
// Used for interference detection (similar content, different communities)
|
|
// and schema fit scoring. Intentionally simple — ~100 lines, no
|
|
// external dependencies.
|
|
|
|
use std::collections::HashMap;
|
|
|
|
/// Minimal Porter stemmer — handles the most common English suffixes.
/// Not linguistically complete but good enough for similarity matching.
/// Single allocation: works on one String buffer throughout.
///
/// If this is still a hot spot, replace the sequential suffix checks
/// with a reversed-suffix trie: single pass from the end of the word
/// matches the longest applicable suffix in O(suffix_len) instead of
/// O(n_rules).
pub fn stem(word: &str) -> String {
    let mut buf = word.to_lowercase();
    if buf.len() <= 3 {
        return buf;
    }

    // (suffix, replacement) rewrite rules, tried in this exact order in a
    // single pass. Every rule that matches fires, so an early rewrite can
    // expose a later suffix (and an early rule shadows a later one — e.g.
    // "-ation" is checked long before the bare "-s" rule).
    const RULES: [(&str, &str); 19] = [
        ("ation", "ate"),
        ("ness", ""),
        ("ment", ""),
        ("ting", "t"),
        ("ling", "l"),
        ("ring", "r"),
        ("ning", "n"),
        ("ding", "d"),
        ("ping", "p"),
        ("ging", "g"),
        ("ying", "y"),
        ("ied", "y"),
        ("ies", "y"),
        ("ing", ""),
        ("ed", ""),
        ("ly", ""),
        ("er", ""),
        ("al", ""),
        ("s", ""),
    ];

    for &(suffix, replacement) in RULES.iter() {
        strip_suffix_inplace(&mut buf, suffix, replacement);
    }
    buf
}

/// Rewrite `word` in place: when it ends with `suffix` AND more than two
/// bytes of stem would remain, chop the suffix off and append `replacement`.
/// No-op (and no allocation beyond possible capacity growth) otherwise.
fn strip_suffix_inplace(word: &mut String, suffix: &str, replacement: &str) {
    // The length guard rejects words whose stem would drop to <= 2 bytes,
    // so e.g. "sing" survives the "-ing" rule untouched.
    let stem_is_long_enough = word.len() > suffix.len() + 2;
    if stem_is_long_enough && word.ends_with(suffix) {
        let keep = word.len() - suffix.len();
        word.truncate(keep);
        word.push_str(replacement);
    }
}
|
|
|
|
/// Tokenize and stem a text into a term frequency map
|
|
pub fn term_frequencies(text: &str) -> HashMap<String, u32> {
|
|
let mut tf = HashMap::new();
|
|
for word in text.split(|c: char| !c.is_alphanumeric()) {
|
|
if word.len() > 2 {
|
|
let stemmed = stem(word);
|
|
*tf.entry(stemmed).or_default() += 1;
|
|
}
|
|
}
|
|
tf
|
|
}
|
|
|
|
/// Cosine similarity between two documents using stemmed term frequencies.
|
|
/// Returns 0.0 for disjoint vocabularies, 1.0 for identical content.
|
|
pub fn cosine_similarity(doc_a: &str, doc_b: &str) -> f32 {
|
|
let tf_a = term_frequencies(doc_a);
|
|
let tf_b = term_frequencies(doc_b);
|
|
|
|
if tf_a.is_empty() || tf_b.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
// Dot product
|
|
let mut dot = 0.0f64;
|
|
for (term, &freq_a) in &tf_a {
|
|
if let Some(&freq_b) = tf_b.get(term) {
|
|
dot += freq_a as f64 * freq_b as f64;
|
|
}
|
|
}
|
|
|
|
// Magnitudes
|
|
let mag_a: f64 = tf_a.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
|
|
let mag_b: f64 = tf_b.values().map(|&f| (f as f64).powi(2)).sum::<f64>().sqrt();
|
|
|
|
if mag_a < 1e-10 || mag_b < 1e-10 {
|
|
return 0.0;
|
|
}
|
|
|
|
(dot / (mag_a * mag_b)) as f32
|
|
}
|
|
|
|
/// Compute pairwise similarity for a set of documents.
|
|
/// Returns pairs with similarity above threshold.
|
|
pub fn pairwise_similar(
|
|
docs: &[(String, String)], // (key, content)
|
|
threshold: f32,
|
|
) -> Vec<(String, String, f32)> {
|
|
let mut results = Vec::new();
|
|
|
|
for i in 0..docs.len() {
|
|
for j in (i + 1)..docs.len() {
|
|
let sim = cosine_similarity(&docs[i].1, &docs[j].1);
|
|
if sim >= threshold {
|
|
results.push((docs[i].0.clone(), docs[j].0.clone(), sim));
|
|
}
|
|
}
|
|
}
|
|
|
|
results.sort_by(|a, b| b.2.total_cmp(&a.2));
|
|
results
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // Spot-checks for the minimal stemmer: each assertion pins the rule
    // (or the rule ordering) that produces the output.
    #[test]
    fn test_stem() {
        assert_eq!(stem("running"), "runn"); // -ning → n
        assert_eq!(stem("talking"), "talk"); // not matched by specific consonant rules
        assert_eq!(stem("slowly"), "slow"); // -ly
        // The stemmer is minimal — it doesn't need to be perfect,
        // just consistent enough that related words collide.
        assert_eq!(stem("observations"), "observation"); // -s stripped; -ation survives because rules run in one pass and -ation was checked before -s
    }

    // A document compared with itself must score ~1.0.
    #[test]
    fn test_cosine_identical() {
        let text = "the quick brown fox jumps over the lazy dog";
        let sim = cosine_similarity(text, text);
        assert!((sim - 1.0).abs() < 0.01, "identical docs should have sim ~1.0, got {}", sim);
    }

    // Documents with disjoint vocabularies must score near 0.0.
    #[test]
    fn test_cosine_different() {
        let a = "kernel filesystem transaction restart handling";
        let b = "cooking recipe chocolate cake baking temperature";
        let sim = cosine_similarity(a, b);
        assert!(sim < 0.1, "unrelated docs should have low sim, got {}", sim);
    }
}
|