extract shared transcript parser and similarity matching helpers

- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine, and knowledge (was 3 separate implementations, ~150 lines duplicated) - New best_match() and section_children() helpers in neuro/rewrite.rs (was duplicated find-best-by-similarity loop + section collection) - Net -153 lines
2026-03-08 21:42:53 -04:00 · 2026-03-08 21:42:53 -04:00 · 92f3ba5acf
commit 92f3ba5acf
parent 7c491e92eb
6 changed files with 166 additions and 225 deletions
--- a/poc-memory/src/agents/enrich.rs
+++ b/poc-memory/src/agents/enrich.rs
@ -72,53 +72,11 @@ pub fn is_transcript_mined_with_keys(mined: &HashSet<String>, path: &str) -> boo
 /// Extract user/assistant messages with line numbers from a JSONL transcript.
 /// (line_number, role, text, timestamp)
 pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
-    let content = fs::read_to_string(jsonl_path)
+    let path = std::path::Path::new(jsonl_path);
-        .map_err(|e| format!("read {}: {}", jsonl_path, e))?;
+    let messages = super::transcript::parse_transcript(path)?;
-
+    Ok(messages.into_iter()
-    let mut messages = Vec::new();
+        .map(|m| (m.line, m.role, m.text, m.timestamp))
-    for (i, line) in content.lines().enumerate() {
+        .collect())
        let obj: serde_json::Value = match serde_json::from_str(line) {
            Ok(v) => v,
            Err(_) => continue,
        };
        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if msg_type != "user" && msg_type != "assistant" { continue; }
        let timestamp = obj.get("timestamp")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let msg = obj.get("message").unwrap_or(&obj);
        let content = msg.get("content");
        let text = match content {
            Some(serde_json::Value::String(s)) => s.clone(),
            Some(serde_json::Value::Array(arr)) => {
                arr.iter()
                    .filter_map(|c| {
                        // Only extract text blocks; skip tool_use, tool_result, thinking, etc.
                        let is_text = c.get("type").and_then(|v| v.as_str()) == Some("text");
                        if is_text {
                            c.get("text").and_then(|v| v.as_str()).map(|s| s.to_string())
                        } else {
                            c.as_str().map(|s| s.to_string())
                        }
                    })
                    .collect::<Vec<_>>()
                    .join("\n")
            }
            _ => continue,
        };
        let text = text.trim().to_string();
        if text.is_empty() { continue; }
        messages.push((i + 1, msg_type.to_string(), text, timestamp));
    }
    Ok(messages)
 }
 pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
--- a/poc-memory/src/agents/fact_mine.rs
+++ b/poc-memory/src/agents/fact_mine.rs
@ -7,11 +7,11 @@
 use crate::config;
 use super::llm;
 use super::transcript;
 use crate::store::{self, Provenance};
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
 use std::fs;
 use std::path::Path;
 const CHARS_PER_TOKEN: usize = 4;
@ -75,81 +75,27 @@ pub struct Fact {
    pub source_offset: Option<usize>,
 }
 struct Message {
    role: String,
    text: String,
    timestamp: String,
 }
 /// Extract user/assistant text messages from a JSONL transcript.
-fn extract_conversation(path: &Path) -> Vec<Message> {
+fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
-    let cfg = config::get();
+    transcript::parse_transcript(path)
-    let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
+        .unwrap_or_default()
-    let mut messages = Vec::new();
+        .into_iter()
-
+        .filter(|m| m.text.len() >= 20)
-    for line in content.lines() {
+        .collect()
        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if msg_type != "user" && msg_type != "assistant" {
            continue;
        }
        let timestamp = obj.get("timestamp")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let msg = obj.get("message").unwrap_or(&obj);
        let content = msg.get("content");
        let text = match content {
            Some(serde_json::Value::String(s)) => s.clone(),
            Some(serde_json::Value::Array(arr)) => {
                let texts: Vec<&str> = arr.iter()
                    .filter_map(|block| {
                        let obj = block.as_object()?;
                        if obj.get("type")?.as_str()? != "text" {
                            return None;
                        }
                        let t = obj.get("text")?.as_str()?;
                        if t.contains("<system-reminder>") {
                            return None;
                        }
                        Some(t)
                    })
                    .collect();
                texts.join("\n")
            }
            _ => continue,
        };
        let text = text.trim().to_string();
        if text.len() < 20 {
            continue;
        }
        let role = if msg_type == "user" {
            cfg.user_name.clone()
        } else {
            cfg.assistant_name.clone()
        };
        messages.push(Message { role, text, timestamp });
    }
    messages
 }
 /// Format messages into a single text for chunking.
-fn format_for_extraction(messages: &[Message]) -> String {
+fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
    let cfg = config::get();
    messages.iter()
        .map(|msg| {
            let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
            let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
            let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
            if ts.is_empty() {
-                format!("[{}] {}", msg.role, text)
+                format!("[{}] {}", role, text)
            } else {
-                format!("[{} {}] {}", msg.role, ts, text)
+                format!("[{} {}] {}", role, ts, text)
            }
        })
        .collect::<Vec<_>>()
@ -224,7 +170,7 @@ pub fn mine_transcript(
    log(&format!("Mining: {}", filename));
-    let messages = extract_conversation(path);
+    let messages = extract_messages(path);
    if messages.is_empty() {
        log("No messages found");
        return Ok(Vec::new());
@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
    let mut all_facts = Vec::new();
    for path in paths {
-        let messages = extract_conversation(path);
+        let messages = extract_messages(path);
        if messages.len() < min_messages {
            eprintln!("Skipping {} ({} messages < {})",
                path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
--- a/poc-memory/src/agents/knowledge.rs
+++ b/poc-memory/src/agents/knowledge.rs
@ -333,84 +333,41 @@ fn get_graph_topology(store: &Store, graph: &Graph) -> String {
 }
 /// Strip <system-reminder> blocks from text
 fn strip_system_tags(text: &str) -> String {
    let re = Regex::new(r"(?s)<system-reminder>.*?</system-reminder>").unwrap();
    re.replace_all(text, "").trim().to_string()
 }
 /// Extract human-readable dialogue from a conversation JSONL
 fn extract_conversation_text(path: &Path, max_chars: usize) -> String {
-    let Ok(content) = fs::read_to_string(path) else { return String::new() };
+    let cfg = crate::config::get();
    let messages = super::transcript::parse_transcript(path).unwrap_or_default();
    let mut fragments = Vec::new();
    let mut total = 0;
-    for line in content.lines() {
+    for msg in &messages {
-        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
+        let min_len = if msg.role == "user" { 5 } else { 10 };
        if msg.text.len() <= min_len { continue; }
-        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
+        // Only include external user messages
-
+        if msg.role == "user" {
-        if msg_type == "user" && obj.get("userType").and_then(|v| v.as_str()) == Some("external") {
+            if msg.user_type.as_deref() != Some("external") { continue; }
-            if let Some(text) = extract_text_content(&obj) {
+            if msg.text.starts_with("[Request interrupted") { continue; }
                let text = strip_system_tags(&text);
                if text.starts_with("[Request interrupted") { continue; }
                if text.len() > 5 {
                    fragments.push(format!("**{}:** {}", crate::config::get().user_name, text));
                    total += text.len();
                }
            }
        } else if msg_type == "assistant" {
            if let Some(text) = extract_text_content(&obj) {
                let text = strip_system_tags(&text);
                if text.len() > 10 {
                    fragments.push(format!("**{}:** {}", crate::config::get().assistant_name, text));
                    total += text.len();
                }
            }
        }
        let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
        fragments.push(format!("**{}:** {}", role, msg.text));
        total += msg.text.len();
        if total > max_chars { break; }
    }
    fragments.join("\n\n")
 }
 fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
    let msg = obj.get("message")?;
    let content = msg.get("content")?;
    if let Some(s) = content.as_str() {
        return Some(s.to_string());
    }
    if let Some(arr) = content.as_array() {
        let texts: Vec<&str> = arr.iter()
            .filter_map(|b| {
                if b.get("type")?.as_str()? == "text" {
                    b.get("text")?.as_str()
                } else {
                    None
                }
            })
            .collect();
        if !texts.is_empty() {
            return Some(texts.join("\n"));
        }
    }
    None
 }
 /// Count short user messages (dialogue turns) in a JSONL
 fn count_dialogue_turns(path: &Path) -> usize {
-    let Ok(content) = fs::read_to_string(path) else { return 0 };
+    let messages = super::transcript::parse_transcript(path).unwrap_or_default();
-    content.lines()
+    messages.iter()
-        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
+        .filter(|m| m.role == "user"
-        .filter(|obj| {
+            && m.user_type.as_deref() == Some("external")
-            obj.get("type").and_then(|v| v.as_str()) == Some("user")
+            && m.text.len() > 5
-                && obj.get("userType").and_then(|v| v.as_str()) == Some("external")
+            && m.text.len() < 500
-        })
+            && !m.text.starts_with("[Request interrupted")
-        .filter(|obj| {
+            && !m.text.starts_with("Implement the following"))
            let text = extract_text_content(obj).unwrap_or_default();
            text.len() > 5 && text.len() < 500
                && !text.starts_with("[Request interrupted")
                && !text.starts_with("Implement the following")
        })
        .count()
 }
--- a/poc-memory/src/agents/mod.rs
+++ b/poc-memory/src/agents/mod.rs
@ -13,7 +13,9 @@
 //   fact_mine    — fact extraction from transcripts
 //   digest       — episodic digest generation (daily/weekly/monthly)
 //   daemon       — background job scheduler
 //   transcript   — shared JSONL transcript parsing
 pub mod transcript;
 pub mod llm;
 pub mod prompts;
 pub mod audit;
--- a/poc-memory/src/agents/transcript.rs
+++ b/poc-memory/src/agents/transcript.rs
@ -0,0 +1,94 @@
 // Shared JSONL transcript parsing
 //
 // Three agents (enrich, fact_mine, knowledge) all parse Claude Code JSONL
 // transcripts. This module provides the shared core: parse each line, extract
 // message type, text content from string-or-array blocks, timestamp, and
 // user type. Callers filter and transform as needed.
 use std::fs;
 use std::path::Path;
 /// A single message extracted from a JSONL transcript.
 pub struct TranscriptMessage {
    /// 1-based line number in the JSONL file.
    pub line: usize,
    /// Raw role: "user" or "assistant".
    pub role: String,
    /// Extracted text content (trimmed, blocks joined with newlines).
    pub text: String,
    /// ISO timestamp from the message, or empty string.
    pub timestamp: String,
    /// For user messages: "external", "internal", etc. None for assistant.
    pub user_type: Option<String>,
 }
 /// Parse a JSONL transcript into structured messages.
 ///
 /// Extracts all user and assistant messages. Content blocks of type "text"
 /// are joined; tool_use, tool_result, thinking blocks are skipped.
 /// System-reminder blocks are filtered out.
 pub fn parse_transcript(path: &Path) -> Result<Vec<TranscriptMessage>, String> {
    let content = fs::read_to_string(path)
        .map_err(|e| format!("read {}: {}", path.display(), e))?;
    let mut messages = Vec::new();
    for (i, line) in content.lines().enumerate() {
        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if msg_type != "user" && msg_type != "assistant" { continue; }
        let timestamp = obj.get("timestamp")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let user_type = obj.get("userType")
            .and_then(|v| v.as_str())
            .map(|s| s.to_string());
        let Some(text) = extract_text_content(&obj) else { continue };
        let text = text.trim().to_string();
        if text.is_empty() { continue; }
        messages.push(TranscriptMessage {
            line: i + 1,
            role: msg_type.to_string(),
            text,
            timestamp,
            user_type,
        });
    }
    Ok(messages)
 }
 /// Extract text content from a JSONL message object.
 ///
 /// Handles both string content and array-of-blocks content (filtering to
 /// type="text" blocks only). Strips `<system-reminder>` tags.
 fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
    let msg = obj.get("message").unwrap_or(obj);
    let content = msg.get("content")?;
    let text = match content {
        serde_json::Value::String(s) => s.clone(),
        serde_json::Value::Array(arr) => {
            let texts: Vec<&str> = arr.iter()
                .filter_map(|block| {
                    let block_type = block.get("type").and_then(|v| v.as_str())?;
                    if block_type != "text" { return None; }
                    let t = block.get("text").and_then(|v| v.as_str())?;
                    // Skip system-reminder blocks entirely
                    if t.contains("<system-reminder>") { return None; }
                    Some(t)
                })
                .collect();
            if texts.is_empty() { return None; }
            texts.join("\n")
        }
        _ => return None,
    };
    Some(text)
 }
--- a/poc-memory/src/neuro/rewrite.rs
+++ b/poc-memory/src/neuro/rewrite.rs
@ -5,6 +5,28 @@ use crate::store::{Store, new_relation};
 use crate::graph::Graph;
 use crate::similarity;
 /// Collect (key, content) pairs for all section children of a file-level node.
 fn section_children<'a>(store: &'a Store, file_key: &str) -> Vec<(&'a str, &'a str)> {
    let prefix = format!("{}#", file_key);
    store.nodes.iter()
        .filter(|(k, _)| k.starts_with(&prefix))
        .map(|(k, n)| (k.as_str(), n.content.as_str()))
        .collect()
 }
 /// Find the best matching candidate by cosine similarity against content.
 /// Returns (key, similarity) if any candidate exceeds threshold.
 fn best_match(candidates: &[(&str, &str)], content: &str, threshold: f32) -> Option<(String, f32)> {
    let (best_key, best_sim) = candidates.iter()
        .map(|(key, text)| (*key, similarity::cosine_similarity(content, text)))
        .max_by(|a, b| a.1.total_cmp(&b.1))?;
    if best_sim > threshold {
        Some((best_key.to_string(), best_sim))
    } else {
        None
    }
 }
 /// Refine a link target: if the target is a file-level node with section
 /// children, find the best-matching section by cosine similarity against
 /// the source content. Returns the original key if no sections exist or
@ -16,31 +38,13 @@ pub fn refine_target(store: &Store, source_content: &str, target_key: &str) -> S
    // Only refine file-level nodes (no # in key)
    if target_key.contains('#') { return target_key.to_string(); }
-    let prefix = format!("{}#", target_key);
+    let sections = section_children(store, target_key);
    let sections: Vec<(&str, &str)> = store.nodes.iter()
        .filter(|(k, _)| k.starts_with(&prefix))
        .map(|(k, n)| (k.as_str(), n.content.as_str()))
        .collect();
    if sections.is_empty() { return target_key.to_string(); }
-    let mut best_section = "";
+    best_match(&sections, source_content, 0.05)
-    let mut best_sim = 0.0f32;
+        .map(|(key, _)| key)
-
+        .unwrap_or_else(|| target_key.to_string())
    for (section_key, section_content) in &sections {
        let sim = similarity::cosine_similarity(source_content, section_content);
        if sim > best_sim {
            best_sim = sim;
            best_section = section_key;
        }
    }
    // Threshold: only refine if there's a meaningful match
    if best_sim > 0.05 && !best_section.is_empty() {
        best_section.to_string()
    } else {
        target_key.to_string()
    }
 }
 /// A proposed link move: from hub→neighbor to section→neighbor
@ -70,16 +74,12 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
    // Only works on file-level nodes that have section children
    if hub_key.contains('#') { return None; }
-    let prefix = format!("{}#", hub_key);
+    let sections = section_children(store, hub_key);
    let sections: Vec<(&str, &str)> = store.nodes.iter()
        .filter(|(k, _)| k.starts_with(&prefix))
        .map(|(k, n)| (k.as_str(), n.content.as_str()))
        .collect();
    if sections.is_empty() { return None; }
    // Get all neighbors of the hub
    let neighbors = graph.neighbors(hub_key);
    let prefix = format!("{}#", hub_key);
    let mut moves = Vec::new();
@ -93,19 +93,7 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
        };
        // Find best-matching section by content similarity
-        let mut best_section = "";
+        if let Some((best_section, best_sim)) = best_match(&sections, neighbor_content, 0.05) {
        let mut best_sim = 0.0f32;
        for (section_key, section_content) in &sections {
            let sim = similarity::cosine_similarity(neighbor_content, section_content);
            if sim > best_sim {
                best_sim = sim;
                best_section = section_key;
            }
        }
        // Only propose move if there's a reasonable match
        if best_sim > 0.05 && !best_section.is_empty() {
            let snippet = crate::util::first_n_chars(
                neighbor_content.lines()
                    .find(|l| !l.is_empty() && !l.starts_with("<!--") && !l.starts_with("##"))
@ -115,7 +103,7 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
            moves.push(LinkMove {
                neighbor_key: neighbor_key.to_string(),
                from_hub: hub_key.to_string(),
-                to_section: best_section.to_string(),
+                to_section: best_section,
                similarity: best_sim,
                neighbor_snippet: snippet,
            });
@ -188,11 +176,7 @@ pub fn find_differentiable_hubs(store: &Store) -> Vec<(String, usize, usize)> {
        if deg < threshold { continue; }
        if key.contains('#') { continue; }
-        let prefix = format!("{}#", key);
+        let section_count = section_children(store, key).len();
        let section_count = store.nodes.keys()
            .filter(|k| k.starts_with(&prefix))
            .count();
        if section_count > 0 {
            hubs.push((key.clone(), deg, section_count));
        }