extract shared transcript parser and similarity matching helpers

- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine, and knowledge (was 3 separate implementations, ~150 lines duplicated) - New best_match() and section_children() helpers in neuro/rewrite.rs (was duplicated find-best-by-similarity loop + section collection) - Net -153 lines
2026-03-08 21:42:53 -04:00 · 2026-03-08 21:42:53 -04:00 · 92f3ba5acf
commit 92f3ba5acf
parent 7c491e92eb
6 changed files with 166 additions and 225 deletions
--- a/poc-memory/src/agents/enrich.rs
+++ b/poc-memory/src/agents/enrich.rs
@ -72,53 +72,11 @@ pub fn is_transcript_mined_with_keys(mined: &HashSet<String>, path: &str) -> boo
 /// Extract user/assistant messages with line numbers from a JSONL transcript.
 /// (line_number, role, text, timestamp)
 pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
-    let content = fs::read_to_string(jsonl_path)
-        .map_err(|e| format!("read {}: {}", jsonl_path, e))?;
-
-    let mut messages = Vec::new();
-    for (i, line) in content.lines().enumerate() {
-        let obj: serde_json::Value = match serde_json::from_str(line) {
-            Ok(v) => v,
-            Err(_) => continue,
-        };
-
-        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
-        if msg_type != "user" && msg_type != "assistant" { continue; }
-
-        let timestamp = obj.get("timestamp")
-            .and_then(|v| v.as_str())
-            .unwrap_or("")
-            .to_string();
-
-        let msg = obj.get("message").unwrap_or(&obj);
-        let content = msg.get("content");
-
-        let text = match content {
-            Some(serde_json::Value::String(s)) => s.clone(),
-            Some(serde_json::Value::Array(arr)) => {
-                arr.iter()
-                    .filter_map(|c| {
-                        // Only extract text blocks; skip tool_use, tool_result, thinking, etc.
-                        let is_text = c.get("type").and_then(|v| v.as_str()) == Some("text");
-                        if is_text {
-                            c.get("text").and_then(|v| v.as_str()).map(|s| s.to_string())
-                        } else {
-                            c.as_str().map(|s| s.to_string())
-                        }
-                    })
-                    .collect::<Vec<_>>()
-                    .join("\n")
-            }
-            _ => continue,
-        };
-
-        let text = text.trim().to_string();
-        if text.is_empty() { continue; }
-
-        messages.push((i + 1, msg_type.to_string(), text, timestamp));
-    }
-
-    Ok(messages)
+    let path = std::path::Path::new(jsonl_path);
+    let messages = super::transcript::parse_transcript(path)?;
+    Ok(messages.into_iter()
+        .map(|m| (m.line, m.role, m.text, m.timestamp))
+        .collect())
 }

 pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
--- a/poc-memory/src/agents/fact_mine.rs
+++ b/poc-memory/src/agents/fact_mine.rs
@ -7,11 +7,11 @@

 use crate::config;
 use super::llm;
+use super::transcript;
 use crate::store::{self, Provenance};

 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
-use std::fs;
 use std::path::Path;

 const CHARS_PER_TOKEN: usize = 4;
@ -75,81 +75,27 @@ pub struct Fact {
    pub source_offset: Option<usize>,
 }

-struct Message {
-    role: String,
-    text: String,
-    timestamp: String,
-}
-
 /// Extract user/assistant text messages from a JSONL transcript.
-fn extract_conversation(path: &Path) -> Vec<Message> {
-    let cfg = config::get();
-    let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
-    let mut messages = Vec::new();
-
-    for line in content.lines() {
-        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
-
-        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
-        if msg_type != "user" && msg_type != "assistant" {
-            continue;
-        }
-
-        let timestamp = obj.get("timestamp")
-            .and_then(|v| v.as_str())
-            .unwrap_or("")
-            .to_string();
-
-        let msg = obj.get("message").unwrap_or(&obj);
-        let content = msg.get("content");
-
-        let text = match content {
-            Some(serde_json::Value::String(s)) => s.clone(),
-            Some(serde_json::Value::Array(arr)) => {
-                let texts: Vec<&str> = arr.iter()
-                    .filter_map(|block| {
-                        let obj = block.as_object()?;
-                        if obj.get("type")?.as_str()? != "text" {
-                            return None;
-                        }
-                        let t = obj.get("text")?.as_str()?;
-                        if t.contains("<system-reminder>") {
-                            return None;
-                        }
-                        Some(t)
-                    })
-                    .collect();
-                texts.join("\n")
-            }
-            _ => continue,
-        };
-
-        let text = text.trim().to_string();
-        if text.len() < 20 {
-            continue;
-        }
-
-        let role = if msg_type == "user" {
-            cfg.user_name.clone()
-        } else {
-            cfg.assistant_name.clone()
-        };
-        messages.push(Message { role, text, timestamp });
-    }
-
-    messages
+fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
+    transcript::parse_transcript(path)
+        .unwrap_or_default()
+        .into_iter()
+        .filter(|m| m.text.len() >= 20)
+        .collect()
 }

 /// Format messages into a single text for chunking.
-fn format_for_extraction(messages: &[Message]) -> String {
+fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
+    let cfg = config::get();
    messages.iter()
        .map(|msg| {
+            let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
            let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
            let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
            if ts.is_empty() {
-                format!("[{}] {}", msg.role, text)
+                format!("[{}] {}", role, text)
            } else {
-                format!("[{} {}] {}", msg.role, ts, text)
+                format!("[{} {}] {}", role, ts, text)
            }
        })
        .collect::<Vec<_>>()
@ -224,7 +170,7 @@ pub fn mine_transcript(

    log(&format!("Mining: {}", filename));

-    let messages = extract_conversation(path);
+    let messages = extract_messages(path);
    if messages.is_empty() {
        log("No messages found");
        return Ok(Vec::new());
@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
    let mut all_facts = Vec::new();

    for path in paths {
-        let messages = extract_conversation(path);
+        let messages = extract_messages(path);
        if messages.len() < min_messages {
            eprintln!("Skipping {} ({} messages < {})",
                path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
--- a/poc-memory/src/agents/knowledge.rs
+++ b/poc-memory/src/agents/knowledge.rs
@ -333,84 +333,41 @@ fn get_graph_topology(store: &Store, graph: &Graph) -> String {
 }

 /// Strip <system-reminder> blocks from text
-fn strip_system_tags(text: &str) -> String {
-    let re = Regex::new(r"(?s)<system-reminder>.*?</system-reminder>").unwrap();
-    re.replace_all(text, "").trim().to_string()
-}
-
 /// Extract human-readable dialogue from a conversation JSONL
 fn extract_conversation_text(path: &Path, max_chars: usize) -> String {
-    let Ok(content) = fs::read_to_string(path) else { return String::new() };
+    let cfg = crate::config::get();
+    let messages = super::transcript::parse_transcript(path).unwrap_or_default();
    let mut fragments = Vec::new();
    let mut total = 0;

-    for line in content.lines() {
-        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
+    for msg in &messages {
+        let min_len = if msg.role == "user" { 5 } else { 10 };
+        if msg.text.len() <= min_len { continue; }

-        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
-
-        if msg_type == "user" && obj.get("userType").and_then(|v| v.as_str()) == Some("external") {
-            if let Some(text) = extract_text_content(&obj) {
-                let text = strip_system_tags(&text);
-                if text.starts_with("[Request interrupted") { continue; }
-                if text.len() > 5 {
-                    fragments.push(format!("**{}:** {}", crate::config::get().user_name, text));
-                    total += text.len();
-                }
-            }
-        } else if msg_type == "assistant" {
-            if let Some(text) = extract_text_content(&obj) {
-                let text = strip_system_tags(&text);
-                if text.len() > 10 {
-                    fragments.push(format!("**{}:** {}", crate::config::get().assistant_name, text));
-                    total += text.len();
-                }
-            }
+        // Only include external user messages
+        if msg.role == "user" {
+            if msg.user_type.as_deref() != Some("external") { continue; }
+            if msg.text.starts_with("[Request interrupted") { continue; }
        }

+        let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
+        fragments.push(format!("**{}:** {}", role, msg.text));
+        total += msg.text.len();
        if total > max_chars { break; }
    }
    fragments.join("\n\n")
 }

-fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
-    let msg = obj.get("message")?;
-    let content = msg.get("content")?;
-    if let Some(s) = content.as_str() {
-        return Some(s.to_string());
-    }
-    if let Some(arr) = content.as_array() {
-        let texts: Vec<&str> = arr.iter()
-            .filter_map(|b| {
-                if b.get("type")?.as_str()? == "text" {
-                    b.get("text")?.as_str()
-                } else {
-                    None
-                }
-            })
-            .collect();
-        if !texts.is_empty() {
-            return Some(texts.join("\n"));
-        }
-    }
-    None
-}
-
 /// Count short user messages (dialogue turns) in a JSONL
 fn count_dialogue_turns(path: &Path) -> usize {
-    let Ok(content) = fs::read_to_string(path) else { return 0 };
-    content.lines()
-        .filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
-        .filter(|obj| {
-            obj.get("type").and_then(|v| v.as_str()) == Some("user")
-                && obj.get("userType").and_then(|v| v.as_str()) == Some("external")
-        })
-        .filter(|obj| {
-            let text = extract_text_content(obj).unwrap_or_default();
-            text.len() > 5 && text.len() < 500
-                && !text.starts_with("[Request interrupted")
-                && !text.starts_with("Implement the following")
-        })
+    let messages = super::transcript::parse_transcript(path).unwrap_or_default();
+    messages.iter()
+        .filter(|m| m.role == "user"
+            && m.user_type.as_deref() == Some("external")
+            && m.text.len() > 5
+            && m.text.len() < 500
+            && !m.text.starts_with("[Request interrupted")
+            && !m.text.starts_with("Implement the following"))
        .count()
 }

--- a/poc-memory/src/agents/mod.rs
+++ b/poc-memory/src/agents/mod.rs
@ -13,7 +13,9 @@
 //   fact_mine    — fact extraction from transcripts
 //   digest       — episodic digest generation (daily/weekly/monthly)
 //   daemon       — background job scheduler
+//   transcript   — shared JSONL transcript parsing

+pub mod transcript;
 pub mod llm;
 pub mod prompts;
 pub mod audit;
--- a/poc-memory/src/agents/transcript.rs
+++ b/poc-memory/src/agents/transcript.rs
@ -0,0 +1,94 @@
+// Shared JSONL transcript parsing
+//
+// Three agents (enrich, fact_mine, knowledge) all parse Claude Code JSONL
+// transcripts. This module provides the shared core: parse each line, extract
+// message type, text content from string-or-array blocks, timestamp, and
+// user type. Callers filter and transform as needed.
+
+use std::fs;
+use std::path::Path;
+
+/// A single message extracted from a JSONL transcript.
+pub struct TranscriptMessage {
+    /// 1-based line number in the JSONL file.
+    pub line: usize,
+    /// Raw role: "user" or "assistant".
+    pub role: String,
+    /// Extracted text content (trimmed, blocks joined with newlines).
+    pub text: String,
+    /// ISO timestamp from the message, or empty string.
+    pub timestamp: String,
+    /// For user messages: "external", "internal", etc. None for assistant.
+    pub user_type: Option<String>,
+}
+
+/// Parse a JSONL transcript into structured messages.
+///
+/// Extracts all user and assistant messages. Content blocks of type "text"
+/// are joined; tool_use, tool_result, thinking blocks are skipped.
+/// System-reminder blocks are filtered out.
+pub fn parse_transcript(path: &Path) -> Result<Vec<TranscriptMessage>, String> {
+    let content = fs::read_to_string(path)
+        .map_err(|e| format!("read {}: {}", path.display(), e))?;
+
+    let mut messages = Vec::new();
+    for (i, line) in content.lines().enumerate() {
+        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
+
+        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
+        if msg_type != "user" && msg_type != "assistant" { continue; }
+
+        let timestamp = obj.get("timestamp")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+
+        let user_type = obj.get("userType")
+            .and_then(|v| v.as_str())
+            .map(|s| s.to_string());
+
+        let Some(text) = extract_text_content(&obj) else { continue };
+        let text = text.trim().to_string();
+        if text.is_empty() { continue; }
+
+        messages.push(TranscriptMessage {
+            line: i + 1,
+            role: msg_type.to_string(),
+            text,
+            timestamp,
+            user_type,
+        });
+    }
+
+    Ok(messages)
+}
+
+/// Extract text content from a JSONL message object.
+///
+/// Handles both string content and array-of-blocks content (filtering to
+/// type="text" blocks only). Strips `<system-reminder>` tags.
+fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
+    let msg = obj.get("message").unwrap_or(obj);
+    let content = msg.get("content")?;
+
+    let text = match content {
+        serde_json::Value::String(s) => s.clone(),
+        serde_json::Value::Array(arr) => {
+            let texts: Vec<&str> = arr.iter()
+                .filter_map(|block| {
+                    let block_type = block.get("type").and_then(|v| v.as_str())?;
+                    if block_type != "text" { return None; }
+                    let t = block.get("text").and_then(|v| v.as_str())?;
+                    // Skip system-reminder blocks entirely
+                    if t.contains("<system-reminder>") { return None; }
+                    Some(t)
+                })
+                .collect();
+            if texts.is_empty() { return None; }
+            texts.join("\n")
+        }
+        _ => return None,
+    };
+
+    Some(text)
+}