extract shared transcript parser and similarity matching helpers

- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine, and knowledge (was 3 separate implementations, ~150 lines duplicated)
- New best_match() and section_children() helpers in neuro/rewrite.rs (was a duplicated find-best-by-similarity loop + section collection)
- Net -153 lines
2026-03-08 21:42:53 -04:00 · 2026-03-08 21:42:53 -04:00 · 92f3ba5acf
commit 92f3ba5acf
parent 7c491e92eb
6 changed files with 166 additions and 225 deletions
--- a/poc-memory/src/agents/fact_mine.rs
+++ b/poc-memory/src/agents/fact_mine.rs
@@ -7,11 +7,11 @@

 use crate::config;
 use super::llm;
+use super::transcript;
 use crate::store::{self, Provenance};

 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
-use std::fs;
 use std::path::Path;

 const CHARS_PER_TOKEN: usize = 4;
@@ -75,81 +75,27 @@ pub struct Fact {
    pub source_offset: Option<usize>,
 }

-struct Message {
-    role: String,
-    text: String,
-    timestamp: String,
-}
-
 /// Extract user/assistant text messages from a JSONL transcript.
-fn extract_conversation(path: &Path) -> Vec<Message> {
-    let cfg = config::get();
-    let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
-    let mut messages = Vec::new();
-
-    for line in content.lines() {
-        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
-
-        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
-        if msg_type != "user" && msg_type != "assistant" {
-            continue;
-        }
-
-        let timestamp = obj.get("timestamp")
-            .and_then(|v| v.as_str())
-            .unwrap_or("")
-            .to_string();
-
-        let msg = obj.get("message").unwrap_or(&obj);
-        let content = msg.get("content");
-
-        let text = match content {
-            Some(serde_json::Value::String(s)) => s.clone(),
-            Some(serde_json::Value::Array(arr)) => {
-                let texts: Vec<&str> = arr.iter()
-                    .filter_map(|block| {
-                        let obj = block.as_object()?;
-                        if obj.get("type")?.as_str()? != "text" {
-                            return None;
-                        }
-                        let t = obj.get("text")?.as_str()?;
-                        if t.contains("<system-reminder>") {
-                            return None;
-                        }
-                        Some(t)
-                    })
-                    .collect();
-                texts.join("\n")
-            }
-            _ => continue,
-        };
-
-        let text = text.trim().to_string();
-        if text.len() < 20 {
-            continue;
-        }
-
-        let role = if msg_type == "user" {
-            cfg.user_name.clone()
-        } else {
-            cfg.assistant_name.clone()
-        };
-        messages.push(Message { role, text, timestamp });
-    }
-
-    messages
+fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
+    transcript::parse_transcript(path)
+        .unwrap_or_default()
+        .into_iter()
+        .filter(|m| m.text.len() >= 20)
+        .collect()
 }

 /// Format messages into a single text for chunking.
-fn format_for_extraction(messages: &[Message]) -> String {
+fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
+    let cfg = config::get();
    messages.iter()
        .map(|msg| {
+            let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
            let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
            let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
            if ts.is_empty() {
-                format!("[{}] {}", msg.role, text)
+                format!("[{}] {}", role, text)
            } else {
-                format!("[{} {}] {}", msg.role, ts, text)
+                format!("[{} {}] {}", role, ts, text)
            }
        })
        .collect::<Vec<_>>()
@@ -224,7 +170,7 @@ pub fn mine_transcript(

    log(&format!("Mining: {}", filename));

-    let messages = extract_conversation(path);
+    let messages = extract_messages(path);
    if messages.is_empty() {
        log("No messages found");
        return Ok(Vec::new());
@@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
    let mut all_facts = Vec::new();

    for path in paths {
-        let messages = extract_conversation(path);
+        let messages = extract_messages(path);
        if messages.len() < min_messages {
            eprintln!("Skipping {} ({} messages < {})",
                path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),