diff --git a/poc-memory/src/agents/enrich.rs b/poc-memory/src/agents/enrich.rs index 8146ec2..f6a2911 100644 --- a/poc-memory/src/agents/enrich.rs +++ b/poc-memory/src/agents/enrich.rs @@ -72,53 +72,11 @@ pub fn is_transcript_mined_with_keys(mined: &HashSet, path: &str) -> boo /// Extract user/assistant messages with line numbers from a JSONL transcript. /// (line_number, role, text, timestamp) pub fn extract_conversation(jsonl_path: &str) -> Result, String> { - let content = fs::read_to_string(jsonl_path) - .map_err(|e| format!("read {}: {}", jsonl_path, e))?; - - let mut messages = Vec::new(); - for (i, line) in content.lines().enumerate() { - let obj: serde_json::Value = match serde_json::from_str(line) { - Ok(v) => v, - Err(_) => continue, - }; - - let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or(""); - if msg_type != "user" && msg_type != "assistant" { continue; } - - let timestamp = obj.get("timestamp") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(); - - let msg = obj.get("message").unwrap_or(&obj); - let content = msg.get("content"); - - let text = match content { - Some(serde_json::Value::String(s)) => s.clone(), - Some(serde_json::Value::Array(arr)) => { - arr.iter() - .filter_map(|c| { - // Only extract text blocks; skip tool_use, tool_result, thinking, etc. - let is_text = c.get("type").and_then(|v| v.as_str()) == Some("text"); - if is_text { - c.get("text").and_then(|v| v.as_str()).map(|s| s.to_string()) - } else { - c.as_str().map(|s| s.to_string()) - } - }) - .collect::>() - .join("\n") - } - _ => continue, - }; - - let text = text.trim().to_string(); - if text.is_empty() { continue; } - - messages.push((i + 1, msg_type.to_string(), text, timestamp)); - } - - Ok(messages) + let path = std::path::Path::new(jsonl_path); + let messages = super::transcript::parse_transcript(path)?; + Ok(messages.into_iter() + .map(|m| (m.line, m.role, m.text, m.timestamp)) + .collect()) } pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context"; diff --git a/poc-memory/src/agents/fact_mine.rs b/poc-memory/src/agents/fact_mine.rs index d4eb701..4ec5669 100644 --- a/poc-memory/src/agents/fact_mine.rs +++ b/poc-memory/src/agents/fact_mine.rs @@ -7,11 +7,11 @@ use crate::config; use super::llm; +use super::transcript; use crate::store::{self, Provenance}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; -use std::fs; use std::path::Path; const CHARS_PER_TOKEN: usize = 4; @@ -75,81 +75,27 @@ pub struct Fact { pub source_offset: Option, } -struct Message { - role: String, - text: String, - timestamp: String, -} - /// Extract user/assistant text messages from a JSONL transcript. -fn extract_conversation(path: &Path) -> Vec { - let cfg = config::get(); - let Ok(content) = fs::read_to_string(path) else { return Vec::new() }; - let mut messages = Vec::new(); - - for line in content.lines() { - let Ok(obj) = serde_json::from_str::(line) else { continue }; - - let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or(""); - if msg_type != "user" && msg_type != "assistant" { - continue; - } - - let timestamp = obj.get("timestamp") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(); - - let msg = obj.get("message").unwrap_or(&obj); - let content = msg.get("content"); - - let text = match content { - Some(serde_json::Value::String(s)) => s.clone(), - Some(serde_json::Value::Array(arr)) => { - let texts: Vec<&str> = arr.iter() - .filter_map(|block| { - let obj = block.as_object()?; - if obj.get("type")?.as_str()? != "text" { - return None; - } - let t = obj.get("text")?.as_str()?; - if t.contains("") { - return None; - } - Some(t) - }) - .collect(); - texts.join("\n") - } - _ => continue, - }; - - let text = text.trim().to_string(); - if text.len() < 20 { - continue; - } - - let role = if msg_type == "user" { - cfg.user_name.clone() - } else { - cfg.assistant_name.clone() - }; - messages.push(Message { role, text, timestamp }); - } - - messages +fn extract_messages(path: &Path) -> Vec { + transcript::parse_transcript(path) + .unwrap_or_default() + .into_iter() + .filter(|m| m.text.len() >= 20) + .collect() } /// Format messages into a single text for chunking. -fn format_for_extraction(messages: &[Message]) -> String { +fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String { + let cfg = config::get(); messages.iter() .map(|msg| { + let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name }; let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]"); let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" }; if ts.is_empty() { - format!("[{}] {}", msg.role, text) + format!("[{}] {}", role, text) } else { - format!("[{} {}] {}", msg.role, ts, text) + format!("[{} {}] {}", role, ts, text) } }) .collect::>() @@ -224,7 +170,7 @@ pub fn mine_transcript( log(&format!("Mining: {}", filename)); - let messages = extract_conversation(path); + let messages = extract_messages(path); if messages.is_empty() { log("No messages found"); return Ok(Vec::new()); @@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result let mut all_facts = Vec::new(); for path in paths { - let messages = extract_conversation(path); + let messages = extract_messages(path); if messages.len() < min_messages { eprintln!("Skipping {} ({} messages < {})", path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(), diff --git a/poc-memory/src/agents/knowledge.rs b/poc-memory/src/agents/knowledge.rs index d2f3fa2..38f9bcd 100644 --- a/poc-memory/src/agents/knowledge.rs +++ b/poc-memory/src/agents/knowledge.rs @@ -333,84 +333,41 @@ fn get_graph_topology(store: &Store, graph: &Graph) -> String { } /// Strip blocks from text -fn strip_system_tags(text: &str) -> String { - let re = Regex::new(r"(?s).*?").unwrap(); - re.replace_all(text, "").trim().to_string() -} - /// Extract human-readable dialogue from a conversation JSONL fn extract_conversation_text(path: &Path, max_chars: usize) -> String { - let Ok(content) = fs::read_to_string(path) else { return String::new() }; + let cfg = crate::config::get(); + let messages = super::transcript::parse_transcript(path).unwrap_or_default(); let mut fragments = Vec::new(); let mut total = 0; - for line in content.lines() { - let Ok(obj) = serde_json::from_str::(line) else { continue }; + for msg in &messages { + let min_len = if msg.role == "user" { 5 } else { 10 }; + if msg.text.len() <= min_len { continue; } - let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or(""); - - if msg_type == "user" && obj.get("userType").and_then(|v| v.as_str()) == Some("external") { - if let Some(text) = extract_text_content(&obj) { - let text = strip_system_tags(&text); - if text.starts_with("[Request interrupted") { continue; } - if text.len() > 5 { - fragments.push(format!("**{}:** {}", crate::config::get().user_name, text)); - total += text.len(); - } - } - } else if msg_type == "assistant" { - if let Some(text) = extract_text_content(&obj) { - let text = strip_system_tags(&text); - if text.len() > 10 { - fragments.push(format!("**{}:** {}", crate::config::get().assistant_name, text)); - total += text.len(); - } - } + // Only include external user messages + if msg.role == "user" { + if msg.user_type.as_deref() != Some("external") { continue; } + if msg.text.starts_with("[Request interrupted") { continue; } } + let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name }; + fragments.push(format!("**{}:** {}", role, msg.text)); + total += msg.text.len(); if total > max_chars { break; } } fragments.join("\n\n") } -fn extract_text_content(obj: &serde_json::Value) -> Option { - let msg = obj.get("message")?; - let content = msg.get("content")?; - if let Some(s) = content.as_str() { - return Some(s.to_string()); - } - if let Some(arr) = content.as_array() { - let texts: Vec<&str> = arr.iter() - .filter_map(|b| { - if b.get("type")?.as_str()? == "text" { - b.get("text")?.as_str() - } else { - None - } - }) - .collect(); - if !texts.is_empty() { - return Some(texts.join("\n")); - } - } - None -} - /// Count short user messages (dialogue turns) in a JSONL fn count_dialogue_turns(path: &Path) -> usize { - let Ok(content) = fs::read_to_string(path) else { return 0 }; - content.lines() - .filter_map(|line| serde_json::from_str::(line).ok()) - .filter(|obj| { - obj.get("type").and_then(|v| v.as_str()) == Some("user") - && obj.get("userType").and_then(|v| v.as_str()) == Some("external") - }) - .filter(|obj| { - let text = extract_text_content(obj).unwrap_or_default(); - text.len() > 5 && text.len() < 500 - && !text.starts_with("[Request interrupted") - && !text.starts_with("Implement the following") - }) + let messages = super::transcript::parse_transcript(path).unwrap_or_default(); + messages.iter() + .filter(|m| m.role == "user" + && m.user_type.as_deref() == Some("external") + && m.text.len() > 5 + && m.text.len() < 500 + && !m.text.starts_with("[Request interrupted") + && !m.text.starts_with("Implement the following")) .count() } diff --git a/poc-memory/src/agents/mod.rs b/poc-memory/src/agents/mod.rs index c3c83a1..689346d 100644 --- a/poc-memory/src/agents/mod.rs +++ b/poc-memory/src/agents/mod.rs @@ -13,7 +13,9 @@ // fact_mine — fact extraction from transcripts // digest — episodic digest generation (daily/weekly/monthly) // daemon — background job scheduler +// transcript — shared JSONL transcript parsing +pub mod transcript; pub mod llm; pub mod prompts; pub mod audit; diff --git a/poc-memory/src/agents/transcript.rs b/poc-memory/src/agents/transcript.rs new file mode 100644 index 0000000..109fc54 --- /dev/null +++ b/poc-memory/src/agents/transcript.rs @@ -0,0 +1,94 @@ +// Shared JSONL transcript parsing +// +// Three agents (enrich, fact_mine, knowledge) all parse Claude Code JSONL +// transcripts. This module provides the shared core: parse each line, extract +// message type, text content from string-or-array blocks, timestamp, and +// user type. Callers filter and transform as needed. + +use std::fs; +use std::path::Path; + +/// A single message extracted from a JSONL transcript. +pub struct TranscriptMessage { + /// 1-based line number in the JSONL file. + pub line: usize, + /// Raw role: "user" or "assistant". + pub role: String, + /// Extracted text content (trimmed, blocks joined with newlines). + pub text: String, + /// ISO timestamp from the message, or empty string. + pub timestamp: String, + /// For user messages: "external", "internal", etc. None for assistant. + pub user_type: Option, +} + +/// Parse a JSONL transcript into structured messages. +/// +/// Extracts all user and assistant messages. Content blocks of type "text" +/// are joined; tool_use, tool_result, thinking blocks are skipped. +/// System-reminder blocks are filtered out. +pub fn parse_transcript(path: &Path) -> Result, String> { + let content = fs::read_to_string(path) + .map_err(|e| format!("read {}: {}", path.display(), e))?; + + let mut messages = Vec::new(); + for (i, line) in content.lines().enumerate() { + let Ok(obj) = serde_json::from_str::(line) else { continue }; + + let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or(""); + if msg_type != "user" && msg_type != "assistant" { continue; } + + let timestamp = obj.get("timestamp") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let user_type = obj.get("userType") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + + let Some(text) = extract_text_content(&obj) else { continue }; + let text = text.trim().to_string(); + if text.is_empty() { continue; } + + messages.push(TranscriptMessage { + line: i + 1, + role: msg_type.to_string(), + text, + timestamp, + user_type, + }); + } + + Ok(messages) +} + +/// Extract text content from a JSONL message object. +/// +/// Handles both string content and array-of-blocks content (filtering to +/// type="text" blocks only). Strips `` tags. +fn extract_text_content(obj: &serde_json::Value) -> Option { + let msg = obj.get("message").unwrap_or(obj); + let content = msg.get("content")?; + + let text = match content { + serde_json::Value::String(s) => s.clone(), + serde_json::Value::Array(arr) => { + let texts: Vec<&str> = arr.iter() + .filter_map(|block| { + let block_type = block.get("type").and_then(|v| v.as_str())?; + if block_type != "text" { return None; } + let t = block.get("text").and_then(|v| v.as_str())?; + // Skip system-reminder blocks entirely + if t.contains("") { return None; } + Some(t) + }) + .collect(); + if texts.is_empty() { return None; } + texts.join("\n") + } + _ => return None, + }; + + Some(text) +} diff --git a/poc-memory/src/neuro/rewrite.rs b/poc-memory/src/neuro/rewrite.rs index aa7a2ed..5a7d01a 100644 --- a/poc-memory/src/neuro/rewrite.rs +++ b/poc-memory/src/neuro/rewrite.rs @@ -5,6 +5,28 @@ use crate::store::{Store, new_relation}; use crate::graph::Graph; use crate::similarity; +/// Collect (key, content) pairs for all section children of a file-level node. +fn section_children<'a>(store: &'a Store, file_key: &str) -> Vec<(&'a str, &'a str)> { + let prefix = format!("{}#", file_key); + store.nodes.iter() + .filter(|(k, _)| k.starts_with(&prefix)) + .map(|(k, n)| (k.as_str(), n.content.as_str())) + .collect() +} + +/// Find the best matching candidate by cosine similarity against content. +/// Returns (key, similarity) if any candidate exceeds threshold. +fn best_match(candidates: &[(&str, &str)], content: &str, threshold: f32) -> Option<(String, f32)> { + let (best_key, best_sim) = candidates.iter() + .map(|(key, text)| (*key, similarity::cosine_similarity(content, text))) + .max_by(|a, b| a.1.total_cmp(&b.1))?; + if best_sim > threshold { + Some((best_key.to_string(), best_sim)) + } else { + None + } +} + /// Refine a link target: if the target is a file-level node with section /// children, find the best-matching section by cosine similarity against /// the source content. Returns the original key if no sections exist or @@ -16,31 +38,13 @@ pub fn refine_target(store: &Store, source_content: &str, target_key: &str) -> S // Only refine file-level nodes (no # in key) if target_key.contains('#') { return target_key.to_string(); } - let prefix = format!("{}#", target_key); - let sections: Vec<(&str, &str)> = store.nodes.iter() - .filter(|(k, _)| k.starts_with(&prefix)) - .map(|(k, n)| (k.as_str(), n.content.as_str())) - .collect(); + let sections = section_children(store, target_key); if sections.is_empty() { return target_key.to_string(); } - let mut best_section = ""; - let mut best_sim = 0.0f32; - - for (section_key, section_content) in §ions { - let sim = similarity::cosine_similarity(source_content, section_content); - if sim > best_sim { - best_sim = sim; - best_section = section_key; - } - } - - // Threshold: only refine if there's a meaningful match - if best_sim > 0.05 && !best_section.is_empty() { - best_section.to_string() - } else { - target_key.to_string() - } + best_match(§ions, source_content, 0.05) + .map(|(key, _)| key) + .unwrap_or_else(|| target_key.to_string()) } /// A proposed link move: from hub→neighbor to section→neighbor @@ -70,16 +74,12 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph) // Only works on file-level nodes that have section children if hub_key.contains('#') { return None; } - let prefix = format!("{}#", hub_key); - let sections: Vec<(&str, &str)> = store.nodes.iter() - .filter(|(k, _)| k.starts_with(&prefix)) - .map(|(k, n)| (k.as_str(), n.content.as_str())) - .collect(); - + let sections = section_children(store, hub_key); if sections.is_empty() { return None; } // Get all neighbors of the hub let neighbors = graph.neighbors(hub_key); + let prefix = format!("{}#", hub_key); let mut moves = Vec::new(); @@ -93,19 +93,7 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph) }; // Find best-matching section by content similarity - let mut best_section = ""; - let mut best_sim = 0.0f32; - - for (section_key, section_content) in §ions { - let sim = similarity::cosine_similarity(neighbor_content, section_content); - if sim > best_sim { - best_sim = sim; - best_section = section_key; - } - } - - // Only propose move if there's a reasonable match - if best_sim > 0.05 && !best_section.is_empty() { + if let Some((best_section, best_sim)) = best_match(§ions, neighbor_content, 0.05) { let snippet = crate::util::first_n_chars( neighbor_content.lines() .find(|l| !l.is_empty() && !l.starts_with("