extract shared transcript parser and similarity matching helpers

- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine,
  and knowledge (was 3 separate implementations, ~150 lines duplicated)
- New best_match() and section_children() helpers in neuro/rewrite.rs
  (was duplicated find-best-by-similarity loop + section collection)
- Net -153 lines
This commit is contained in:
ProofOfConcept 2026-03-08 21:42:53 -04:00
parent 7c491e92eb
commit 92f3ba5acf
6 changed files with 166 additions and 225 deletions

View file

@ -7,11 +7,11 @@
use crate::config;
use super::llm;
use super::transcript;
use crate::store::{self, Provenance};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fs;
use std::path::Path;
const CHARS_PER_TOKEN: usize = 4;
@ -75,81 +75,27 @@ pub struct Fact {
pub source_offset: Option<usize>,
}
/// One conversational turn pulled out of a JSONL transcript by
/// `extract_conversation`.
struct Message {
// Display name of the speaker: `extract_conversation` stores the configured
// user name or assistant name here, not the raw "user"/"assistant" tag.
role: String,
// Message body after trimming surrounding whitespace.
text: String,
// Value of the record's "timestamp" field, or an empty string when the
// field is missing or not a string.
timestamp: String,
}
/// Extract user/assistant text messages from a JSONL transcript.
///
/// Reads `path` as text and treats every line as an independent JSON record.
/// Returns an empty vector when the file cannot be read. Records that fail to
/// parse, have a "type" other than "user"/"assistant", carry no usable
/// content, or whose trimmed text is shorter than 20 bytes are skipped.
fn extract_conversation(path: &Path) -> Vec<Message> {
let cfg = config::get();
// Best-effort: an unreadable file yields no messages rather than an error.
let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
let mut messages = Vec::new();
for line in content.lines() {
// Lines that are not valid JSON are ignored.
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
if msg_type != "user" && msg_type != "assistant" {
continue;
}
// A missing or non-string timestamp becomes the empty string.
let timestamp = obj.get("timestamp")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
// Content is looked up under the nested "message" value when present,
// otherwise on the record itself.
let msg = obj.get("message").unwrap_or(&obj);
let content = msg.get("content");
let text = match content {
// Plain string content is taken as-is.
Some(serde_json::Value::String(s)) => s.clone(),
// Array content: keep only blocks whose "type" is "text", drop any block
// whose text contains "<system-reminder>", and join the rest with newlines.
Some(serde_json::Value::Array(arr)) => {
let texts: Vec<&str> = arr.iter()
.filter_map(|block| {
let obj = block.as_object()?;
if obj.get("type")?.as_str()? != "text" {
return None;
}
let t = obj.get("text")?.as_str()?;
if t.contains("<system-reminder>") {
return None;
}
Some(t)
})
.collect();
texts.join("\n")
}
// Absent content or any other JSON shape: skip the record.
_ => continue,
};
let text = text.trim().to_string();
// `len()` counts bytes, so the 20 threshold is in bytes, not characters.
if text.len() < 20 {
continue;
}
// Store the configured display name instead of the raw record type.
let role = if msg_type == "user" {
cfg.user_name.clone()
} else {
cfg.assistant_name.clone()
};
messages.push(Message { role, text, timestamp });
}
messages
/// Load the transcript at `path` through the shared parser and keep only the
/// messages whose text is at least 20 bytes long.
///
/// A failed (or absent) parse result is replaced by its default value, so the
/// caller simply sees no messages in that case.
fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
    let parsed = transcript::parse_transcript(path).unwrap_or_default();

    let mut kept = Vec::new();
    for message in parsed {
        // `len()` is a byte count; anything below the threshold is dropped.
        if message.text.len() >= 20 {
            kept.push(message);
        }
    }
    kept
}
/// Format messages into a single text for chunking.
fn format_for_extraction(messages: &[Message]) -> String {
fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
let cfg = config::get();
messages.iter()
.map(|msg| {
let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
if ts.is_empty() {
format!("[{}] {}", msg.role, text)
format!("[{}] {}", role, text)
} else {
format!("[{} {}] {}", msg.role, ts, text)
format!("[{} {}] {}", role, ts, text)
}
})
.collect::<Vec<_>>()
@ -224,7 +170,7 @@ pub fn mine_transcript(
log(&format!("Mining: {}", filename));
let messages = extract_conversation(path);
let messages = extract_messages(path);
if messages.is_empty() {
log("No messages found");
return Ok(Vec::new());
@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
let mut all_facts = Vec::new();
for path in paths {
let messages = extract_conversation(path);
let messages = extract_messages(path);
if messages.len() < min_messages {
eprintln!("Skipping {} ({} messages < {})",
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),