extract shared transcript parser and similarity matching helpers
- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine, and knowledge (previously 3 separate implementations, ~150 duplicated lines)
- New best_match() and section_children() helpers in neuro/rewrite.rs (previously a duplicated find-best-by-similarity loop plus section collection)
- Net -153 lines
This commit is contained in:
parent
7c491e92eb
commit
92f3ba5acf
6 changed files with 166 additions and 225 deletions
|
|
@ -7,11 +7,11 @@
|
|||
|
||||
use crate::config;
|
||||
use super::llm;
|
||||
use super::transcript;
|
||||
use crate::store::{self, Provenance};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
const CHARS_PER_TOKEN: usize = 4;
|
||||
|
|
@ -75,81 +75,27 @@ pub struct Fact {
|
|||
pub source_offset: Option<usize>,
|
||||
}
|
||||
|
||||
/// One user or assistant turn pulled out of a transcript.
#[derive(Debug, Clone)]
struct Message {
    /// Display name of the speaker (filled from the configured user/assistant name).
    role: String,
    /// Trimmed message text.
    text: String,
    /// Raw `timestamp` string from the transcript entry; empty when absent.
    timestamp: String,
}
|
||||
|
||||
/// Extract user/assistant text messages from a JSONL transcript.
|
||||
fn extract_conversation(path: &Path) -> Vec<Message> {
|
||||
let cfg = config::get();
|
||||
let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
|
||||
let mut messages = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
||||
|
||||
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
||||
if msg_type != "user" && msg_type != "assistant" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let timestamp = obj.get("timestamp")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
let msg = obj.get("message").unwrap_or(&obj);
|
||||
let content = msg.get("content");
|
||||
|
||||
let text = match content {
|
||||
Some(serde_json::Value::String(s)) => s.clone(),
|
||||
Some(serde_json::Value::Array(arr)) => {
|
||||
let texts: Vec<&str> = arr.iter()
|
||||
.filter_map(|block| {
|
||||
let obj = block.as_object()?;
|
||||
if obj.get("type")?.as_str()? != "text" {
|
||||
return None;
|
||||
}
|
||||
let t = obj.get("text")?.as_str()?;
|
||||
if t.contains("<system-reminder>") {
|
||||
return None;
|
||||
}
|
||||
Some(t)
|
||||
})
|
||||
.collect();
|
||||
texts.join("\n")
|
||||
}
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let text = text.trim().to_string();
|
||||
if text.len() < 20 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let role = if msg_type == "user" {
|
||||
cfg.user_name.clone()
|
||||
} else {
|
||||
cfg.assistant_name.clone()
|
||||
};
|
||||
messages.push(Message { role, text, timestamp });
|
||||
}
|
||||
|
||||
messages
|
||||
/// Load the transcript at `path` through the shared parser and keep only the
/// messages whose `text` is at least 20 bytes long.
///
/// If `parse_transcript` does not produce a value, `unwrap_or_default()`
/// substitutes the default for its payload type, so callers always get a
/// vector back rather than an error.
fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
    let parsed = transcript::parse_transcript(path).unwrap_or_default();

    let mut kept = Vec::new();
    for entry in parsed {
        // Drop short messages.
        if entry.text.len() >= 20 {
            kept.push(entry);
        }
    }
    kept
}
|
||||
|
||||
/// Format messages into a single text for chunking.
|
||||
fn format_for_extraction(messages: &[Message]) -> String {
|
||||
fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
|
||||
let cfg = config::get();
|
||||
messages.iter()
|
||||
.map(|msg| {
|
||||
let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
|
||||
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
|
||||
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
|
||||
if ts.is_empty() {
|
||||
format!("[{}] {}", msg.role, text)
|
||||
format!("[{}] {}", role, text)
|
||||
} else {
|
||||
format!("[{} {}] {}", msg.role, ts, text)
|
||||
format!("[{} {}] {}", role, ts, text)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
|
|
@ -224,7 +170,7 @@ pub fn mine_transcript(
|
|||
|
||||
log(&format!("Mining: {}", filename));
|
||||
|
||||
let messages = extract_conversation(path);
|
||||
let messages = extract_messages(path);
|
||||
if messages.is_empty() {
|
||||
log("No messages found");
|
||||
return Ok(Vec::new());
|
||||
|
|
@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
|
|||
let mut all_facts = Vec::new();
|
||||
|
||||
for path in paths {
|
||||
let messages = extract_conversation(path);
|
||||
let messages = extract_messages(path);
|
||||
if messages.len() < min_messages {
|
||||
eprintln!("Skipping {} ({} messages < {})",
|
||||
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue