// fact_mine.rs — extract atomic factual claims from conversation transcripts // // Chunks conversation text into overlapping windows, sends each to Haiku // for extraction, deduplicates by claim text. Output: JSON array of facts. // // Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction. use crate::config; use crate::llm; use crate::store::{self, Provenance}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use std::fs; use std::path::Path; const CHARS_PER_TOKEN: usize = 4; const WINDOW_TOKENS: usize = 2000; const OVERLAP_TOKENS: usize = 200; const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN; const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN; fn extraction_prompt() -> String { let cfg = config::get(); format!( r#"Extract atomic factual claims from this conversation excerpt. Speakers are labeled [{user}] and [{assistant}] in the transcript. Use their proper names in claims — not "the user" or "the assistant." Each claim should be: - A single verifiable statement - Specific enough to be useful in isolation - Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal, bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences, linux/kernel, memory/design, identity/personal) - Tagged with confidence: "stated" (explicitly said), "implied" (logically follows), or "speculative" (hypothesis, not confirmed) - Include which speaker said it ("{user}", "{assistant}", or "Unknown") Do NOT extract: - Opinions or subjective assessments - Conversational filler or greetings - Things that are obviously common knowledge - Restatements of the same fact (pick the clearest version) - System messages, tool outputs, or error logs (extract what was LEARNED from them) - Anything about the conversation itself ("{user} and {assistant} discussed...") - Facts only relevant to this specific conversation (e.g. transient file paths, mid-debug state) Output as a JSON array. Each element: {{ "claim": "the exact factual statement", "domain": "category/subcategory", "confidence": "stated|implied|speculative", "speaker": "{user}|{assistant}|Unknown" }} If the excerpt contains no extractable facts, output an empty array: [] --- CONVERSATION EXCERPT --- "#, user = cfg.user_name, assistant = cfg.assistant_name) } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Fact { pub claim: String, pub domain: String, pub confidence: String, pub speaker: String, #[serde(skip_serializing_if = "Option::is_none")] pub source_file: Option, #[serde(skip_serializing_if = "Option::is_none")] pub source_chunk: Option, #[serde(skip_serializing_if = "Option::is_none")] pub source_offset: Option, } struct Message { role: String, text: String, timestamp: String, } /// Extract user/assistant text messages from a JSONL transcript. fn extract_conversation(path: &Path) -> Vec { let cfg = config::get(); let Ok(content) = fs::read_to_string(path) else { return Vec::new() }; let mut messages = Vec::new(); for line in content.lines() { let Ok(obj) = serde_json::from_str::(line) else { continue }; let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or(""); if msg_type != "user" && msg_type != "assistant" { continue; } let timestamp = obj.get("timestamp") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); let msg = obj.get("message").unwrap_or(&obj); let content = msg.get("content"); let text = match content { Some(serde_json::Value::String(s)) => s.clone(), Some(serde_json::Value::Array(arr)) => { let texts: Vec<&str> = arr.iter() .filter_map(|block| { let obj = block.as_object()?; if obj.get("type")?.as_str()? != "text" { return None; } let t = obj.get("text")?.as_str()?; if t.contains("") { return None; } Some(t) }) .collect(); texts.join("\n") } _ => continue, }; let text = text.trim().to_string(); if text.len() < 20 { continue; } let role = if msg_type == "user" { cfg.user_name.clone() } else { cfg.assistant_name.clone() }; messages.push(Message { role, text, timestamp }); } messages } /// Format messages into a single text for chunking. fn format_for_extraction(messages: &[Message]) -> String { messages.iter() .map(|msg| { let text = if msg.text.len() > 3000 { // Find a char boundary near 2800 let trunc = msg.text.floor_char_boundary(2800); format!("{}\n[...truncated...]", &msg.text[..trunc]) } else { msg.text.clone() }; let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" }; if ts.is_empty() { format!("[{}] {}", msg.role, text) } else { format!("[{} {}] {}", msg.role, ts, text) } }) .collect::>() .join("\n\n") } /// Split text into overlapping windows, breaking at paragraph boundaries. fn chunk_text(text: &str) -> Vec<(usize, &str)> { let mut chunks = Vec::new(); let mut start = 0; while start < text.len() { let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len())); // Try to break at a paragraph boundary if end < text.len() { if let Some(para) = text[start..end].rfind("\n\n") { if para > WINDOW_CHARS / 2 { end = start + para; } } } chunks.push((start, &text[start..end])); let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS)); if next <= start { start = end; } else { start = next; } } chunks } /// Parse JSON facts from model response. fn parse_facts(response: &str) -> Vec { let cleaned = response.trim(); // Strip markdown code block let cleaned = if cleaned.starts_with("```") { cleaned.lines() .filter(|l| !l.starts_with("```")) .collect::>() .join("\n") } else { cleaned.to_string() }; // Find JSON array let start = cleaned.find('['); let end = cleaned.rfind(']'); let (Some(start), Some(end)) = (start, end) else { return Vec::new() }; serde_json::from_str(&cleaned[start..=end]).unwrap_or_default() } /// Mine a single transcript for atomic facts. pub fn mine_transcript(path: &Path, dry_run: bool) -> Result, String> { let filename = path.file_name() .map(|n| n.to_string_lossy().to_string()) .unwrap_or_else(|| "unknown".into()); eprintln!("Mining: {}", filename); let messages = extract_conversation(path); if messages.is_empty() { eprintln!(" No messages found"); return Ok(Vec::new()); } eprintln!(" {} messages extracted", messages.len()); let text = format_for_extraction(&messages); let chunks = chunk_text(&text); eprintln!(" {} chunks ({} chars)", chunks.len(), text.len()); if dry_run { for (i, (offset, chunk)) in chunks.iter().enumerate() { eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len()); let preview = if chunk.len() > 500 { &chunk[..chunk.floor_char_boundary(500)] } else { chunk }; eprintln!("{}", preview); if chunk.len() > 500 { eprintln!(" ... ({} more chars)", chunk.len() - 500); } } return Ok(Vec::new()); } let prompt_prefix = extraction_prompt(); let mut all_facts = Vec::new(); for (i, (_offset, chunk)) in chunks.iter().enumerate() { eprint!(" Chunk {}/{} ({} chars)...", i + 1, chunks.len(), chunk.len()); let prompt = format!("{}{}", prompt_prefix, chunk); let response = match llm::call_haiku("fact-mine", &prompt) { Ok(r) => r, Err(e) => { eprintln!(" error: {}", e); continue; } }; let mut facts = parse_facts(&response); for fact in &mut facts { fact.source_file = Some(filename.clone()); fact.source_chunk = Some(i + 1); fact.source_offset = Some(*_offset); } eprintln!(" {} facts", facts.len()); all_facts.extend(facts); } // Deduplicate by claim text let mut seen = HashSet::new(); let before = all_facts.len(); all_facts.retain(|f| seen.insert(f.claim.to_lowercase())); let dupes = before - all_facts.len(); if dupes > 0 { eprintln!(" {} duplicates removed", dupes); } eprintln!(" Total: {} unique facts", all_facts.len()); Ok(all_facts) } /// Mine a transcript and store facts in the capnp store. /// Returns the number of facts stored. pub fn mine_and_store(path: &Path) -> Result { let facts = mine_transcript(path, false)?; if facts.is_empty() { return Ok(0); } let filename = path.file_name() .map(|n| n.to_string_lossy().to_string()) .unwrap_or_else(|| "unknown".into()); // Store as a single node keyed by transcript filename let key = format!("_facts-{}", filename.trim_end_matches(".jsonl")); let json = serde_json::to_string_pretty(&facts) .map_err(|e| format!("serialize facts: {}", e))?; let mut store = store::Store::load()?; store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?; store.save()?; eprintln!(" Stored {} facts as {}", facts.len(), key); Ok(facts.len()) } /// Mine transcripts, returning all facts. Skips files with fewer than min_messages. pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result, String> { let mut all_facts = Vec::new(); for path in paths { let messages = extract_conversation(path); if messages.len() < min_messages { eprintln!("Skipping {} ({} messages < {})", path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(), messages.len(), min_messages); continue; } let facts = mine_transcript(path, dry_run)?; all_facts.extend(facts); } Ok(all_facts) }