2026-03-05 15:31:08 -05:00
|
|
|
// fact_mine.rs — extract atomic factual claims from conversation transcripts
|
|
|
|
|
//
|
|
|
|
|
// Chunks conversation text into overlapping windows, sends each to Haiku
|
|
|
|
|
// for extraction, deduplicates by claim text. Output: JSON array of facts.
|
|
|
|
|
//
|
|
|
|
|
// Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction.
|
|
|
|
|
|
2026-03-05 15:41:35 -05:00
|
|
|
use crate::config;
|
2026-03-05 15:31:08 -05:00
|
|
|
use crate::llm;
|
|
|
|
|
use crate::store::{self, Provenance};
|
|
|
|
|
|
|
|
|
|
use serde::{Deserialize, Serialize};
|
|
|
|
|
use std::collections::HashSet;
|
|
|
|
|
use std::fs;
|
|
|
|
|
use std::path::Path;
|
|
|
|
|
|
|
|
|
|
// Rough heuristic for English text: ~4 characters per token.
const CHARS_PER_TOKEN: usize = 4;
// Target size of each extraction window sent to the model.
const WINDOW_TOKENS: usize = 2000;
// Overlap between consecutive windows so a fact that straddles a window
// edge appears whole in at least one chunk.
const OVERLAP_TOKENS: usize = 200;
const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN;
const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN;
|
|
|
|
|
|
2026-03-05 15:41:35 -05:00
|
|
|
/// Build the extraction prompt prefix sent to the model.
///
/// Embeds the configured user/assistant display names so the model refers
/// to speakers by proper name. The caller appends the conversation excerpt
/// after the trailing "--- CONVERSATION EXCERPT ---" marker.
fn extraction_prompt() -> String {
    let cfg = config::get();
    // NOTE: this is a runtime prompt string — its exact wording is the
    // extraction contract with the model; edit with care.
    format!(
        r#"Extract atomic factual claims from this conversation excerpt.

Speakers are labeled [{user}] and [{assistant}] in the transcript.
Use their proper names in claims — not "the user" or "the assistant."

Each claim should be:
- A single verifiable statement
- Specific enough to be useful in isolation
- Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal,
bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences,
linux/kernel, memory/design, identity/personal)
- Tagged with confidence: "stated" (explicitly said), "implied" (logically follows),
or "speculative" (hypothesis, not confirmed)
- Include which speaker said it ("{user}", "{assistant}", or "Unknown")

Do NOT extract:
- Opinions or subjective assessments
- Conversational filler or greetings
- Things that are obviously common knowledge
- Restatements of the same fact (pick the clearest version)
- System messages, tool outputs, or error logs (extract what was LEARNED from them)
- Anything about the conversation itself ("{user} and {assistant} discussed...")
- Facts only relevant to this specific conversation (e.g. transient file paths, mid-debug state)

Output as a JSON array. Each element:
{{
"claim": "the exact factual statement",
"domain": "category/subcategory",
"confidence": "stated|implied|speculative",
"speaker": "{user}|{assistant}|Unknown"
}}

If the excerpt contains no extractable facts, output an empty array: []

--- CONVERSATION EXCERPT ---
"#, user = cfg.user_name, assistant = cfg.assistant_name)
}
|
2026-03-05 15:31:08 -05:00
|
|
|
|
|
|
|
|
/// One atomic factual claim mined from a transcript, plus provenance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Fact {
    /// The factual statement itself.
    pub claim: String,
    /// Category tag, e.g. "bcachefs/btree" or "workflow/preferences".
    pub domain: String,
    /// One of "stated", "implied", or "speculative" (per the prompt contract).
    pub confidence: String,
    /// Which speaker asserted the claim, or "Unknown".
    pub speaker: String,
    /// Transcript filename the fact came from (filled in by the miner,
    /// not by the model — hence optional and omitted when absent).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_file: Option<String>,
    /// 1-based chunk index within the transcript.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_chunk: Option<usize>,
    /// Byte offset of the chunk within the formatted conversation text.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_offset: Option<usize>,
}
|
|
|
|
|
|
|
|
|
|
/// A single user/assistant turn pulled from one JSONL transcript line.
struct Message {
    // Speaker display name (configured user or assistant name).
    role: String,
    // Message text with non-text blocks and system reminders filtered out.
    text: String,
    // Raw timestamp string from the transcript; "" when absent.
    timestamp: String,
}
|
|
|
|
|
|
|
|
|
|
/// Extract user/assistant text messages from a JSONL transcript.
|
|
|
|
|
fn extract_conversation(path: &Path) -> Vec<Message> {
|
2026-03-05 15:41:35 -05:00
|
|
|
let cfg = config::get();
|
2026-03-05 15:31:08 -05:00
|
|
|
let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
|
|
|
|
|
let mut messages = Vec::new();
|
|
|
|
|
|
|
|
|
|
for line in content.lines() {
|
|
|
|
|
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
|
|
|
|
|
|
|
|
|
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
|
|
|
|
if msg_type != "user" && msg_type != "assistant" {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let timestamp = obj.get("timestamp")
|
|
|
|
|
.and_then(|v| v.as_str())
|
|
|
|
|
.unwrap_or("")
|
|
|
|
|
.to_string();
|
|
|
|
|
|
|
|
|
|
let msg = obj.get("message").unwrap_or(&obj);
|
|
|
|
|
let content = msg.get("content");
|
|
|
|
|
|
|
|
|
|
let text = match content {
|
|
|
|
|
Some(serde_json::Value::String(s)) => s.clone(),
|
|
|
|
|
Some(serde_json::Value::Array(arr)) => {
|
|
|
|
|
let texts: Vec<&str> = arr.iter()
|
|
|
|
|
.filter_map(|block| {
|
|
|
|
|
let obj = block.as_object()?;
|
|
|
|
|
if obj.get("type")?.as_str()? != "text" {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
let t = obj.get("text")?.as_str()?;
|
|
|
|
|
if t.contains("<system-reminder>") {
|
|
|
|
|
return None;
|
|
|
|
|
}
|
|
|
|
|
Some(t)
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
texts.join("\n")
|
|
|
|
|
}
|
|
|
|
|
_ => continue,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let text = text.trim().to_string();
|
|
|
|
|
if text.len() < 20 {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-05 15:41:35 -05:00
|
|
|
let role = if msg_type == "user" {
|
|
|
|
|
cfg.user_name.clone()
|
|
|
|
|
} else {
|
|
|
|
|
cfg.assistant_name.clone()
|
|
|
|
|
};
|
2026-03-05 15:31:08 -05:00
|
|
|
messages.push(Message { role, text, timestamp });
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
messages
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Format messages into a single text for chunking.
|
|
|
|
|
fn format_for_extraction(messages: &[Message]) -> String {
|
|
|
|
|
messages.iter()
|
|
|
|
|
.map(|msg| {
|
2026-03-08 21:13:02 -04:00
|
|
|
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
|
2026-03-05 15:31:08 -05:00
|
|
|
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
|
|
|
|
|
if ts.is_empty() {
|
|
|
|
|
format!("[{}] {}", msg.role, text)
|
|
|
|
|
} else {
|
|
|
|
|
format!("[{} {}] {}", msg.role, ts, text)
|
|
|
|
|
}
|
|
|
|
|
})
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join("\n\n")
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Split text into overlapping windows, breaking at paragraph boundaries.
///
/// Returns `(byte_offset, slice)` pairs. Each window is at most
/// WINDOW_CHARS bytes; consecutive windows overlap by roughly
/// OVERLAP_CHARS so a fact straddling a cut appears whole in at least
/// one chunk. Uses `str::floor_char_boundary` so slice edges never
/// split a UTF-8 code point.
fn chunk_text(text: &str) -> Vec<(usize, &str)> {
    let mut chunks = Vec::new();
    let mut start = 0;

    while start < text.len() {
        // Tentative end: one full window (or end of text), snapped down
        // to a char boundary.
        let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len()));

        // Try to break at a paragraph boundary
        if end < text.len() {
            if let Some(para) = text[start..end].rfind("\n\n") {
                // Accept the paragraph break only if it keeps the window
                // at least half full; otherwise keep the hard cut.
                // (`para` is relative to `start`, and "\n\n" starts on a
                // char boundary, so `start + para` is a valid boundary.)
                if para > WINDOW_CHARS / 2 {
                    end = start + para;
                }
            }
        }

        chunks.push((start, &text[start..end]));

        // Next window starts OVERLAP_CHARS before this one ended.
        let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS));
        if next <= start {
            // Guard against stalling when the window advanced less than
            // the overlap — jump to `end` instead of looping forever.
            start = end;
        } else {
            start = next;
        }
    }

    chunks
}
|
|
|
|
|
|
|
|
|
|
/// Parse JSON facts from model response.
|
|
|
|
|
fn parse_facts(response: &str) -> Vec<Fact> {
|
|
|
|
|
let cleaned = response.trim();
|
|
|
|
|
// Strip markdown code block
|
|
|
|
|
let cleaned = if cleaned.starts_with("```") {
|
|
|
|
|
cleaned.lines()
|
|
|
|
|
.filter(|l| !l.starts_with("```"))
|
|
|
|
|
.collect::<Vec<_>>()
|
|
|
|
|
.join("\n")
|
|
|
|
|
} else {
|
|
|
|
|
cleaned.to_string()
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Find JSON array
|
|
|
|
|
let start = cleaned.find('[');
|
|
|
|
|
let end = cleaned.rfind(']');
|
|
|
|
|
let (Some(start), Some(end)) = (start, end) else { return Vec::new() };
|
|
|
|
|
|
|
|
|
|
serde_json::from_str(&cleaned[start..=end]).unwrap_or_default()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Mine a single transcript for atomic facts.
/// The optional `progress` callback receives status strings (e.g. "chunk 3/47").
///
/// Pipeline: parse JSONL → format into one text → chunk into overlapping
/// windows → one Haiku call per chunk → parse + tag facts → dedupe by
/// lowercased claim text. With `dry_run`, prints chunk previews and makes
/// no model calls. Per-chunk model errors are logged and skipped, not
/// propagated — Err is reserved for failures before/after the chunk loop.
pub fn mine_transcript(
    path: &Path,
    dry_run: bool,
    progress: Option<&dyn Fn(&str)>,
) -> Result<Vec<Fact>, String> {
    let filename = path.file_name()
        .map(|n| n.to_string_lossy().to_string())
        .unwrap_or_else(|| "unknown".into());
    // Log to stderr and mirror to the progress callback (daemon display).
    let log = |msg: &str| {
        eprintln!("{}", msg);
        if let Some(cb) = progress { cb(msg); }
    };

    log(&format!("Mining: {}", filename));

    let messages = extract_conversation(path);
    if messages.is_empty() {
        log("No messages found");
        return Ok(Vec::new());
    }
    log(&format!("{} messages extracted", messages.len()));

    let text = format_for_extraction(&messages);
    let chunks = chunk_text(&text);
    log(&format!("{} chunks ({} chars)", chunks.len(), text.len()));

    // Dry run: show what would be sent, then bail before any model calls.
    if dry_run {
        for (i, (offset, chunk)) in chunks.iter().enumerate() {
            eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len());
            eprintln!("{}", crate::util::truncate(chunk, 500, ""));
            if chunk.len() > 500 {
                eprintln!(" ... ({} more chars)", chunk.len() - 500);
            }
        }
        return Ok(Vec::new());
    }

    let prompt_prefix = extraction_prompt();
    let mut all_facts = Vec::new();
    for (i, (_offset, chunk)) in chunks.iter().enumerate() {
        let status = format!("chunk {}/{} ({} chars)", i + 1, chunks.len(), chunk.len());
        eprint!(" {}...", status);
        if let Some(cb) = progress { cb(&status); }

        let prompt = format!("{}{}\n\n--- END OF EXCERPT ---\n\nReturn ONLY a JSON array of factual claims, or [] if none.", prompt_prefix, chunk);
        // Haiku (not Sonnet) per the file header: cheap, high-volume extraction.
        let response = match llm::call_haiku("fact-mine", &prompt) {
            Ok(r) => r,
            Err(e) => {
                // Best-effort: a failed chunk loses its facts but the rest proceed.
                eprintln!(" error: {}", e);
                continue;
            }
        };

        let mut facts = parse_facts(&response);
        // Stamp provenance so every fact can be traced back to its source chunk.
        for fact in &mut facts {
            fact.source_file = Some(filename.clone());
            fact.source_chunk = Some(i + 1);
            fact.source_offset = Some(*_offset);
        }

        eprintln!(" {} facts", facts.len());
        all_facts.extend(facts);
    }

    // Deduplicate by claim text
    // (case-insensitive; overlapping windows routinely re-extract the same claim)
    let mut seen = HashSet::new();
    let before = all_facts.len();
    all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
    let dupes = before - all_facts.len();
    if dupes > 0 {
        log(&format!("{} duplicates removed", dupes));
    }

    log(&format!("Total: {} unique facts", all_facts.len()));
    Ok(all_facts)
}
|
|
|
|
|
|
|
|
|
|
/// Mine a transcript and store facts in the capnp store.
|
|
|
|
|
/// Returns the number of facts stored.
|
2026-03-08 18:31:31 -04:00
|
|
|
/// The optional `progress` callback receives status strings for daemon display.
|
|
|
|
|
pub fn mine_and_store(
|
|
|
|
|
path: &Path,
|
|
|
|
|
progress: Option<&dyn Fn(&str)>,
|
|
|
|
|
) -> Result<usize, String> {
|
|
|
|
|
let facts = mine_transcript(path, false, progress)?;
|
2026-03-05 15:31:08 -05:00
|
|
|
|
|
|
|
|
let filename = path.file_name()
|
|
|
|
|
.map(|n| n.to_string_lossy().to_string())
|
|
|
|
|
.unwrap_or_else(|| "unknown".into());
|
|
|
|
|
|
|
|
|
|
let key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
|
2026-03-08 18:31:31 -04:00
|
|
|
|
|
|
|
|
// Always write a marker so we don't re-queue empty transcripts
|
|
|
|
|
let json = if facts.is_empty() {
|
|
|
|
|
"[]".to_string()
|
|
|
|
|
} else {
|
|
|
|
|
serde_json::to_string_pretty(&facts)
|
|
|
|
|
.map_err(|e| format!("serialize facts: {}", e))?
|
|
|
|
|
};
|
2026-03-05 15:31:08 -05:00
|
|
|
|
|
|
|
|
let mut store = store::Store::load()?;
|
|
|
|
|
store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
|
|
|
|
|
store.save()?;
|
|
|
|
|
|
|
|
|
|
eprintln!(" Stored {} facts as {}", facts.len(), key);
|
|
|
|
|
Ok(facts.len())
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Mine transcripts, returning all facts. Skips files with fewer than min_messages.
|
|
|
|
|
pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result<Vec<Fact>, String> {
|
|
|
|
|
let mut all_facts = Vec::new();
|
|
|
|
|
|
|
|
|
|
for path in paths {
|
|
|
|
|
let messages = extract_conversation(path);
|
|
|
|
|
if messages.len() < min_messages {
|
|
|
|
|
eprintln!("Skipping {} ({} messages < {})",
|
|
|
|
|
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
|
|
|
|
|
messages.len(), min_messages);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2026-03-08 18:31:31 -04:00
|
|
|
let facts = mine_transcript(path, dry_run, None)?;
|
2026-03-05 15:31:08 -05:00
|
|
|
all_facts.extend(facts);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(all_facts)
|
|
|
|
|
}
|