consciousness/poc-memory/src/agents/fact_mine.rs

285 lines
9.4 KiB
Rust
Raw Normal View History

// fact_mine.rs — extract atomic factual claims from conversation transcripts
//
// Chunks conversation text into overlapping windows, sends each to Haiku
// for extraction, deduplicates by claim text. Output: JSON array of facts.
//
// Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction.
use crate::config;
use super::llm;
use super::transcript;
use crate::store::{self, Provenance};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::path::Path;
const CHARS_PER_TOKEN: usize = 4;
const WINDOW_TOKENS: usize = 2000;
const OVERLAP_TOKENS: usize = 200;
const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN;
const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN;
fn extraction_prompt() -> String {
let cfg = config::get();
format!(
r#"Extract atomic factual claims from this conversation excerpt.
Speakers are labeled [{user}] and [{assistant}] in the transcript.
Use their proper names in claims not "the user" or "the assistant."
Each claim should be:
- A single verifiable statement
- Specific enough to be useful in isolation
- Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal,
bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences,
linux/kernel, memory/design, identity/personal)
- Tagged with confidence: "stated" (explicitly said), "implied" (logically follows),
or "speculative" (hypothesis, not confirmed)
- Include which speaker said it ("{user}", "{assistant}", or "Unknown")
Do NOT extract:
- Opinions or subjective assessments
- Conversational filler or greetings
- Things that are obviously common knowledge
- Restatements of the same fact (pick the clearest version)
- System messages, tool outputs, or error logs (extract what was LEARNED from them)
- Anything about the conversation itself ("{user} and {assistant} discussed...")
- Facts only relevant to this specific conversation (e.g. transient file paths, mid-debug state)
Output as a JSON array. Each element:
{{
"claim": "the exact factual statement",
"domain": "category/subcategory",
"confidence": "stated|implied|speculative",
"speaker": "{user}|{assistant}|Unknown"
}}
If the excerpt contains no extractable facts, output an empty array: []
--- CONVERSATION EXCERPT ---
"#, user = cfg.user_name, assistant = cfg.assistant_name)
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Fact {
pub claim: String,
pub domain: String,
pub confidence: String,
pub speaker: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub source_file: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub source_chunk: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub source_offset: Option<usize>,
}
/// Extract user/assistant text messages from a JSONL transcript.
fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
transcript::parse_transcript(path)
.unwrap_or_default()
.into_iter()
.filter(|m| m.text.len() >= 20)
.collect()
}
/// Format messages into a single text for chunking.
fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
let cfg = config::get();
messages.iter()
.map(|msg| {
let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
if ts.is_empty() {
format!("[{}] {}", role, text)
} else {
format!("[{} {}] {}", role, ts, text)
}
})
.collect::<Vec<_>>()
.join("\n\n")
}
/// Split text into overlapping windows, breaking at paragraph boundaries.
fn chunk_text(text: &str) -> Vec<(usize, &str)> {
let mut chunks = Vec::new();
let mut start = 0;
while start < text.len() {
let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len()));
// Try to break at a paragraph boundary
if end < text.len() {
if let Some(para) = text[start..end].rfind("\n\n") {
if para > WINDOW_CHARS / 2 {
end = start + para;
}
}
}
chunks.push((start, &text[start..end]));
let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS));
if next <= start {
start = end;
} else {
start = next;
}
}
chunks
}
/// Parse JSON facts from model response.
fn parse_facts(response: &str) -> Vec<Fact> {
let cleaned = response.trim();
// Strip markdown code block
let cleaned = if cleaned.starts_with("```") {
cleaned.lines()
.filter(|l| !l.starts_with("```"))
.collect::<Vec<_>>()
.join("\n")
} else {
cleaned.to_string()
};
// Find JSON array
let start = cleaned.find('[');
let end = cleaned.rfind(']');
let (Some(start), Some(end)) = (start, end) else { return Vec::new() };
serde_json::from_str(&cleaned[start..=end]).unwrap_or_default()
}
/// Mine a single transcript for atomic facts.
/// The optional `progress` callback receives status strings (e.g. "chunk 3/47").
pub fn mine_transcript(
path: &Path,
dry_run: bool,
progress: Option<&dyn Fn(&str)>,
) -> Result<Vec<Fact>, String> {
let filename = path.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| "unknown".into());
let log = |msg: &str| {
eprintln!("{}", msg);
if let Some(cb) = progress { cb(msg); }
};
log(&format!("Mining: {}", filename));
let messages = extract_messages(path);
if messages.is_empty() {
log("No messages found");
return Ok(Vec::new());
}
log(&format!("{} messages extracted", messages.len()));
let text = format_for_extraction(&messages);
let chunks = chunk_text(&text);
log(&format!("{} chunks ({} chars)", chunks.len(), text.len()));
if dry_run {
for (i, (offset, chunk)) in chunks.iter().enumerate() {
eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len());
eprintln!("{}", crate::util::truncate(chunk, 500, ""));
if chunk.len() > 500 {
eprintln!(" ... ({} more chars)", chunk.len() - 500);
}
}
return Ok(Vec::new());
}
let prompt_prefix = extraction_prompt();
let mut all_facts = Vec::new();
for (i, (_offset, chunk)) in chunks.iter().enumerate() {
let status = format!("chunk {}/{} ({} chars)", i + 1, chunks.len(), chunk.len());
eprint!(" {}...", status);
if let Some(cb) = progress { cb(&status); }
let prompt = format!("{}{}\n\n--- END OF EXCERPT ---\n\nReturn ONLY a JSON array of factual claims, or [] if none.", prompt_prefix, chunk);
let response = match llm::call_haiku("fact-mine", &prompt) {
Ok(r) => r,
Err(e) => {
eprintln!(" error: {}", e);
continue;
}
};
let mut facts = parse_facts(&response);
for fact in &mut facts {
fact.source_file = Some(filename.clone());
fact.source_chunk = Some(i + 1);
fact.source_offset = Some(*_offset);
}
eprintln!(" {} facts", facts.len());
all_facts.extend(facts);
}
// Deduplicate by claim text
let mut seen = HashSet::new();
let before = all_facts.len();
all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
let dupes = before - all_facts.len();
if dupes > 0 {
log(&format!("{} duplicates removed", dupes));
}
log(&format!("Total: {} unique facts", all_facts.len()));
Ok(all_facts)
}
/// Mine a transcript and store facts in the capnp store.
/// Returns the number of facts stored.
/// The optional `progress` callback receives status strings for daemon display.
pub fn mine_and_store(
path: &Path,
progress: Option<&dyn Fn(&str)>,
) -> Result<usize, String> {
let facts = mine_transcript(path, false, progress)?;
let filename = path.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| "unknown".into());
let key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
// Always write a marker so we don't re-queue empty transcripts
let json = if facts.is_empty() {
"[]".to_string()
} else {
serde_json::to_string_pretty(&facts)
.map_err(|e| format!("serialize facts: {}", e))?
};
let mut store = store::Store::load()?;
store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
store.save()?;
eprintln!(" Stored {} facts as {}", facts.len(), key);
Ok(facts.len())
}
/// Mine transcripts, returning all facts. Skips files with fewer than min_messages.
pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result<Vec<Fact>, String> {
let mut all_facts = Vec::new();
for path in paths {
let messages = extract_messages(path);
if messages.len() < min_messages {
eprintln!("Skipping {} ({} messages < {})",
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
messages.len(), min_messages);
continue;
}
let facts = mine_transcript(path, dry_run, None)?;
all_facts.extend(facts);
}
Ok(all_facts)
}