consciousness/src/fact_mine.rs

// fact_mine.rs — extract atomic factual claims from conversation transcripts
//
// Chunks conversation text into overlapping windows, sends each to Haiku
// for extraction, deduplicates by claim text. Output: JSON array of facts.
//
// Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction.

use crate::config;
use crate::llm;
use crate::store::{self, Provenance};

use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fs;
use std::path::Path;

// Chunking budget. Sizes are declared in tokens and converted to character
// counts with a fixed ratio; `chunk_text` applies the resulting values as byte
// lengths (it compares them against `str::len`).

/// Ratio used to convert the token budgets below into character counts
/// (presumably a rough average for the model's tokenizer — not measured here).
const CHARS_PER_TOKEN: usize = 4;
/// Size of one extraction window, in tokens.
const WINDOW_TOKENS: usize = 2000;
/// Amount of text shared between consecutive windows, in tokens.
const OVERLAP_TOKENS: usize = 200;
/// Window size in characters: `WINDOW_TOKENS * CHARS_PER_TOKEN` (8000).
const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN;
/// Overlap size in characters: `OVERLAP_TOKENS * CHARS_PER_TOKEN` (800).
const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN;

/// Build the instruction text that precedes every conversation excerpt.
///
/// The configured user and assistant names are substituted into the template
/// so the model is told to attribute claims to the speakers by name. The
/// returned string ends with the `--- CONVERSATION EXCERPT ---` marker line;
/// `mine_transcript` appends the chunk text directly after it.
fn extraction_prompt() -> String {
    let names = config::get();
    let user = &names.user_name;
    let assistant = &names.assistant_name;
    // The raw string starts at column 0 on purpose: any indentation here would
    // become part of the prompt. `{{` / `}}` are literal braces.
    format!(
r#"Extract atomic factual claims from this conversation excerpt.

Speakers are labeled [{user}] and [{assistant}] in the transcript.
Use their proper names in claims — not "the user" or "the assistant."

Each claim should be:
- A single verifiable statement
- Specific enough to be useful in isolation
- Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal,
  bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences,
  linux/kernel, memory/design, identity/personal)
- Tagged with confidence: "stated" (explicitly said), "implied" (logically follows),
  or "speculative" (hypothesis, not confirmed)
- Include which speaker said it ("{user}", "{assistant}", or "Unknown")

Do NOT extract:
- Opinions or subjective assessments
- Conversational filler or greetings
- Things that are obviously common knowledge
- Restatements of the same fact (pick the clearest version)
- System messages, tool outputs, or error logs (extract what was LEARNED from them)
- Anything about the conversation itself ("{user} and {assistant} discussed...")
- Facts only relevant to this specific conversation (e.g. transient file paths, mid-debug state)

Output as a JSON array. Each element:
{{
    "claim": "the exact factual statement",
    "domain": "category/subcategory",
    "confidence": "stated|implied|speculative",
    "speaker": "{user}|{assistant}|Unknown"
}}

If the excerpt contains no extractable facts, output an empty array: []

--- CONVERSATION EXCERPT ---
"#)
}

/// One atomic factual claim extracted from a transcript.
///
/// The first four fields mirror the JSON object the extraction prompt asks the
/// model to emit. The `source_*` fields are filled in by `mine_transcript`
/// after parsing and are left out of serialized output while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Fact {
    /// The factual statement itself. Deduplication in `mine_transcript` keys
    /// on the lowercased form of this text.
    pub claim: String,
    /// Topic tag; the prompt requests `category/subcategory` form
    /// (e.g. `bcachefs/btree`). Not validated here.
    pub domain: String,
    /// The prompt requests one of `stated`, `implied`, or `speculative`.
    /// Stored as a free-form string; not validated here.
    pub confidence: String,
    /// Who made the claim: the prompt requests the configured user name, the
    /// configured assistant name, or `Unknown`.
    pub speaker: String,
    /// File name (final path component) of the transcript the fact came from.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_file: Option<String>,
    /// 1-based index of the chunk that produced the fact.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_chunk: Option<usize>,
    /// Byte offset of that chunk's start within the formatted conversation
    /// text (not within the original JSONL file).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub source_offset: Option<usize>,
}

/// A single user or assistant turn pulled out of a transcript by
/// `extract_conversation`.
struct Message {
    /// Display name of the speaker: the configured user name or assistant name.
    role: String,
    /// Trimmed text content of the turn.
    text: String,
    /// The record's `timestamp` string, or empty when the record has none.
    timestamp: String,
}

/// Extract user/assistant text messages from a JSONL transcript.
///
/// Each line is parsed as an independent JSON object. A record is kept when:
/// - its `type` is `"user"` or `"assistant"`;
/// - its `content` (looked up under `message`, or on the record itself when
///   there is no `message` key) is either a string or an array of blocks, of
///   which only `type == "text"` blocks are used;
/// - the resulting trimmed text is at least 20 bytes long.
///
/// Lines that are not valid JSON, and records that do not match, are skipped.
///
/// Returns the surviving messages in file order. If the file cannot be read
/// the error is reported on stderr and an empty vector is returned, so callers
/// cannot distinguish "unreadable" from "no messages" by the return value.
fn extract_conversation(path: &Path) -> Vec<Message> {
    let cfg = config::get();
    let content = match fs::read_to_string(path) {
        Ok(content) => content,
        Err(e) => {
            // Surface the failure instead of silently treating the transcript
            // as empty; the return contract is unchanged.
            eprintln!("Cannot read {}: {}", path.display(), e);
            return Vec::new();
        }
    };
    let mut messages = Vec::new();

    for line in content.lines() {
        let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };

        let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if !matches!(msg_type, "user" | "assistant") {
            continue;
        }

        let timestamp = obj.get("timestamp")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        // Content lives under `message` when that key exists, otherwise on the
        // record itself.
        let body = obj.get("message").unwrap_or(&obj);

        let text = match body.get("content") {
            Some(serde_json::Value::String(s)) => s.clone(),
            Some(serde_json::Value::Array(blocks)) => blocks.iter()
                .filter_map(|block| {
                    let fields = block.as_object()?;
                    if fields.get("type")?.as_str()? != "text" {
                        return None;
                    }
                    let t = fields.get("text")?.as_str()?;
                    // A block containing a <system-reminder> tag is dropped whole.
                    (!t.contains("<system-reminder>")).then_some(t)
                })
                .collect::<Vec<&str>>()
                .join("\n"),
            _ => continue,
        };

        let text = text.trim().to_string();
        // Very short turns are skipped (length is measured in bytes).
        if text.len() < 20 {
            continue;
        }

        let role = if msg_type == "user" {
            cfg.user_name.clone()
        } else {
            cfg.assistant_name.clone()
        };
        messages.push(Message { role, text, timestamp });
    }

    messages
}

/// Format messages into a single text for chunking.
///
/// Each message becomes `[role timestamp] text`, or `[role] text` when no
/// usable timestamp is available; messages are separated by a blank line.
///
/// - Message text longer than 3000 bytes is cut at the nearest char boundary
///   at or below byte 2800 and suffixed with a `[...truncated...]` line.
/// - Only the first 19 bytes of the timestamp are shown. A timestamp that is
///   shorter than 19 bytes, or whose byte 19 is not a char boundary, is
///   omitted rather than causing a slicing panic.
fn format_for_extraction(messages: &[Message]) -> String {
    messages.iter()
        .map(|msg| {
            // Borrow the (possibly shortened) text instead of cloning it.
            let (body, suffix) = if msg.text.len() > 3000 {
                let cut = msg.text.floor_char_boundary(2800);
                (&msg.text[..cut], "\n[...truncated...]")
            } else {
                (msg.text.as_str(), "")
            };
            // `str::get` returns None for an out-of-range or mid-character end
            // index, where plain indexing would panic.
            match msg.timestamp.get(..19) {
                Some(ts) => format!("[{} {}] {}{}", msg.role, ts, body, suffix),
                None => format!("[{}] {}{}", msg.role, body, suffix),
            }
        })
        .collect::<Vec<_>>()
        .join("\n\n")
}

/// Split text into overlapping windows, breaking at paragraph boundaries.
///
/// Returns `(byte_offset, slice)` pairs in order, where `byte_offset` is the
/// start of the slice within `text`.
///
/// - Each window spans at most `WINDOW_CHARS` bytes, rounded down to a char
///   boundary.
/// - A window that does not reach the end of `text` is shortened to end at
///   its last blank line (`"\n\n"`), provided that blank line lies past the
///   first half of the window.
/// - The next window starts `OVERLAP_CHARS` bytes before the previous end.
/// - The window that reaches the end of `text` is the last one: no further
///   window is produced for the overlap region, since it would only repeat
///   text already contained in that final window.
///
/// Empty input yields an empty vector.
fn chunk_text(text: &str) -> Vec<(usize, &str)> {
    let mut chunks = Vec::new();
    let mut start = 0;

    while start < text.len() {
        let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len()));

        // Try to break at a paragraph boundary, but only if that keeps more
        // than half a window of text in this chunk.
        if end < text.len() {
            if let Some(para) = text[start..end].rfind("\n\n") {
                if para > WINDOW_CHARS / 2 {
                    end = start + para;
                }
            }
        }

        chunks.push((start, &text[start..end]));

        // Everything up to the end of the text is covered; rewinding by the
        // overlap here would emit a chunk that is a pure suffix of this one.
        if end >= text.len() {
            break;
        }

        // Rewind by the overlap; if that would not move forward, continue
        // from `end` so the loop always makes progress.
        let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS));
        start = if next > start { next } else { end };
    }

    chunks
}

/// Parse JSON facts from model response.
///
/// Tolerates the usual wrappers around the array:
/// - if the trimmed response starts with a Markdown code fence, every line
///   beginning with "```" is removed;
/// - any text before the first `[` and after the last `]` is ignored.
///
/// Returns an empty vector when no bracketed span is found, when the brackets
/// appear in the wrong order, or when the span does not deserialize as a list
/// of `Fact`. Never panics on malformed responses.
fn parse_facts(response: &str) -> Vec<Fact> {
    let trimmed = response.trim();

    // Strip markdown code block. The owned buffer is only allocated when a
    // fence is actually present.
    let unfenced;
    let body: &str = if trimmed.starts_with("```") {
        unfenced = trimmed.lines()
            .filter(|l| !l.starts_with("```"))
            .collect::<Vec<_>>()
            .join("\n");
        &unfenced
    } else {
        trimmed
    };

    // Find JSON array
    let (Some(start), Some(end)) = (body.find('['), body.rfind(']')) else {
        return Vec::new();
    };

    // Checked slicing: if the last ']' precedes the first '[' the range is
    // inverted, and indexing with `[start..=end]` would panic.
    body.get(start..=end)
        .and_then(|json| serde_json::from_str::<Vec<Fact>>(json).ok())
        .unwrap_or_default()
}

/// Mine a single transcript for atomic facts.
///
/// Steps: extract the user/assistant messages, format them into one text,
/// split it into overlapping chunks, send each chunk to Haiku behind the
/// extraction prompt, tag the parsed facts with their source, then drop
/// duplicate claims (case-insensitive; the first occurrence is kept).
///
/// * `path` — JSONL transcript to mine.
/// * `dry_run` — when true, print a preview of every chunk to stderr and
///   return an empty list without calling the model.
/// * `progress` — optional callback that receives status strings
///   (e.g. "chunk 3/47"); the same messages also go to stderr.
///
/// Returns the deduplicated facts. A chunk whose model call fails is reported
/// on stderr and skipped, so the result may be partial or empty while still
/// being `Ok`; no path in this function currently produces `Err`.
pub fn mine_transcript(
    path: &Path,
    dry_run: bool,
    progress: Option<&dyn Fn(&str)>,
) -> Result<Vec<Fact>, String> {
    let filename = path.file_name()
        .map(|n| n.to_string_lossy().to_string())
        .unwrap_or_else(|| "unknown".into());
    // Milestone messages go to both stderr and the optional callback.
    let log = |msg: &str| {
        eprintln!("{}", msg);
        if let Some(cb) = progress { cb(msg); }
    };

    log(&format!("Mining: {}", filename));

    let messages = extract_conversation(path);
    if messages.is_empty() {
        log("No messages found");
        return Ok(Vec::new());
    }
    log(&format!("{} messages extracted", messages.len()));

    let text = format_for_extraction(&messages);
    let chunks = chunk_text(&text);
    log(&format!("{} chunks ({} chars)", chunks.len(), text.len()));

    if dry_run {
        for (i, &(offset, chunk)) in chunks.iter().enumerate() {
            eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len());
            // `floor_char_boundary` clamps to the chunk length, so short
            // chunks are shown in full.
            let preview = &chunk[..chunk.floor_char_boundary(500)];
            eprintln!("{}", preview);
            // Count what was actually hidden: the preview can be shorter than
            // 500 bytes when byte 500 falls inside a multi-byte character.
            let hidden = chunk.len() - preview.len();
            if hidden > 0 {
                eprintln!("  ... ({} more chars)", hidden);
            }
        }
        return Ok(Vec::new());
    }

    let prompt_prefix = extraction_prompt();
    let mut all_facts = Vec::new();
    for (i, &(offset, chunk)) in chunks.iter().enumerate() {
        let status = format!("chunk {}/{} ({} chars)", i + 1, chunks.len(), chunk.len());
        // No newline: the fact count (or error) completes this stderr line.
        eprint!("  {}...", status);
        if let Some(cb) = progress { cb(&status); }

        let prompt = format!("{}{}\n\n--- END OF EXCERPT ---\n\nReturn ONLY a JSON array of factual claims, or [] if none.", prompt_prefix, chunk);
        let response = match llm::call_haiku("fact-mine", &prompt) {
            Ok(r) => r,
            Err(e) => {
                // Best effort: a failed chunk is skipped, not fatal.
                eprintln!(" error: {}", e);
                continue;
            }
        };

        let mut facts = parse_facts(&response);
        for fact in &mut facts {
            fact.source_file = Some(filename.clone());
            fact.source_chunk = Some(i + 1);
            fact.source_offset = Some(offset);
        }

        eprintln!(" {} facts", facts.len());
        all_facts.extend(facts);
    }

    // Deduplicate by claim text (case-insensitive); overlapping chunks can
    // yield the same claim twice.
    let mut seen = HashSet::new();
    let before = all_facts.len();
    all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
    let dupes = before - all_facts.len();
    if dupes > 0 {
        log(&format!("{} duplicates removed", dupes));
    }

    log(&format!("Total: {} unique facts", all_facts.len()));
    Ok(all_facts)
}

/// Mine a transcript and persist the resulting facts in the store.
///
/// The facts are serialized as pretty-printed JSON under the key
/// `_facts-<file name>`, with any trailing `.jsonl` removed from the file
/// name. An entry is written even when nothing was extracted (as `[]`), so an
/// empty transcript still leaves a marker behind and is not re-queued.
///
/// * `path` — JSONL transcript to mine.
/// * `progress` — optional status callback, forwarded to `mine_transcript`
///   for daemon display.
///
/// Returns the number of facts stored. Errors from mining, serialization, or
/// any store operation are propagated as `Err`.
pub fn mine_and_store(
    path: &Path,
    progress: Option<&dyn Fn(&str)>,
) -> Result<usize, String> {
    let facts = mine_transcript(path, false, progress)?;
    let count = facts.len();

    let filename = match path.file_name() {
        Some(name) => name.to_string_lossy().to_string(),
        None => "unknown".into(),
    };
    let key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));

    // Always write a marker so we don't re-queue empty transcripts
    let json = if count == 0 {
        "[]".to_string()
    } else {
        serde_json::to_string_pretty(&facts)
            .map_err(|e| format!("serialize facts: {}", e))?
    };

    let mut store = store::Store::load()?;
    store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
    store.save()?;

    eprintln!("  Stored {} facts as {}", count, key);
    Ok(count)
}

/// Mine several transcripts and return all of their facts in one list.
///
/// A transcript with fewer than `min_messages` extracted messages is skipped
/// with a note on stderr. Every other transcript is passed to
/// `mine_transcript` (without a progress callback), which reads the file
/// again itself. Facts are concatenated in input order; no deduplication is
/// done across transcripts.
///
/// * `paths` — transcripts to process, in order.
/// * `min_messages` — minimum message count for a transcript to be mined.
/// * `dry_run` — forwarded to `mine_transcript`.
///
/// Stops and returns the error of the first transcript that fails.
pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result<Vec<Fact>, String> {
    let mut collected = Vec::new();

    for &path in paths {
        let found = extract_conversation(path).len();
        if found < min_messages {
            let name = path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default();
            eprintln!("Skipping {} ({} messages < {})", name, found, min_messages);
            continue;
        }

        collected.extend(mine_transcript(path, dry_run, None)?);
    }

    Ok(collected)
}