daemon: resource-gated scheduling, fact-mine integration, systemd

Daemon improvements: - Use jobkit's new .resource(&pool) API instead of pool.acquire() inside closures — tasks wait in the pool's queue, not on worker threads - LLM pool capacity 1 to control token burn rate - Workers reduced from 7 to 4 (2 loops + 2 for jobs) - Session watcher: per-tick stats logging (stale/mined/open/queued) - Log rotation: truncate to last half when over 1MB - Duration tracking and stderr capture for job failures - Process uptime shown in status display - Replace fuser subprocess with /proc/*/fd/ scan Fact-mine integration: - mine_and_store() writes extracted facts to store nodes - fact-mine-store CLI subcommand for daemon to shell out to - Chained as dependent task after experience-mine per session Infra: - systemd user service at ~/.config/systemd/user/poc-memory.service - .cargo/config.toml: force frame pointers for profiling
2026-03-05 15:31:08 -05:00 · 2026-03-05 15:31:08 -05:00 · 37e0ce96ea
commit 37e0ce96ea
parent 552d255dc3
4 changed files with 656 additions and 125 deletions
--- a/src/fact_mine.rs
+++ b/src/fact_mine.rs
@ -0,0 +1,312 @@
+// fact_mine.rs — extract atomic factual claims from conversation transcripts
+//
+// Chunks conversation text into overlapping windows, sends each to Haiku
+// for extraction, deduplicates by claim text. Output: JSON array of facts.
+//
+// Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction.
+
+use crate::llm;
+use crate::store::{self, Provenance};
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashSet;
+use std::fs;
+use std::path::Path;
+
+// Rough characters-per-token ratio used to turn the token budgets below into
+// character budgets (an approximation, not a tokenizer measurement).
+const CHARS_PER_TOKEN: usize = 4;
+// Target size of one extraction window, in tokens.
+const WINDOW_TOKENS: usize = 2000;
+// Amount of text shared between consecutive windows, in tokens.
+const OVERLAP_TOKENS: usize = 200;
+// The same budgets converted via CHARS_PER_TOKEN; chunk_text() applies them
+// to byte offsets of the formatted conversation text.
+const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN;
+const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN;
+
+// Instruction prefix for the extraction model. mine_transcript() appends one
+// conversation chunk directly after the trailing "CONVERSATION EXCERPT" marker.
+// The JSON element shape described here matches the model-provided fields of
+// `Fact` (claim, domain, confidence, speaker).
+const EXTRACTION_PROMPT: &str = r#"Extract atomic factual claims from this conversation excerpt.
+
+Each claim should be:
+- A single verifiable statement
+- Specific enough to be useful in isolation
+- Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal,
+  bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences,
+  linux/kernel, memory/design, identity/personal)
+- Tagged with confidence: "stated" (explicitly said), "implied" (logically follows),
+  or "speculative" (hypothesis, not confirmed)
+- Include which speaker said it (Kent, PoC/ProofOfConcept, or Unknown)
+
+Do NOT extract:
+- Opinions or subjective assessments
+- Conversational filler or greetings
+- Things that are obviously common knowledge
+- Restatements of the same fact (pick the clearest version)
+- System messages, tool outputs, or error logs (extract what was LEARNED from them)
+- Anything about the conversation itself ("Kent and PoC discussed...")
+
+Output as a JSON array. Each element:
+{
+    "claim": "the exact factual statement",
+    "domain": "category/subcategory",
+    "confidence": "stated|implied|speculative",
+    "speaker": "Kent|PoC|Unknown"
+}
+
+If the excerpt contains no extractable facts, output an empty array: []
+
+--- CONVERSATION EXCERPT ---
+"#;
+
+/// One extracted factual claim.
+///
+/// `claim`, `domain`, `confidence` and `speaker` are the fields the extraction
+/// prompt asks the model to produce. The `source_*` fields are filled in by
+/// `mine_transcript` after parsing and are left out of the serialized form
+/// when they are `None`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Fact {
+    /// The factual statement itself; also the deduplication key (compared
+    /// case-insensitively in `mine_transcript`).
+    pub claim: String,
+    /// Category tag in the "category/subcategory" form requested by the prompt.
+    pub domain: String,
+    /// Confidence label; the prompt asks for "stated", "implied" or "speculative".
+    pub confidence: String,
+    /// Who made the claim; the prompt asks for "Kent", "PoC" or "Unknown".
+    pub speaker: String,
+    /// File name of the transcript the fact was mined from.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub source_file: Option<String>,
+    /// 1-based index of the chunk that produced the fact.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub source_chunk: Option<usize>,
+    /// Byte offset of that chunk within the formatted conversation text.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub source_offset: Option<usize>,
+}
+
+// One user or assistant message pulled out of a transcript by
+// extract_conversation().
+struct Message {
+    // Speaker label: "Kent" for "user" entries, "PoC" for "assistant" entries.
+    role: String,
+    // Trimmed message text (text blocks joined with newlines).
+    text: String,
+    // Raw "timestamp" string from the transcript entry, or "" when absent.
+    timestamp: String,
+}
+
+/// Read a JSONL transcript and collect its user/assistant text messages.
+///
+/// Best-effort by design: an unreadable file yields an empty list, and lines
+/// that are not valid JSON, are not of type "user"/"assistant", or carry no
+/// usable content are skipped. Text blocks containing `<system-reminder>` are
+/// dropped, and messages whose trimmed text is shorter than 20 bytes are
+/// ignored.
+fn extract_conversation(path: &Path) -> Vec<Message> {
+    let raw = match fs::read_to_string(path) {
+        Ok(raw) => raw,
+        Err(_) => return Vec::new(),
+    };
+
+    let mut collected = Vec::new();
+    for line in raw.lines() {
+        let entry: serde_json::Value = match serde_json::from_str(line) {
+            Ok(value) => value,
+            Err(_) => continue,
+        };
+
+        // Only user/assistant entries are conversation; map them to speaker names.
+        let kind = entry.get("type").and_then(|v| v.as_str()).unwrap_or("");
+        let role = match kind {
+            "user" => "Kent",
+            "assistant" => "PoC",
+            _ => continue,
+        };
+
+        // Content lives under "message" when present, otherwise on the entry itself.
+        let body = entry.get("message").unwrap_or(&entry);
+        let joined = match body.get("content") {
+            Some(serde_json::Value::String(s)) => s.clone(),
+            Some(serde_json::Value::Array(blocks)) => {
+                let mut parts: Vec<&str> = Vec::new();
+                for block in blocks {
+                    let Some(fields) = block.as_object() else { continue };
+                    let is_text = fields.get("type").and_then(|v| v.as_str()) == Some("text");
+                    if !is_text {
+                        continue;
+                    }
+                    let Some(piece) = fields.get("text").and_then(|v| v.as_str()) else { continue };
+                    if piece.contains("<system-reminder>") {
+                        continue;
+                    }
+                    parts.push(piece);
+                }
+                parts.join("\n")
+            }
+            _ => continue,
+        };
+
+        let text = joined.trim().to_string();
+        if text.len() < 20 {
+            continue;
+        }
+
+        let timestamp = entry.get("timestamp")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+
+        collected.push(Message { role: role.to_string(), text, timestamp });
+    }
+
+    collected
+}
+
+/// Format messages into a single text for chunking.
+///
+/// Each message becomes `[role timestamp] text` (or `[role] text` when no
+/// usable timestamp is available), and messages are separated by a blank
+/// line. Messages longer than 3000 bytes are cut at a char boundary at or
+/// below byte 2800 and marked as truncated.
+fn format_for_extraction(messages: &[Message]) -> String {
+    messages.iter()
+        .map(|msg| {
+            let text = if msg.text.len() > 3000 {
+                // Find a char boundary near 2800
+                let trunc = msg.text.floor_char_boundary(2800);
+                format!("{}\n[...truncated...]", &msg.text[..trunc])
+            } else {
+                msg.text.clone()
+            };
+            // Keep only the first 19 bytes of the timestamp. `get` returns
+            // None both when the timestamp is shorter than that and when byte
+            // 19 is not a char boundary, so a malformed timestamp degrades to
+            // "no timestamp" instead of panicking.
+            match msg.timestamp.get(..19) {
+                Some(ts) => format!("[{} {}] {}", msg.role, ts, text),
+                None => format!("[{}] {}", msg.role, text),
+            }
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+/// Split text into overlapping windows, breaking at paragraph boundaries.
+///
+/// Returns `(byte_offset, slice)` pairs. Each window spans at most
+/// `WINDOW_CHARS` bytes; when a blank-line paragraph break is found past the
+/// midpoint of a window, the window ends there instead. Consecutive windows
+/// overlap by up to `OVERLAP_CHARS` bytes. Empty input yields no chunks.
+fn chunk_text(text: &str) -> Vec<(usize, &str)> {
+    let mut chunks = Vec::new();
+    let mut start = 0;
+
+    while start < text.len() {
+        let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len()));
+
+        // Try to break at a paragraph boundary
+        if end < text.len() {
+            if let Some(para) = text[start..end].rfind("\n\n") {
+                if para > WINDOW_CHARS / 2 {
+                    end = start + para;
+                }
+            }
+        }
+
+        chunks.push((start, &text[start..end]));
+
+        // Once a window reaches the end of the text everything is covered.
+        // Stepping back by the overlap here would only emit a trailing chunk
+        // that is fully contained in the one just pushed.
+        if end >= text.len() {
+            break;
+        }
+
+        // Step back to create the overlap, but always make forward progress.
+        let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS));
+        start = if next <= start { end } else { next };
+    }
+
+    chunks
+}
+
+/// Parse JSON facts from model response.
+///
+/// Tolerates a surrounding markdown code fence and any text before or after
+/// the JSON array. Returns an empty list when no array can be located or the
+/// array does not deserialize into `Fact` values.
+fn parse_facts(response: &str) -> Vec<Fact> {
+    let cleaned = response.trim();
+    // Strip markdown code block
+    let cleaned = if cleaned.starts_with("```") {
+        cleaned.lines()
+            .filter(|l| !l.starts_with("```"))
+            .collect::<Vec<_>>()
+            .join("\n")
+    } else {
+        cleaned.to_string()
+    };
+
+    // Find JSON array: first '[' through last ']'
+    let (Some(start), Some(end)) = (cleaned.find('['), cleaned.rfind(']')) else {
+        return Vec::new();
+    };
+
+    // Checked slicing: if the last ']' comes before the first '[' the range
+    // is inverted, which would panic with plain indexing. Treat it as
+    // "no facts" instead.
+    let Some(json) = cleaned.get(start..=end) else { return Vec::new() };
+
+    serde_json::from_str(json).unwrap_or_default()
+}
+
+/// Mine a single transcript for atomic facts.
+///
+/// Extracts the conversation, formats and chunks it, sends each chunk to the
+/// model with `EXTRACTION_PROMPT`, tags every returned fact with its source
+/// file, 1-based chunk number and chunk byte offset, and finally removes
+/// duplicates by case-insensitive claim text. Progress is reported on stderr.
+///
+/// With `dry_run` set, only a preview of each chunk is printed and no model
+/// calls are made; the result is then an empty list.
+///
+/// A failed model call for a chunk is logged and that chunk is skipped; as
+/// written, this function does not return `Err` itself.
+pub fn mine_transcript(path: &Path, dry_run: bool) -> Result<Vec<Fact>, String> {
+    let filename = path.file_name()
+        .map(|n| n.to_string_lossy().to_string())
+        .unwrap_or_else(|| "unknown".into());
+    eprintln!("Mining: {}", filename);
+
+    let messages = extract_conversation(path);
+    if messages.is_empty() {
+        eprintln!("  No messages found");
+        return Ok(Vec::new());
+    }
+    eprintln!("  {} messages extracted", messages.len());
+
+    let text = format_for_extraction(&messages);
+    let chunks = chunk_text(&text);
+    eprintln!("  {} chunks ({} chars)", chunks.len(), text.len());
+
+    if dry_run {
+        for (i, (offset, chunk)) in chunks.iter().enumerate() {
+            eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len());
+            // Cut the preview on a char boundary: byte 500 may fall inside a
+            // multi-byte character, and slicing there would panic.
+            let preview = &chunk[..chunk.floor_char_boundary(500)];
+            eprintln!("{}", preview);
+            if preview.len() < chunk.len() {
+                eprintln!("  ... ({} more chars)", chunk.len() - preview.len());
+            }
+        }
+        return Ok(Vec::new());
+    }
+
+    let mut all_facts = Vec::new();
+    for (i, (offset, chunk)) in chunks.iter().enumerate() {
+        eprint!("  Chunk {}/{} ({} chars)...", i + 1, chunks.len(), chunk.len());
+
+        let prompt = format!("{}{}", EXTRACTION_PROMPT, chunk);
+        let response = match llm::call_haiku(&prompt) {
+            Ok(r) => r,
+            Err(e) => {
+                // Best-effort: one failed chunk should not abort the transcript.
+                eprintln!(" error: {}", e);
+                continue;
+            }
+        };
+
+        let mut facts = parse_facts(&response);
+        for fact in &mut facts {
+            fact.source_file = Some(filename.clone());
+            fact.source_chunk = Some(i + 1);
+            fact.source_offset = Some(*offset);
+        }
+
+        eprintln!(" {} facts", facts.len());
+        all_facts.extend(facts);
+    }
+
+    // Deduplicate by claim text (case-insensitive); the first occurrence wins.
+    let mut seen = HashSet::new();
+    let before = all_facts.len();
+    all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
+    let dupes = before - all_facts.len();
+    if dupes > 0 {
+        eprintln!("  {} duplicates removed", dupes);
+    }
+
+    eprintln!("  Total: {} unique facts", all_facts.len());
+    Ok(all_facts)
+}
+
+/// Mine a transcript and persist the resulting facts in the store.
+///
+/// All facts from one transcript are written as pretty-printed JSON into a
+/// single node keyed `_facts-<transcript file name without ".jsonl">`, with
+/// `Provenance::AgentFactMine`. Nothing is written when no facts were found.
+///
+/// Returns the number of facts stored; errors from mining, serialization,
+/// or the store are propagated.
+pub fn mine_and_store(path: &Path) -> Result<usize, String> {
+    let facts = mine_transcript(path, false)?;
+    let count = facts.len();
+    if count == 0 {
+        return Ok(0);
+    }
+
+    let filename = match path.file_name() {
+        Some(name) => name.to_string_lossy().to_string(),
+        None => "unknown".into(),
+    };
+
+    // One node per transcript, keyed by its file name.
+    let stem = filename.trim_end_matches(".jsonl");
+    let key = format!("_facts-{}", stem);
+    let json = serde_json::to_string_pretty(&facts)
+        .map_err(|e| format!("serialize facts: {}", e))?;
+
+    let mut store = store::Store::load()?;
+    store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
+    store.save()?;
+
+    eprintln!("  Stored {} facts as {}", count, key);
+    Ok(count)
+}
+
+/// Mine several transcripts and return all of their facts in one list.
+///
+/// A transcript with fewer than `min_messages` extracted messages is skipped
+/// with a note on stderr. `dry_run` is passed through to `mine_transcript`.
+/// The first error returned by `mine_transcript` aborts the batch.
+pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result<Vec<Fact>, String> {
+    let mut collected = Vec::new();
+
+    for &path in paths {
+        // Cheap pre-check on message count before spending model calls.
+        let count = extract_conversation(path).len();
+        if count < min_messages {
+            let name = path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default();
+            eprintln!("Skipping {} ({} messages < {})",
+                name,
+                count, min_messages);
+            continue;
+        }
+
+        collected.extend(mine_transcript(path, dry_run)?);
+    }
+
+    Ok(collected)
+}