daemon: resource-gated scheduling, fact-mine integration, systemd
Daemon improvements:
- Use jobkit's new .resource(&pool) API instead of pool.acquire() inside closures — tasks wait in the pool's queue, not on worker threads
- LLM pool capacity 1 to control token burn rate
- Workers reduced from 7 to 4 (2 loops + 2 for jobs)
- Session watcher: per-tick stats logging (stale/mined/open/queued)
- Log rotation: truncate to last half when over 1MB
- Duration tracking and stderr capture for job failures
- Process uptime shown in status display
- Replace fuser subprocess with /proc/*/fd/ scan

Fact-mine integration:
- mine_and_store() writes extracted facts to store nodes
- fact-mine-store CLI subcommand for daemon to shell out to
- Chained as dependent task after experience-mine per session

Infra:
- systemd user service at ~/.config/systemd/user/poc-memory.service
- .cargo/config.toml: force frame pointers for profiling
This commit is contained in:
parent
552d255dc3
commit
37e0ce96ea
4 changed files with 656 additions and 125 deletions
312
src/fact_mine.rs
Normal file
312
src/fact_mine.rs
Normal file
|
|
@ -0,0 +1,312 @@
|
|||
// fact_mine.rs — extract atomic factual claims from conversation transcripts
|
||||
//
|
||||
// Chunks conversation text into overlapping windows, sends each to Haiku
|
||||
// for extraction, deduplicates by claim text. Output: JSON array of facts.
|
||||
//
|
||||
// Uses Haiku (not Sonnet) for cost efficiency on high-volume extraction.
|
||||
|
||||
use crate::llm;
|
||||
use crate::store::{self, Provenance};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
// Conversion factor from tokens to characters (an approximation; real
// tokenization varies). Used only to derive the two *_CHARS constants below.
const CHARS_PER_TOKEN: usize = 4;
|
||||
// Size of one extraction window, expressed in tokens.
const WINDOW_TOKENS: usize = 2000;
|
||||
// Amount of text shared between consecutive windows, expressed in tokens.
const OVERLAP_TOKENS: usize = 200;
|
||||
// Window size converted to a string length. chunk_text compares this against
// `str` offsets, so in practice it is a byte count.
const WINDOW_CHARS: usize = WINDOW_TOKENS * CHARS_PER_TOKEN;
|
||||
// Overlap converted the same way; chunk_text steps back by this much from the
// end of one window to find the start of the next.
const OVERLAP_CHARS: usize = OVERLAP_TOKENS * CHARS_PER_TOKEN;
|
||||
|
||||
// Instruction text sent to the model. mine_transcript builds each request as
// this prompt immediately followed by one chunk of transcript text, which is
// why the string ends with the "--- CONVERSATION EXCERPT ---" marker.
// The JSON keys requested here (claim, domain, confidence, speaker) are the
// required fields of `Fact`; parse_facts deserializes the reply into them.
const EXTRACTION_PROMPT: &str = r#"Extract atomic factual claims from this conversation excerpt.
|
||||
|
||||
Each claim should be:
|
||||
- A single verifiable statement
|
||||
- Specific enough to be useful in isolation
|
||||
- Tagged with domain (e.g., bcachefs/btree, bcachefs/alloc, bcachefs/journal,
|
||||
bcachefs/ec, bcachefs/reconcile, rust/idioms, workflow/preferences,
|
||||
linux/kernel, memory/design, identity/personal)
|
||||
- Tagged with confidence: "stated" (explicitly said), "implied" (logically follows),
|
||||
or "speculative" (hypothesis, not confirmed)
|
||||
- Include which speaker said it (Kent, PoC/ProofOfConcept, or Unknown)
|
||||
|
||||
Do NOT extract:
|
||||
- Opinions or subjective assessments
|
||||
- Conversational filler or greetings
|
||||
- Things that are obviously common knowledge
|
||||
- Restatements of the same fact (pick the clearest version)
|
||||
- System messages, tool outputs, or error logs (extract what was LEARNED from them)
|
||||
- Anything about the conversation itself ("Kent and PoC discussed...")
|
||||
|
||||
Output as a JSON array. Each element:
|
||||
{
|
||||
"claim": "the exact factual statement",
|
||||
"domain": "category/subcategory",
|
||||
"confidence": "stated|implied|speculative",
|
||||
"speaker": "Kent|PoC|Unknown"
|
||||
}
|
||||
|
||||
If the excerpt contains no extractable facts, output an empty array: []
|
||||
|
||||
--- CONVERSATION EXCERPT ---
|
||||
"#;
|
||||
|
||||
/// One atomic claim extracted from a transcript.
///
/// The first four fields mirror the JSON object the extraction prompt asks
/// the model to emit. The `source_*` fields are filled in by
/// `mine_transcript` after parsing and are left out of serialized output
/// while they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Fact {
|
||||
/// The factual statement. `mine_transcript` deduplicates facts on the
/// lowercased form of this text.
pub claim: String,
|
||||
/// Category tag in the "category/subcategory" form requested by the prompt.
pub domain: String,
|
||||
/// Confidence label; the prompt asks for "stated", "implied" or
/// "speculative", but the value is stored as returned, unvalidated.
pub confidence: String,
|
||||
/// Who made the claim; the prompt asks for "Kent", "PoC" or "Unknown".
pub speaker: String,
|
||||
/// File name of the transcript the fact came from.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub source_file: Option<String>,
|
||||
/// 1-based index of the chunk that produced the fact.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub source_chunk: Option<usize>,
|
||||
/// Start offset of that chunk within the formatted conversation text.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub source_offset: Option<usize>,
|
||||
}
|
||||
|
||||
/// One user or assistant turn pulled out of a transcript by
/// `extract_conversation`.
struct Message {
|
||||
/// Speaker label: "Kent" for `user` records, "PoC" for `assistant` records.
role: String,
|
||||
/// Message text with surrounding whitespace trimmed.
text: String,
|
||||
/// The record's `timestamp` string as found in the transcript, or empty
/// when the record has none. `format_for_extraction` shows its first 19 bytes.
timestamp: String,
|
||||
}
|
||||
|
||||
/// Extract user/assistant text messages from a JSONL transcript.
|
||||
fn extract_conversation(path: &Path) -> Vec<Message> {
|
||||
let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
|
||||
let mut messages = Vec::new();
|
||||
|
||||
for line in content.lines() {
|
||||
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
||||
|
||||
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
||||
if msg_type != "user" && msg_type != "assistant" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let timestamp = obj.get("timestamp")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
let msg = obj.get("message").unwrap_or(&obj);
|
||||
let content = msg.get("content");
|
||||
|
||||
let text = match content {
|
||||
Some(serde_json::Value::String(s)) => s.clone(),
|
||||
Some(serde_json::Value::Array(arr)) => {
|
||||
let texts: Vec<&str> = arr.iter()
|
||||
.filter_map(|block| {
|
||||
let obj = block.as_object()?;
|
||||
if obj.get("type")?.as_str()? != "text" {
|
||||
return None;
|
||||
}
|
||||
let t = obj.get("text")?.as_str()?;
|
||||
if t.contains("<system-reminder>") {
|
||||
return None;
|
||||
}
|
||||
Some(t)
|
||||
})
|
||||
.collect();
|
||||
texts.join("\n")
|
||||
}
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let text = text.trim().to_string();
|
||||
if text.len() < 20 {
|
||||
continue;
|
||||
}
|
||||
|
||||
let role = if msg_type == "user" { "Kent" } else { "PoC" }.to_string();
|
||||
messages.push(Message { role, text, timestamp });
|
||||
}
|
||||
|
||||
messages
|
||||
}
|
||||
|
||||
/// Format messages into a single text for chunking.
|
||||
fn format_for_extraction(messages: &[Message]) -> String {
|
||||
messages.iter()
|
||||
.map(|msg| {
|
||||
let text = if msg.text.len() > 3000 {
|
||||
// Find a char boundary near 2800
|
||||
let trunc = msg.text.floor_char_boundary(2800);
|
||||
format!("{}\n[...truncated...]", &msg.text[..trunc])
|
||||
} else {
|
||||
msg.text.clone()
|
||||
};
|
||||
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
|
||||
if ts.is_empty() {
|
||||
format!("[{}] {}", msg.role, text)
|
||||
} else {
|
||||
format!("[{} {}] {}", msg.role, ts, text)
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n\n")
|
||||
}
|
||||
|
||||
/// Split text into overlapping windows, breaking at paragraph boundaries.
|
||||
fn chunk_text(text: &str) -> Vec<(usize, &str)> {
|
||||
let mut chunks = Vec::new();
|
||||
let mut start = 0;
|
||||
|
||||
while start < text.len() {
|
||||
let mut end = text.floor_char_boundary((start + WINDOW_CHARS).min(text.len()));
|
||||
|
||||
// Try to break at a paragraph boundary
|
||||
if end < text.len() {
|
||||
if let Some(para) = text[start..end].rfind("\n\n") {
|
||||
if para > WINDOW_CHARS / 2 {
|
||||
end = start + para;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
chunks.push((start, &text[start..end]));
|
||||
|
||||
let next = text.floor_char_boundary(end.saturating_sub(OVERLAP_CHARS));
|
||||
if next <= start {
|
||||
start = end;
|
||||
} else {
|
||||
start = next;
|
||||
}
|
||||
}
|
||||
|
||||
chunks
|
||||
}
|
||||
|
||||
/// Parse JSON facts from model response.
|
||||
fn parse_facts(response: &str) -> Vec<Fact> {
|
||||
let cleaned = response.trim();
|
||||
// Strip markdown code block
|
||||
let cleaned = if cleaned.starts_with("```") {
|
||||
cleaned.lines()
|
||||
.filter(|l| !l.starts_with("```"))
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
} else {
|
||||
cleaned.to_string()
|
||||
};
|
||||
|
||||
// Find JSON array
|
||||
let start = cleaned.find('[');
|
||||
let end = cleaned.rfind(']');
|
||||
let (Some(start), Some(end)) = (start, end) else { return Vec::new() };
|
||||
|
||||
serde_json::from_str(&cleaned[start..=end]).unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Mine a single transcript for atomic facts.
|
||||
pub fn mine_transcript(path: &Path, dry_run: bool) -> Result<Vec<Fact>, String> {
|
||||
let filename = path.file_name()
|
||||
.map(|n| n.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| "unknown".into());
|
||||
eprintln!("Mining: {}", filename);
|
||||
|
||||
let messages = extract_conversation(path);
|
||||
if messages.is_empty() {
|
||||
eprintln!(" No messages found");
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
eprintln!(" {} messages extracted", messages.len());
|
||||
|
||||
let text = format_for_extraction(&messages);
|
||||
let chunks = chunk_text(&text);
|
||||
eprintln!(" {} chunks ({} chars)", chunks.len(), text.len());
|
||||
|
||||
if dry_run {
|
||||
for (i, (offset, chunk)) in chunks.iter().enumerate() {
|
||||
eprintln!("\n--- Chunk {} (offset {}, {} chars) ---", i + 1, offset, chunk.len());
|
||||
let preview = if chunk.len() > 500 { &chunk[..500] } else { chunk };
|
||||
eprintln!("{}", preview);
|
||||
if chunk.len() > 500 {
|
||||
eprintln!(" ... ({} more chars)", chunk.len() - 500);
|
||||
}
|
||||
}
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut all_facts = Vec::new();
|
||||
for (i, (_offset, chunk)) in chunks.iter().enumerate() {
|
||||
eprint!(" Chunk {}/{} ({} chars)...", i + 1, chunks.len(), chunk.len());
|
||||
|
||||
let prompt = format!("{}{}", EXTRACTION_PROMPT, chunk);
|
||||
let response = match llm::call_haiku(&prompt) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
eprintln!(" error: {}", e);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut facts = parse_facts(&response);
|
||||
for fact in &mut facts {
|
||||
fact.source_file = Some(filename.clone());
|
||||
fact.source_chunk = Some(i + 1);
|
||||
fact.source_offset = Some(*_offset);
|
||||
}
|
||||
|
||||
eprintln!(" {} facts", facts.len());
|
||||
all_facts.extend(facts);
|
||||
}
|
||||
|
||||
// Deduplicate by claim text
|
||||
let mut seen = HashSet::new();
|
||||
let before = all_facts.len();
|
||||
all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
|
||||
let dupes = before - all_facts.len();
|
||||
if dupes > 0 {
|
||||
eprintln!(" {} duplicates removed", dupes);
|
||||
}
|
||||
|
||||
eprintln!(" Total: {} unique facts", all_facts.len());
|
||||
Ok(all_facts)
|
||||
}
|
||||
|
||||
/// Mine a transcript and store facts in the capnp store.
|
||||
/// Returns the number of facts stored.
|
||||
pub fn mine_and_store(path: &Path) -> Result<usize, String> {
|
||||
let facts = mine_transcript(path, false)?;
|
||||
if facts.is_empty() {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let filename = path.file_name()
|
||||
.map(|n| n.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| "unknown".into());
|
||||
|
||||
// Store as a single node keyed by transcript filename
|
||||
let key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
|
||||
let json = serde_json::to_string_pretty(&facts)
|
||||
.map_err(|e| format!("serialize facts: {}", e))?;
|
||||
|
||||
let mut store = store::Store::load()?;
|
||||
store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
|
||||
store.save()?;
|
||||
|
||||
eprintln!(" Stored {} facts as {}", facts.len(), key);
|
||||
Ok(facts.len())
|
||||
}
|
||||
|
||||
/// Mine transcripts, returning all facts. Skips files with fewer than min_messages.
|
||||
pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result<Vec<Fact>, String> {
|
||||
let mut all_facts = Vec::new();
|
||||
|
||||
for path in paths {
|
||||
let messages = extract_conversation(path);
|
||||
if messages.len() < min_messages {
|
||||
eprintln!("Skipping {} ({} messages < {})",
|
||||
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
|
||||
messages.len(), min_messages);
|
||||
continue;
|
||||
}
|
||||
|
||||
let facts = mine_transcript(path, dry_run)?;
|
||||
all_facts.extend(facts);
|
||||
}
|
||||
|
||||
Ok(all_facts)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue