fact-mine: progress callbacks, size-sorted queue, fix empty re-queue

Add optional progress callback to mine_transcript/mine_and_store so
the daemon can display per-chunk status. Sort fact-mine queue by file
size so small transcripts drain first. Write empty marker for
transcripts with no facts to avoid re-queuing them.

Also harden the extraction prompt suffix.
This commit is contained in:
ProofOfConcept 2026-03-08 18:31:31 -04:00 committed by Kent Overstreet
parent 63910e987c
commit 2aabad4eda
2 changed files with 46 additions and 23 deletions

View file

@@ -114,7 +114,8 @@ fn job_fact_mine(ctx: &ExecutionContext, path: &str) -> Result<(), TaskError> {
run_job(ctx, &format!("fact-mine {}", path), || {
ctx.log_line("mining facts");
let p = std::path::Path::new(&path);
let count = crate::fact_mine::mine_and_store(p)?;
let progress = |msg: &str| { ctx.set_progress(msg); };
let count = crate::fact_mine::mine_and_store(p, Some(&progress))?;
ctx.log_line(&format!("{} facts stored", count));
Ok(())
})
@@ -465,6 +466,10 @@ pub fn run_daemon() -> Result<(), String> {
}
// Only queue fact-mine when experience backlog is clear
// Sort by file size so small transcripts drain first
needs_fact.sort_by_key(|(_, path_str)| {
fs::metadata(path_str).map(|m| m.len()).unwrap_or(u64::MAX)
});
let mut fact_queued = 0;
if needs_extract.len() == extract_queued {
let fact_budget = MAX_NEW_PER_TICK.saturating_sub(extract_queued);
@@ -551,10 +556,10 @@ pub fn run_daemon() -> Result<(), String> {
if last.is_none_or(|d| d < today) {
log_event("scheduler", "daily-trigger", &today.to_string());
// Decay (no API calls, fast)
choir_sched.spawn(format!("decay:{}", today)).init(|ctx| {
job_decay(ctx)
});
// Decay disabled — version spam and premature demotion
// choir_sched.spawn(format!("decay:{}", today)).init(|ctx| {
// job_decay(ctx)
// });
// Consolidation pipeline: consolidate → knowledge-loop → digest
let consolidate = choir_sched.spawn(format!("consolidate:{}", today))

View file

@@ -214,22 +214,32 @@ fn parse_facts(response: &str) -> Vec<Fact> {
}
/// Mine a single transcript for atomic facts.
pub fn mine_transcript(path: &Path, dry_run: bool) -> Result<Vec<Fact>, String> {
/// The optional `progress` callback receives status strings (e.g. "chunk 3/47").
pub fn mine_transcript(
path: &Path,
dry_run: bool,
progress: Option<&dyn Fn(&str)>,
) -> Result<Vec<Fact>, String> {
let filename = path.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| "unknown".into());
eprintln!("Mining: {}", filename);
let log = |msg: &str| {
eprintln!("{}", msg);
if let Some(cb) = progress { cb(msg); }
};
log(&format!("Mining: {}", filename));
let messages = extract_conversation(path);
if messages.is_empty() {
eprintln!(" No messages found");
log("No messages found");
return Ok(Vec::new());
}
eprintln!(" {} messages extracted", messages.len());
log(&format!("{} messages extracted", messages.len()));
let text = format_for_extraction(&messages);
let chunks = chunk_text(&text);
eprintln!(" {} chunks ({} chars)", chunks.len(), text.len());
log(&format!("{} chunks ({} chars)", chunks.len(), text.len()));
if dry_run {
for (i, (offset, chunk)) in chunks.iter().enumerate() {
@@ -246,9 +256,11 @@ pub fn mine_transcript(path: &Path, dry_run: bool) -> Result<Vec<Fact>, String>
let prompt_prefix = extraction_prompt();
let mut all_facts = Vec::new();
for (i, (_offset, chunk)) in chunks.iter().enumerate() {
eprint!(" Chunk {}/{} ({} chars)...", i + 1, chunks.len(), chunk.len());
let status = format!("chunk {}/{} ({} chars)", i + 1, chunks.len(), chunk.len());
eprint!(" {}...", status);
if let Some(cb) = progress { cb(&status); }
let prompt = format!("{}{}", prompt_prefix, chunk);
let prompt = format!("{}{}\n\n--- END OF EXCERPT ---\n\nReturn ONLY a JSON array of factual claims, or [] if none.", prompt_prefix, chunk);
let response = match llm::call_haiku("fact-mine", &prompt) {
Ok(r) => r,
Err(e) => {
@@ -274,29 +286,35 @@ pub fn mine_transcript(path: &Path, dry_run: bool) -> Result<Vec<Fact>, String>
all_facts.retain(|f| seen.insert(f.claim.to_lowercase()));
let dupes = before - all_facts.len();
if dupes > 0 {
eprintln!(" {} duplicates removed", dupes);
log(&format!("{} duplicates removed", dupes));
}
eprintln!(" Total: {} unique facts", all_facts.len());
log(&format!("Total: {} unique facts", all_facts.len()));
Ok(all_facts)
}
/// Mine a transcript and store facts in the capnp store.
/// Returns the number of facts stored.
pub fn mine_and_store(path: &Path) -> Result<usize, String> {
let facts = mine_transcript(path, false)?;
if facts.is_empty() {
return Ok(0);
}
/// The optional `progress` callback receives status strings for daemon display.
pub fn mine_and_store(
path: &Path,
progress: Option<&dyn Fn(&str)>,
) -> Result<usize, String> {
let facts = mine_transcript(path, false, progress)?;
let filename = path.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| "unknown".into());
// Store as a single node keyed by transcript filename
let key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
let json = serde_json::to_string_pretty(&facts)
.map_err(|e| format!("serialize facts: {}", e))?;
// Always write a marker so we don't re-queue empty transcripts
let json = if facts.is_empty() {
"[]".to_string()
} else {
serde_json::to_string_pretty(&facts)
.map_err(|e| format!("serialize facts: {}", e))?
};
let mut store = store::Store::load()?;
store.upsert_provenance(&key, &json, Provenance::AgentFactMine)?;
@@ -319,7 +337,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
continue;
}
let facts = mine_transcript(path, dry_run)?;
let facts = mine_transcript(path, dry_run, None)?;
all_facts.extend(facts);
}