diff --git a/src/digest.rs b/src/digest.rs index ce8ee63..6b37017 100644 --- a/src/digest.rs +++ b/src/digest.rs @@ -12,7 +12,9 @@ use crate::capnp_store::{self, Store}; use crate::neuro; use regex::Regex; +use std::collections::hash_map::DefaultHasher; use std::fs; +use std::hash::{Hash, Hasher}; use std::path::{Path, PathBuf}; use std::process::Command; @@ -1777,6 +1779,20 @@ pub fn experience_mine( jsonl_path: &str, ) -> Result { println!("Experience mining: {}", jsonl_path); + + // Transcript-level dedup: hash the file content and check if already mined + let transcript_bytes = fs::read(jsonl_path) + .map_err(|e| format!("reading transcript: {}", e))?; + let mut hasher = DefaultHasher::new(); + transcript_bytes.hash(&mut hasher); + let hash = hasher.finish(); + let dedup_key = format!("_mined-transcripts.md#h-{:016x}", hash); + + if store.nodes.contains_key(&dedup_key) { + println!(" Already mined this transcript ({}), skipping.", &dedup_key[24..]); + return Ok(0); + } + let messages = extract_conversation(jsonl_path)?; let conversation = format_conversation(&messages); println!(" {} messages, {} chars", messages.len(), conversation.len()); @@ -1868,9 +1884,16 @@ pub fn experience_mine( println!(" + [{}] {}...", ts, preview); } + // Record this transcript as mined (even if count == 0, to prevent re-runs) + let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count); + let mut dedup_node = Store::new_node(&dedup_key, &dedup_content); + dedup_node.category = capnp_store::Category::Task; + let _ = store.upsert_node(dedup_node); + if count > 0 { - store.save()?; println!(" Saved {} new journal entries.", count); } + store.save()?; + println!("Done: {} new entries mined.", count); Ok(count) }