From d8de2f33f40cc3b197dec92a30e345fa276411c7 Mon Sep 17 00:00:00 2001
From: ProofOfConcept
Date: Sun, 1 Mar 2026 05:18:35 -0500
Subject: [PATCH] experience-mine: transcript-level dedup via content hash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Running the miner twice on the same transcript produced near-duplicate
entries because:

1. Prompt-based dedup (passing recent entries to Sonnet) doesn't catch
   semantic duplicates written in a different emotional register
2. Key-based dedup (timestamp + content slug) fails because Sonnet
   assigns different timestamps and wording each run

Fix: hash the transcript file content before mining. Store the hash
as a _mined-transcripts node. Skip if already present.

Limitation: doesn't catch overlapping content when a live transcript
grows between runs (content hash changes). This is fine — the miner
is intended for archived conversations, not live ones.

Tested: second run on same transcript correctly skipped with
"Already mined this transcript" message.
--- src/digest.rs | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/digest.rs b/src/digest.rs index ce8ee63..6b37017 100644 --- a/src/digest.rs +++ b/src/digest.rs @@ -12,7 +12,9 @@ use crate::capnp_store::{self, Store}; use crate::neuro; use regex::Regex; +use std::collections::hash_map::DefaultHasher; use std::fs; +use std::hash::{Hash, Hasher}; use std::path::{Path, PathBuf}; use std::process::Command; @@ -1777,6 +1779,20 @@ pub fn experience_mine( jsonl_path: &str, ) -> Result { println!("Experience mining: {}", jsonl_path); + + // Transcript-level dedup: hash the file content and check if already mined + let transcript_bytes = fs::read(jsonl_path) + .map_err(|e| format!("reading transcript: {}", e))?; + let mut hasher = DefaultHasher::new(); + transcript_bytes.hash(&mut hasher); + let hash = hasher.finish(); + let dedup_key = format!("_mined-transcripts.md#h-{:016x}", hash); + + if store.nodes.contains_key(&dedup_key) { + println!(" Already mined this transcript ({}), skipping.", &dedup_key[24..]); + return Ok(0); + } + let messages = extract_conversation(jsonl_path)?; let conversation = format_conversation(&messages); println!(" {} messages, {} chars", messages.len(), conversation.len()); @@ -1868,9 +1884,16 @@ pub fn experience_mine( println!(" + [{}] {}...", ts, preview); } + // Record this transcript as mined (even if count == 0, to prevent re-runs) + let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count); + let mut dedup_node = Store::new_node(&dedup_key, &dedup_content); + dedup_node.category = capnp_store::Category::Task; + let _ = store.upsert_node(dedup_node); + if count > 0 { - store.save()?; println!(" Saved {} new journal entries.", count); } + store.save()?; + println!("Done: {} new entries mined.", count); Ok(count) }