experience-mine: transcript-level dedup via content hash

Running the miner twice on the same transcript produced near-duplicate
entries because:
1. Prompt-based dedup (passing recent entries to Sonnet) doesn't catch
   semantic duplicates written in a different emotional register
2. Key-based dedup (timestamp + content slug) fails because Sonnet
   assigns different timestamps and wording each run

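Fix: hash the transcript file content before mining and skip the run if
a _mined-transcripts.md#h-<hash> node already exists. After mining, store
the hash under that key (even when zero entries were produced) so the
next run on the same file is skipped.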

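Limitation: this doesn't catch overlapping content when a live transcript
grows between runs (the content hash changes). This is fine — the miner
is intended for archived conversations, not live ones. Note also that the
hash comes from std's DefaultHasher, whose algorithm is not guaranteed to
be stable across Rust releases, so a toolchain upgrade may cause
already-mined transcripts to be mined once more.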

Tested: second run on same transcript correctly skipped with
"Already mined this transcript" message.
This commit is contained in:
ProofOfConcept 2026-03-01 05:18:35 -05:00
parent 30d176d455
commit d8de2f33f4

View file

@ -12,7 +12,9 @@ use crate::capnp_store::{self, Store};
use crate::neuro; use crate::neuro;
use regex::Regex; use regex::Regex;
use std::collections::hash_map::DefaultHasher;
use std::fs; use std::fs;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::process::Command; use std::process::Command;
@ -1777,6 +1779,20 @@ pub fn experience_mine(
jsonl_path: &str, jsonl_path: &str,
) -> Result<usize, String> { ) -> Result<usize, String> {
println!("Experience mining: {}", jsonl_path); println!("Experience mining: {}", jsonl_path);
// Transcript-level dedup: hash the file content and check if already mined
let transcript_bytes = fs::read(jsonl_path)
.map_err(|e| format!("reading transcript: {}", e))?;
let mut hasher = DefaultHasher::new();
transcript_bytes.hash(&mut hasher);
let hash = hasher.finish();
let dedup_key = format!("_mined-transcripts.md#h-{:016x}", hash);
if store.nodes.contains_key(&dedup_key) {
println!(" Already mined this transcript ({}), skipping.", &dedup_key[24..]);
return Ok(0);
}
let messages = extract_conversation(jsonl_path)?; let messages = extract_conversation(jsonl_path)?;
let conversation = format_conversation(&messages); let conversation = format_conversation(&messages);
println!(" {} messages, {} chars", messages.len(), conversation.len()); println!(" {} messages, {} chars", messages.len(), conversation.len());
@ -1868,9 +1884,16 @@ pub fn experience_mine(
println!(" + [{}] {}...", ts, preview); println!(" + [{}] {}...", ts, preview);
} }
// Record this transcript as mined (even if count == 0, to prevent re-runs)
let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count);
let mut dedup_node = Store::new_node(&dedup_key, &dedup_content);
dedup_node.category = capnp_store::Category::Task;
let _ = store.upsert_node(dedup_node);
if count > 0 { if count > 0 {
store.save()?;
println!(" Saved {} new journal entries.", count); println!(" Saved {} new journal entries.", count);
} }
store.save()?;
println!("Done: {} new entries mined.", count);
Ok(count) Ok(count)
} }