experience-mine: transcript-level dedup via content hash
Running the miner twice on the same transcript produced near-duplicate entries because:

1. Prompt-based dedup (passing recent entries to Sonnet) doesn't catch semantic duplicates written in a different emotional register.
2. Key-based dedup (timestamp + content slug) fails because Sonnet assigns different timestamps and wording on each run.

Fix: hash the transcript file content before mining, and store the hash as a _mined-transcripts node. If a node for that hash is already present, skip the transcript.

Limitation: this doesn't catch overlapping content when a live transcript grows between runs, because the content hash changes. This is fine — the miner is intended for archived conversations, not live ones.

Tested: a second run on the same transcript was correctly skipped with the "Already mined this transcript" message.
This commit is contained in:
parent
30d176d455
commit
d8de2f33f4
1 changed file with 24 additions and 1 deletion
|
|
@ -12,7 +12,9 @@ use crate::capnp_store::{self, Store};
|
|||
use crate::neuro;
|
||||
|
||||
use regex::Regex;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::fs;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
|
||||
|
|
@ -1777,6 +1779,20 @@ pub fn experience_mine(
|
|||
jsonl_path: &str,
|
||||
) -> Result<usize, String> {
|
||||
println!("Experience mining: {}", jsonl_path);
|
||||
|
||||
// Transcript-level dedup: hash the file content and check if already mined
|
||||
let transcript_bytes = fs::read(jsonl_path)
|
||||
.map_err(|e| format!("reading transcript: {}", e))?;
|
||||
let mut hasher = DefaultHasher::new();
|
||||
transcript_bytes.hash(&mut hasher);
|
||||
let hash = hasher.finish();
|
||||
let dedup_key = format!("_mined-transcripts.md#h-{:016x}", hash);
|
||||
|
||||
if store.nodes.contains_key(&dedup_key) {
|
||||
println!(" Already mined this transcript ({}), skipping.", &dedup_key[24..]);
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let messages = extract_conversation(jsonl_path)?;
|
||||
let conversation = format_conversation(&messages);
|
||||
println!(" {} messages, {} chars", messages.len(), conversation.len());
|
||||
|
|
@ -1868,9 +1884,16 @@ pub fn experience_mine(
|
|||
println!(" + [{}] {}...", ts, preview);
|
||||
}
|
||||
|
||||
// Record this transcript as mined (even if count == 0, to prevent re-runs)
|
||||
let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count);
|
||||
let mut dedup_node = Store::new_node(&dedup_key, &dedup_content);
|
||||
dedup_node.category = capnp_store::Category::Task;
|
||||
let _ = store.upsert_node(dedup_node);
|
||||
|
||||
if count > 0 {
|
||||
store.save()?;
|
||||
println!(" Saved {} new journal entries.", count);
|
||||
}
|
||||
store.save()?;
|
||||
println!("Done: {} new entries mined.", count);
|
||||
Ok(count)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue