observation extractor: per-segment dedup using shared transcript helpers

The observation agent was re-extracting the same conversations every
consolidation run because select_conversation_fragments had no tracking
of what had already been processed.

Extract shared helpers from the fact miner's dedup pattern:
  - transcript_key(prefix, path): namespaced key from prefix + filename
  - segment_key(base, idx): per-segment key
  - keys_with_prefix(prefix): bulk lookup from store
  - unmined_segments(path, prefix, known): find unprocessed segments
  - mark_segment(...): mark a segment as processed

Rewrite select_conversation_fragments to use these helpers with the
_observed-transcripts prefix. Each compaction segment within a
transcript is now tracked independently — new segments from ongoing
sessions get picked up, while already-processed segments are skipped.
This commit is contained in:
ProofOfConcept 2026-03-12 18:03:26 -04:00 committed by Kent Overstreet
parent 9d1d690f17
commit 10499a98ea
2 changed files with 121 additions and 63 deletions

View file

@ -40,25 +40,78 @@ pub fn is_transcript_mined(store: &impl StoreView, path: &str) -> bool {
/// Dedup key for a transcript based on its filename (UUID).
/// Used by the daemon reconcile loop — no file reads needed.
/// Dedup key for a transcript based on its filename (UUID).
/// Used by the daemon reconcile loop — no file reads needed.
pub fn transcript_filename_key(path: &str) -> String {
    // The miner's namespace is fixed; only the path varies per call.
    const MINED_PREFIX: &str = "_mined-transcripts";
    transcript_key(MINED_PREFIX, path)
}
/// Build a namespaced transcript key from a prefix and path.
///
/// The key has the shape `{prefix}#f-{filename}`, where `filename` is the
/// path's file stem (name without extension — typically the session UUID).
/// Falls back to the raw path string when no file stem exists (e.g. an
/// empty path), so the function never fails.
pub fn transcript_key(prefix: &str, path: &str) -> String {
    let filename = std::path::Path::new(path)
        .file_stem()
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| path.to_string());
    // Diff residue previously left a second, hard-coded format! here;
    // only the parameterized form is correct.
    format!("{}#f-{}", prefix, filename)
}
/// Per-segment key: `{base_key}.{segment_index}`
pub fn segment_key(base: &str, segment: usize) -> String {
    // Build by hand instead of format! — same output, one allocation.
    let mut key = String::with_capacity(base.len() + 21);
    key.push_str(base);
    key.push('.');
    key.push_str(&segment.to_string());
    key
}
/// Load all keys with a given prefix from the store.
///
/// Returns an empty set when the store view cannot be loaded — callers
/// treat "no keys" and "no store" the same way (everything looks unmined).
pub fn keys_with_prefix(prefix: &str) -> HashSet<String> {
    use crate::store::AnyView;
    let view = match AnyView::load() {
        Ok(v) => v,
        Err(_) => return HashSet::new(),
    };
    let mut matched = HashSet::new();
    view.for_each_node(|key, _, _| {
        if key.starts_with(prefix) {
            matched.insert(key.to_owned());
        }
    });
    matched
}
/// Find unmined segments for a transcript file against a set of known keys.
/// Returns segment indices that haven't been processed yet, paired with
/// each segment's messages.
///
/// A transcript that cannot be parsed yields an empty Vec — nothing to mine.
pub fn unmined_segments(
    path: &std::path::Path,
    prefix: &str,
    known: &HashSet<String>,
) -> Vec<(usize, Vec<(usize, String, String, String)>)> {
    let path_str = path.to_string_lossy();
    let base = transcript_key(prefix, &path_str);
    let Ok(messages) = extract_conversation(&path_str) else {
        return Vec::new();
    };
    // Keep only the segments whose per-index key is absent from `known`.
    let mut pending = Vec::new();
    for (idx, segment) in split_on_compaction(messages).into_iter().enumerate() {
        if !known.contains(&segment_key(&base, idx)) {
            pending.push((idx, segment));
        }
    }
    pending
}
/// Mark a segment as processed in the store.
///
/// Upserts a node whose key is `{transcript_key(prefix, path)}.{segment}`,
/// carrying `content` and tagged with `provenance`. Upsert failures are
/// deliberately ignored — marking is best-effort.
pub fn mark_segment(
    store: &mut Store,
    path: &str,
    prefix: &str,
    segment: usize,
    provenance: &str,
    content: &str,
) {
    let key = segment_key(&transcript_key(prefix, path), segment);
    let mut node = new_node(&key, content);
    node.provenance = provenance.to_string();
    let _ = store.upsert_node(node);
}
/// Get the set of all mined transcript keys (both content-hash and filename)
/// from the store. Load once per daemon tick, check many.
pub fn mined_transcript_keys() -> HashSet<String> {
use crate::store::AnyView;
let Ok(view) = AnyView::load() else { return HashSet::new() };
let mut keys = HashSet::new();
view.for_each_node(|key, _, _| {
if key.starts_with("_mined-transcripts#") {
keys.insert(key.to_string());
}
});
keys
keys_with_prefix("_mined-transcripts#")
}