observation extractor: per-segment dedup using shared transcript helpers

The observation agent was re-extracting the same conversations every
consolidation run because select_conversation_fragments had no tracking
of what had already been processed.

Extract shared helpers from the fact miner's dedup pattern:
  - transcript_key(prefix, path): namespaced key from prefix + filename
  - segment_key(base, idx): per-segment key
  - keys_with_prefix(prefix): bulk lookup from store
  - unmined_segments(path, prefix, known): find unprocessed segments
  - mark_segment(...): mark a segment as processed

Rewrite select_conversation_fragments to use these helpers with the
_observed-transcripts prefix. Each compaction segment within a
transcript is now tracked independently — new segments from ongoing
sessions get picked up, while already-processed segments are skipped.
This commit is contained in:
ProofOfConcept 2026-03-12 18:03:26 -04:00 committed by Kent Overstreet
parent 9d1d690f17
commit 10499a98ea
2 changed files with 121 additions and 63 deletions

View file

@ -40,25 +40,78 @@ pub fn is_transcript_mined(store: &impl StoreView, path: &str) -> bool {
/// Dedup key for a transcript based on its filename (UUID).
/// Used by the daemon reconcile loop — no file reads needed.
/// Dedup key for a transcript based on its filename (UUID).
/// Used by the daemon reconcile loop — no file reads needed.
pub fn transcript_filename_key(path: &str) -> String {
    // The miner's namespace is fixed; only the path varies per call.
    const MINED_PREFIX: &str = "_mined-transcripts";
    transcript_key(MINED_PREFIX, path)
}
/// Build a namespaced transcript key from a prefix and path.
///
/// The key has the shape `{prefix}#f-{filename}`, where `filename` is the
/// path's file stem (name without extension — typically the session UUID).
/// Falls back to the raw path string when no file stem exists (e.g. an
/// empty path), so the function never fails.
pub fn transcript_key(prefix: &str, path: &str) -> String {
    let filename = std::path::Path::new(path)
        .file_stem()
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| path.to_string());
    // Diff residue previously left a second, hard-coded format! here;
    // only the parameterized form is correct.
    format!("{}#f-{}", prefix, filename)
}
/// Per-segment key: `{base_key}.{segment_index}`
pub fn segment_key(base: &str, segment: usize) -> String {
    // Build by hand instead of format! — same output, one allocation.
    let mut key = String::with_capacity(base.len() + 21);
    key.push_str(base);
    key.push('.');
    key.push_str(&segment.to_string());
    key
}
/// Load all keys with a given prefix from the store.
///
/// Returns an empty set when the store view cannot be loaded — callers
/// treat "no keys" and "no store" the same way (everything looks unmined).
pub fn keys_with_prefix(prefix: &str) -> HashSet<String> {
    use crate::store::AnyView;
    let view = match AnyView::load() {
        Ok(v) => v,
        Err(_) => return HashSet::new(),
    };
    let mut matched = HashSet::new();
    view.for_each_node(|key, _, _| {
        if key.starts_with(prefix) {
            matched.insert(key.to_owned());
        }
    });
    matched
}
/// Find unmined segments for a transcript file against a set of known keys.
/// Returns segment indices that haven't been processed yet, paired with
/// each segment's messages.
///
/// A transcript that cannot be parsed yields an empty Vec — nothing to mine.
pub fn unmined_segments(
    path: &std::path::Path,
    prefix: &str,
    known: &HashSet<String>,
) -> Vec<(usize, Vec<(usize, String, String, String)>)> {
    let path_str = path.to_string_lossy();
    let base = transcript_key(prefix, &path_str);
    let Ok(messages) = extract_conversation(&path_str) else {
        return Vec::new();
    };
    // Keep only the segments whose per-index key is absent from `known`.
    let mut pending = Vec::new();
    for (idx, segment) in split_on_compaction(messages).into_iter().enumerate() {
        if !known.contains(&segment_key(&base, idx)) {
            pending.push((idx, segment));
        }
    }
    pending
}
/// Mark a segment as processed in the store.
///
/// Upserts a node whose key is `{transcript_key(prefix, path)}.{segment}`,
/// carrying `content` and tagged with `provenance`. Upsert failures are
/// deliberately ignored — marking is best-effort.
pub fn mark_segment(
    store: &mut Store,
    path: &str,
    prefix: &str,
    segment: usize,
    provenance: &str,
    content: &str,
) {
    let key = segment_key(&transcript_key(prefix, path), segment);
    let mut node = new_node(&key, content);
    node.provenance = provenance.to_string();
    let _ = store.upsert_node(node);
}
/// Get the set of all mined transcript keys (both content-hash and filename)
/// from the store. Load once per daemon tick, check many.
pub fn mined_transcript_keys() -> HashSet<String> {
use crate::store::AnyView;
let Ok(view) = AnyView::load() else { return HashSet::new() };
let mut keys = HashSet::new();
view.for_each_node(|key, _, _| {
if key.starts_with("_mined-transcripts#") {
keys.insert(key.to_string());
}
});
keys
keys_with_prefix("_mined-transcripts#")
}