forked from kent/consciousness
observation extractor: per-segment dedup using shared transcript helpers
The observation agent was re-extracting the same conversations every consolidation run because select_conversation_fragments had no tracking of what had already been processed. Extract shared helpers from the fact miner's dedup pattern: - transcript_key(prefix, path): namespaced key from prefix + filename - segment_key(base, idx): per-segment key - keys_with_prefix(prefix): bulk lookup from store - unmined_segments(path, prefix, known): find unprocessed segments - mark_segment(...): mark a segment as processed Rewrite select_conversation_fragments to use these with _observed-transcripts prefix. Each compaction segment within a transcript is now tracked independently — new segments from ongoing sessions get picked up, already-processed segments are skipped.
This commit is contained in:
parent
9d1d690f17
commit
10499a98ea
2 changed files with 121 additions and 63 deletions
|
|
@ -40,25 +40,78 @@ pub fn is_transcript_mined(store: &impl StoreView, path: &str) -> bool {
|
|||
/// Dedup key for a transcript based on its filename (UUID).
|
||||
/// Used by the daemon reconcile loop — no file reads needed.
|
||||
pub fn transcript_filename_key(path: &str) -> String {
|
||||
transcript_key("_mined-transcripts", path)
|
||||
}
|
||||
|
||||
/// Build a namespaced transcript key from a prefix and path.
///
/// The key has the shape `{prefix}#f-{file_stem}`, where the file stem is
/// the filename with its extension stripped (transcripts are named by UUID,
/// so the stem is the dedup identity). If the path has no file stem (e.g.
/// an empty path), the whole path string is used as a fallback.
pub fn transcript_key(prefix: &str, path: &str) -> String {
    let filename = std::path::Path::new(path)
        .file_stem()
        .map(|s| s.to_string_lossy().to_string())
        .unwrap_or_else(|| path.to_string());
    // Removed the stale hard-coded `"_mined-transcripts#f-{}"` leftover from
    // before this function was parameterized — it ignored `prefix` entirely.
    format!("{}#f-{}", prefix, filename)
}
|
||||
|
||||
/// Per-segment key: `{base_key}.{segment_index}`
///
/// Joins the transcript base key and the compaction-segment index with a
/// dot, so each segment can be tracked independently in the store.
pub fn segment_key(base: &str, segment: usize) -> String {
    let mut key = String::with_capacity(base.len() + 8);
    key.push_str(base);
    key.push('.');
    key.push_str(&segment.to_string());
    key
}
|
||||
|
||||
/// Load all keys with a given prefix from the store.
|
||||
pub fn keys_with_prefix(prefix: &str) -> HashSet<String> {
|
||||
use crate::store::AnyView;
|
||||
let Ok(view) = AnyView::load() else { return HashSet::new() };
|
||||
let mut keys = HashSet::new();
|
||||
view.for_each_node(|key, _, _| {
|
||||
if key.starts_with(prefix) {
|
||||
keys.insert(key.to_string());
|
||||
}
|
||||
});
|
||||
keys
|
||||
}
|
||||
|
||||
/// Find unmined segments for a transcript file against a set of known keys.
|
||||
/// Returns segment indices that haven't been processed yet.
|
||||
pub fn unmined_segments(
|
||||
path: &std::path::Path,
|
||||
prefix: &str,
|
||||
known: &HashSet<String>,
|
||||
) -> Vec<(usize, Vec<(usize, String, String, String)>)> {
|
||||
let path_str = path.to_string_lossy();
|
||||
let base = transcript_key(prefix, &path_str);
|
||||
|
||||
let messages = match extract_conversation(&path_str) {
|
||||
Ok(m) => m,
|
||||
Err(_) => return Vec::new(),
|
||||
};
|
||||
let segments = split_on_compaction(messages);
|
||||
|
||||
segments.into_iter()
|
||||
.enumerate()
|
||||
.filter(|(i, _)| !known.contains(&segment_key(&base, *i)))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Mark a segment as processed in the store.
|
||||
pub fn mark_segment(
|
||||
store: &mut Store,
|
||||
path: &str,
|
||||
prefix: &str,
|
||||
segment: usize,
|
||||
provenance: &str,
|
||||
content: &str,
|
||||
) {
|
||||
let base = transcript_key(prefix, path);
|
||||
let key = segment_key(&base, segment);
|
||||
let mut node = new_node(&key, content);
|
||||
node.provenance = provenance.to_string();
|
||||
let _ = store.upsert_node(node);
|
||||
}
|
||||
|
||||
/// Get the set of all mined transcript keys (both content-hash and filename)
|
||||
/// from the store. Load once per daemon tick, check many.
|
||||
pub fn mined_transcript_keys() -> HashSet<String> {
|
||||
use crate::store::AnyView;
|
||||
let Ok(view) = AnyView::load() else { return HashSet::new() };
|
||||
let mut keys = HashSet::new();
|
||||
view.for_each_node(|key, _, _| {
|
||||
if key.starts_with("_mined-transcripts#") {
|
||||
keys.insert(key.to_string());
|
||||
}
|
||||
});
|
||||
keys
|
||||
keys_with_prefix("_mined-transcripts#")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue