remove dead transcript mining pipeline

Delete enrich.rs (conversation extraction), select_conversation_fragments,
mark_observation_done, format_segment, and the {{conversations}} placeholder.
Transcript processing is handled by observe/journal agents now.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-02 21:42:03 -04:00
parent 74fce5cf41
commit 72d967edbf
5 changed files with 2 additions and 198 deletions

View file

@ -67,7 +67,7 @@ pub use hippocampus::query::parser as query_parser;
pub use subconscious as agents; pub use subconscious as agents;
pub use subconscious::{ pub use subconscious::{
llm, audit, consolidate, knowledge, llm, audit, consolidate, knowledge,
enrich, digest, daemon, digest, daemon,
}; };
// Backward compat: memory_search moved from hippocampus to subconscious::hook // Backward compat: memory_search moved from hippocampus to subconscious::hook
pub use subconscious::hook as memory_search; pub use subconscious::hook as memory_search;

View file

@ -333,18 +333,6 @@ fn resolve(
Some(Resolved { text, keys: result_keys }) Some(Resolved { text, keys: result_keys })
} }
"conversations" => {
let fragments = super::knowledge::select_conversation_fragments(count);
let fragment_ids: Vec<String> = fragments.iter()
.map(|(id, _)| id.clone())
.collect();
let text = fragments.iter()
.map(|(id, text)| format!("### Session {}\n\n{}", id, text))
.collect::<Vec<_>>()
.join("\n\n---\n\n");
Some(Resolved { text, keys: fragment_ids })
}
"siblings" | "neighborhood" => { "siblings" | "neighborhood" => {
let mut out = String::new(); let mut out = String::new();
let mut all_keys: Vec<String> = Vec::new(); let mut all_keys: Vec<String> = Vec::new();

View file

@ -1,40 +0,0 @@
// Conversation extraction from JSONL transcripts
//
// extract_conversation — parse JSONL transcript to messages
// split_on_compaction — split messages at compaction boundaries
/// Extract conversation messages from a JSONL transcript file.
/// Returns (line_number, role, text, timestamp) tuples.
pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
    // Delegate the JSONL parsing to the shared transcript parser, then
    // flatten each message struct into a plain tuple for the callers.
    let parsed = super::transcript::parse_transcript(std::path::Path::new(jsonl_path))?;
    let mut tuples = Vec::with_capacity(parsed.len());
    for m in parsed {
        tuples.push((m.line, m.role, m.text, m.timestamp));
    }
    Ok(tuples)
}
/// Sentinel text the client prepends to the first user message of a session
/// that was resumed after context compaction.
pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
/// Split extracted messages into segments at compaction boundaries.
/// Each segment represents one continuous conversation before context was compacted.
pub fn split_on_compaction(messages: Vec<(usize, String, String, String)>) -> Vec<Vec<(usize, String, String, String)>> {
    let mut segments: Vec<Vec<(usize, String, String, String)>> = Vec::new();
    let mut current: Vec<(usize, String, String, String)> = Vec::new();
    for msg in messages {
        // A user message starting with the marker opens a new segment;
        // the marker message itself belongs to the segment it opens.
        let is_boundary = msg.1 == "user" && msg.2.starts_with(COMPACTION_MARKER);
        if is_boundary && !current.is_empty() {
            segments.push(std::mem::take(&mut current));
        }
        current.push(msg);
    }
    // Flush the trailing segment (empty input yields no segments).
    if !current.is_empty() {
        segments.push(current);
    }
    segments
}

View file

@ -104,11 +104,6 @@ pub fn run_and_apply_excluded(
) -> Result<(), String> { ) -> Result<(), String> {
let result = run_one_agent_excluded(store, agent_name, batch_size, llm_tag, log, exclude)?; let result = run_one_agent_excluded(store, agent_name, batch_size, llm_tag, log, exclude)?;
// Mark conversation segments as mined after successful processing
if agent_name == "observation" {
mark_observation_done(&result.node_keys);
}
Ok(()) Ok(())
} }
@ -397,142 +392,4 @@ fn run_one_agent_inner(
} }
// --------------------------------------------------------------------------- // ---------------------------------------------------------------------------
// Conversation fragment selection // Conversation fragment selection removed — observe/journal agents handle transcripts.
// ---------------------------------------------------------------------------
/// Select conversation fragments (per-segment) for the observation extractor.
/// Uses the transcript-progress.capnp log for dedup — no stub nodes.
/// Does NOT pre-mark segments; caller must call mark_observation_done() after success.
///
/// Returns up to `n` `(fragment_id, text)` pairs. Fragment ids are either
/// `"{session}.{seg}"` for whole segments or `"{session}.{seg}.{chunk}"`
/// for oversized segments split into overlapping chunks.
/// NOTE(review): mark_observation_done() parses ids with a single
/// rsplit_once('.'), which misreads the three-part chunked form — verify
/// the two functions agree on the id format.
pub fn select_conversation_fragments(n: usize) -> Vec<(String, String)> {
    let projects = crate::config::get().projects_dir.clone();
    if !projects.exists() { return Vec::new(); }
    // Store provides the mined-segment log; no store means nothing to dedup against.
    let store = match crate::store::Store::load() {
        Ok(s) => s,
        Err(_) => return Vec::new(),
    };
    // Scan exactly one directory level under projects_dir for transcripts.
    // Files under 50 KB are skipped — presumably too small to mine; confirm.
    let mut jsonl_files: Vec<PathBuf> = Vec::new();
    if let Ok(dirs) = fs::read_dir(&projects) {
        for dir in dirs.filter_map(|e| e.ok()) {
            if !dir.path().is_dir() { continue; }
            if let Ok(files) = fs::read_dir(dir.path()) {
                for f in files.filter_map(|e| e.ok()) {
                    let p = f.path();
                    // let-chains: requires a recent Rust edition/toolchain.
                    if p.extension().map(|x| x == "jsonl").unwrap_or(false)
                        && let Ok(meta) = p.metadata()
                        && meta.len() > 50_000 {
                        jsonl_files.push(p);
                    }
                }
            }
        }
    }
    // Collect unmined segments across all transcripts
    let mut candidates: Vec<(String, String)> = Vec::new();
    for path in &jsonl_files {
        let path_str = path.to_string_lossy();
        // Unparseable transcripts are skipped silently (best-effort mining).
        let messages = match super::enrich::extract_conversation(&path_str) {
            Ok(m) => m,
            Err(_) => continue,
        };
        // Session id is the transcript's file stem (e.g. a session UUID).
        let session_id = path.file_stem()
            .map(|s| s.to_string_lossy().to_string())
            .unwrap_or_else(|| "unknown".into());
        let segments = super::enrich::split_on_compaction(messages);
        for (seg_idx, segment) in segments.into_iter().enumerate() {
            // Dedup: skip segments already processed by the observation agent.
            if store.is_segment_mined(&session_id, seg_idx as u32, "observation") {
                continue;
            }
            // Skip segments with too few assistant messages (rate limits, errors)
            let assistant_msgs = segment.iter()
                .filter(|(_, role, _, _)| role == "assistant")
                .count();
            if assistant_msgs < 2 {
                continue;
            }
            // Skip segments that are just rate limit errors
            let has_rate_limit = segment.iter().any(|(_, _, text, _)|
                text.contains("hit your limit") || text.contains("rate limit"));
            if has_rate_limit && assistant_msgs < 3 {
                continue;
            }
            // Drop segments whose formatted text is too short to be useful.
            let text = format_segment(&segment);
            if text.len() < 500 {
                continue;
            }
            // Chunk sizes are in bytes of formatted text, not tokens.
            const CHUNK_SIZE: usize = 50_000;
            const OVERLAP: usize = 10_000;
            if text.len() <= CHUNK_SIZE {
                let id = format!("{}.{}", session_id, seg_idx);
                candidates.push((id, text));
            } else {
                // Split on line boundaries with overlap
                let lines: Vec<&str> = text.lines().collect();
                let mut start_line = 0;
                let mut chunk_idx = 0;
                while start_line < lines.len() {
                    // Grow the chunk line-by-line until it reaches CHUNK_SIZE
                    // (+1 per line for the newline removed by lines()).
                    let mut end_line = start_line;
                    let mut size = 0;
                    while end_line < lines.len() && size < CHUNK_SIZE {
                        size += lines[end_line].len() + 1;
                        end_line += 1;
                    }
                    let chunk: String = lines[start_line..end_line].join("\n");
                    let id = format!("{}.{}.{}", session_id, seg_idx, chunk_idx);
                    candidates.push((id, chunk));
                    if end_line >= lines.len() { break; }
                    // Back up by overlap amount for next chunk
                    let mut overlap_size = 0;
                    let mut overlap_start = end_line;
                    while overlap_start > start_line && overlap_size < OVERLAP {
                        overlap_start -= 1;
                        overlap_size += lines[overlap_start].len() + 1;
                    }
                    start_line = overlap_start;
                    chunk_idx += 1;
                }
            }
        }
        // The per-transcript check may briefly overshoot n; truncate below fixes it.
        if candidates.len() >= n { break; }
    }
    candidates.truncate(n);
    candidates
}
/// Mark observation segments as successfully mined (call AFTER the agent succeeds).
pub fn mark_observation_done(fragment_ids: &[String]) {
let mut store = match crate::store::Store::load() {
Ok(s) => s,
Err(_) => return,
};
for id in fragment_ids {
if let Some((session_id, seg_str)) = id.rsplit_once('.')
&& let Ok(seg) = seg_str.parse::<u32>() {
let _ = store.mark_segment_mined(session_id, seg, "observation");
}
}
}
/// Format a segment's messages into readable text for the observation agent.
fn format_segment(messages: &[(usize, String, String, String)]) -> String {
let cfg = crate::config::get();
let mut fragments = Vec::new();
for (_, role, text, ts) in messages {
let min_len = if role == "user" { 5 } else { 10 };
if text.len() <= min_len { continue; }
let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name };
if ts.is_empty() {
fragments.push(format!("**{}:** {}", name, text));
} else {
fragments.push(format!("**{}** {}: {}", name, &ts[..ts.len().min(19)], text));
}
}
fragments.join("\n\n")
}

View file

@ -26,6 +26,5 @@ pub mod defs;
pub mod audit; pub mod audit;
pub mod consolidate; pub mod consolidate;
pub mod knowledge; pub mod knowledge;
pub mod enrich;
pub mod digest; pub mod digest;
pub mod daemon; pub mod daemon;