consciousness/poc-memory/src/enrich.rs

// Journal enrichment and experience mining
//
// Two modes of processing conversation transcripts:
//   journal_enrich  — enrich a specific journal entry with source location and links
//   experience_mine — retroactively find experiential moments not yet journaled
//
// Both extract conversation from JSONL transcripts, build prompts, call Sonnet,
// and apply results to the store.

use crate::llm::{call_sonnet, parse_json_response, semantic_keys};
use crate::neuro;
use crate::store::{self, Store, new_node, new_relation};

use std::collections::hash_map::DefaultHasher;
use std::collections::HashSet;
use std::fs;
use std::hash::{Hash, Hasher};

use crate::store::StoreView;

/// Convert a timestamp string such as "2026-03-05T19:56" into unix epoch seconds.
///
/// The string is tried against each supported layout in order (`T` or space
/// separator, with or without seconds). The parsed naive date-time is
/// interpreted in the local time zone, taking `earliest()` of the mapping.
/// Returns `None` if no layout yields a usable instant.
fn parse_timestamp_to_epoch(ts: &str) -> Option<i64> {
    use chrono::{Local, NaiveDateTime, TimeZone};

    // Accepted layouts, tried first to last.
    const LAYOUTS: [&str; 4] = [
        "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%dT%H:%M",
        "%Y-%m-%d %H:%M:%S",
        "%Y-%m-%d %H:%M",
    ];

    LAYOUTS.iter().find_map(|layout| {
        let naive = NaiveDateTime::parse_from_str(ts, layout).ok()?;
        Local
            .from_local_datetime(&naive)
            .earliest()
            .map(|local| local.timestamp())
    })
}

/// Build the content-hash dedup key for a transcript file.
///
/// The whole file is read and hashed with `DefaultHasher`; the key is the
/// `_mined-transcripts#h-` prefix followed by the hash as 16 lowercase hex
/// digits. This is the same key `experience_mine` uses to mark a transcript
/// as mined.
///
/// # Errors
/// Returns `"read <path>: <io error>"` if the file cannot be read.
pub fn transcript_dedup_key(path: &str) -> Result<String, String> {
    let contents = match fs::read(path) {
        Ok(data) => data,
        Err(err) => return Err(format!("read {}: {}", path, err)),
    };

    let mut state = DefaultHasher::new();
    contents.hash(&mut state);
    let digest = state.finish();

    Ok(format!("_mined-transcripts#h-{:016x}", digest))
}

/// Report whether the transcript at `path` has already been mined, i.e.
/// whether its content-hash dedup key has content in `store`.
///
/// If the dedup key cannot be computed (the file is unreadable), the
/// transcript is reported as not mined.
pub fn is_transcript_mined(store: &impl StoreView, path: &str) -> bool {
    transcript_dedup_key(path)
        .map(|key| store.node_content(&key).is_some())
        .unwrap_or(false)
}

/// Build the filename-based dedup key for a transcript.
///
/// The key is `_mined-transcripts#f-` followed by the path's file stem (the
/// file name without its last extension; a UUID for session transcripts).
/// When the path has no file stem, the whole path string is used instead.
/// Used by the daemon reconcile loop — no file reads needed.
pub fn transcript_filename_key(path: &str) -> String {
    let stem = match std::path::Path::new(path).file_stem() {
        Some(os_stem) => os_stem.to_string_lossy().into_owned(),
        None => path.to_owned(),
    };
    format!("_mined-transcripts#f-{}", stem)
}

/// Collect every store key that starts with `_mined-transcripts#` (both the
/// content-hash and the filename flavours).
///
/// Intended to be loaded once per daemon tick and then checked many times.
/// If the store view fails to load, an empty set is returned.
pub fn mined_transcript_keys() -> HashSet<String> {
    use crate::store::AnyView;

    let mut mined = HashSet::new();
    if let Ok(view) = AnyView::load() {
        view.for_each_node(|key, _, _| {
            if key.starts_with("_mined-transcripts#") {
                mined.insert(key.to_string());
            }
        });
    }
    mined
}

/// Check whether a transcript is already mined, using a pre-loaded key set.
///
/// Only the filename-based key is looked up, so no file is read. Sessions
/// mined before the filename key existed are not found here; they pass
/// through and short-circuit in `experience_mine` via the content-hash check —
/// a one-time cost on first restart after that change.
pub fn is_transcript_mined_with_keys(mined: &HashSet<String>, path: &str) -> bool {
    let wanted = transcript_filename_key(path);
    mined.contains(&wanted)
}

/// Pull the user/assistant messages out of a JSONL transcript.
///
/// Each returned tuple is `(line_number, role, text, timestamp)`:
/// - `line_number` is the 1-based line of the record in the file,
/// - `role` is the record's `"type"` (`"user"` or `"assistant"`),
/// - `text` is the trimmed message text,
/// - `timestamp` is the record's `"timestamp"` string, or empty if absent.
///
/// Lines that are not valid JSON, records of any other type, records whose
/// content is neither a string nor an array, and records whose text is empty
/// after trimming are all skipped.
///
/// # Errors
/// Returns `"read <path>: <io error>"` if the file cannot be read.
pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
    let raw = fs::read_to_string(jsonl_path)
        .map_err(|e| format!("read {}: {}", jsonl_path, e))?;

    let mut extracted = Vec::new();
    for (idx, line) in raw.lines().enumerate() {
        let Ok(record) = serde_json::from_str::<serde_json::Value>(line) else {
            continue;
        };

        let role = record.get("type").and_then(|v| v.as_str()).unwrap_or("");
        if !matches!(role, "user" | "assistant") {
            continue;
        }

        let timestamp = record.get("timestamp")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        // The content lives under "message" when present, else on the record itself.
        let payload = record.get("message").unwrap_or(&record);

        let body = match payload.get("content") {
            Some(serde_json::Value::String(s)) => s.clone(),
            Some(serde_json::Value::Array(parts)) => {
                let mut pieces: Vec<String> = Vec::new();
                for part in parts {
                    // Keep "text" blocks and bare string elements; everything else
                    // (tool_use, tool_result, thinking, ...) contributes nothing.
                    let piece = if part.get("type").and_then(|v| v.as_str()) == Some("text") {
                        part.get("text").and_then(|v| v.as_str())
                    } else {
                        part.as_str()
                    };
                    if let Some(piece) = piece {
                        pieces.push(piece.to_string());
                    }
                }
                pieces.join("\n")
            }
            _ => continue,
        };

        let body = body.trim();
        if body.is_empty() {
            continue;
        }

        extracted.push((idx + 1, role.to_string(), body.to_string(), timestamp));
    }

    Ok(extracted)
}

pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";

/// Cut a message list into segments at compaction boundaries.
///
/// A boundary is a `"user"` message whose text starts with
/// [`COMPACTION_MARKER`]. That message opens the next segment, so each segment
/// is one continuous stretch of conversation. No empty segments are produced:
/// a boundary message at the very start does not emit a segment before it,
/// and empty input yields an empty result.
pub fn split_on_compaction(messages: Vec<(usize, String, String, String)>) -> Vec<Vec<(usize, String, String, String)>> {
    let mut segments: Vec<Vec<(usize, String, String, String)>> = Vec::new();
    let mut open: Vec<(usize, String, String, String)> = Vec::new();

    for message in messages {
        let is_boundary = message.1 == "user" && message.2.starts_with(COMPACTION_MARKER);
        if is_boundary && !open.is_empty() {
            // Close the running segment; the boundary message starts the next one.
            segments.push(std::mem::take(&mut open));
        }
        open.push(message);
    }

    if !open.is_empty() {
        segments.push(open);
    }
    segments
}

/// Render conversation messages as prompt text, one message per paragraph.
///
/// Each message becomes `L<line> [<role>]: <text>`, or
/// `L<line> [<role>] <timestamp>: <text>` when it has a timestamp. Messages
/// are joined with a blank line. Message text is passed through
/// `crate::util::truncate` with a limit of 1800 and the suffix
/// `...[truncated]`. Timestamps are cut to at most their first 19 bytes
/// (the length of `YYYY-MM-DDTHH:MM:SS`).
fn format_conversation(messages: &[(usize, String, String, String)]) -> String {
    messages.iter()
        .map(|(line, role, text, ts)| {
            let text = crate::util::truncate(text, 1800, "...[truncated]");
            if ts.is_empty() {
                format!("L{} [{}]: {}", line, role, text)
            } else {
                // Timestamps come straight from the transcript, so they may not
                // be plain ASCII. Back off to the nearest char boundary at or
                // below 19 bytes; slicing mid-character would panic.
                let mut end = ts.len().min(19);
                while !ts.is_char_boundary(end) {
                    end -= 1;
                }
                format!("L{} [{}] {}: {}", line, role, &ts[..end], text)
            }
        })
        .collect::<Vec<_>>()
        .join("\n\n")
}

fn build_journal_prompt(
    entry_text: &str,
    conversation: &str,
    keys: &[String],
    grep_line: usize,
) -> Result<String, String> {
    let keys_text: String = keys.iter()
        .map(|k| format!("  - {}", k))
        .collect::<Vec<_>>()
        .join("\n");

    neuro::load_prompt("journal-enrich", &[
        ("{{GREP_LINE}}", &grep_line.to_string()),
        ("{{ENTRY_TEXT}}", entry_text),
        ("{{KEYS}}", &keys_text),
        ("{{CONVERSATION}}", conversation),
    ])
}

/// Enrich a journal entry with conversation context and link proposals.
///
/// Extracts the conversation from `jsonl_path`, builds the journal-enrich
/// prompt (with the store's semantic keys and `grep_line`), sends it to the
/// model, and reports the source range, link count and missed-insight count
/// from the JSON response. Each proposed link is then added to the store as a
/// `Link` relation of strength 0.5 from the journal node matching
/// `entry_text` to the (refined) target. Progress is printed to stdout.
///
/// Links are skipped when the target is empty, is a `NOTE:` (printed
/// instead), does not resolve in the graph, or when no journal node matches.
///
/// # Errors
/// Propagates failures from conversation extraction, prompt loading, the
/// model call, response parsing, and saving the store.
pub fn journal_enrich(
    store: &mut Store,
    jsonl_path: &str,
    entry_text: &str,
    grep_line: usize,
) -> Result<(), String> {
    println!("Extracting conversation from {}...", jsonl_path);
    let messages = extract_conversation(jsonl_path)?;
    let conversation = format_conversation(&messages);
    println!("  {} messages, {} chars", messages.len(), conversation.len());

    let keys = semantic_keys(store);
    println!("  {} semantic keys", keys.len());

    let prompt = build_journal_prompt(entry_text, &conversation, &keys, grep_line)?;
    println!("  Prompt: {} chars (~{} tokens)", prompt.len(), prompt.len() / 4);

    println!("  Calling Sonnet...");
    let response = call_sonnet("enrich", &prompt)?;
    let result = parse_json_response(&response)?;

    // Summarise what the model returned.
    let first_line = result.get("source_start").and_then(|v| v.as_u64()).unwrap_or(0);
    let last_line = result.get("source_end").and_then(|v| v.as_u64()).unwrap_or(0);
    let links = result.get("links").and_then(|v| v.as_array());
    let insights = result.get("missed_insights").and_then(|v| v.as_array());

    println!("  Source: L{}-L{}", first_line, last_line);
    println!("  Links: {}", links.map_or(0, |l| l.len()));
    println!("  Missed insights: {}", insights.map_or(0, |l| l.len()));

    // Turn each proposed link into a relation in the store.
    for link in links.into_iter().flatten() {
        let target = link.get("target").and_then(|v| v.as_str()).unwrap_or("");
        let reason = link.get("reason").and_then(|v| v.as_str()).unwrap_or("");

        // "NOTE:" targets are commentary, not links: print and move on.
        if let Some(note) = target.strip_prefix("NOTE:") {
            println!("  NOTE: {} — {}", note, reason);
            continue;
        }
        if target.is_empty() {
            continue;
        }

        let Ok(resolved) = store.resolve_key(target) else {
            println!("  SKIP {} (not in graph)", target);
            continue;
        };
        let Some(source_key) = store.find_journal_node(entry_text) else {
            println!("  SKIP {} (no matching journal node)", target);
            continue;
        };

        // Narrow the target down to its best-matching section.
        let source_content = match store.nodes.get(&source_key) {
            Some(node) => node.content.as_str(),
            None => "",
        };
        let resolved = neuro::refine_target(store, source_content, &resolved);

        let Some(source_uuid) = store.nodes.get(&source_key).map(|n| n.uuid) else {
            continue;
        };
        let Some(target_uuid) = store.nodes.get(&resolved).map(|n| n.uuid) else {
            continue;
        };

        let relation = new_relation(
            source_uuid, target_uuid,
            store::RelationType::Link,
            0.5,
            &source_key, &resolved,
        );
        if store.add_relation(relation).is_ok() {
            println!("  LINK {} → {} ({})", source_key, resolved, reason);
        }
    }

    store.save()?;
    Ok(())
}

/// Mine a conversation transcript for experiential moments not yet journaled.
///
/// If `segment` is `Some(idx)`, only the `idx`-th compaction segment of the
/// file (as produced by `split_on_compaction`) is processed; otherwise every
/// extracted message is used.
///
/// The prompt is built from the configured core identity nodes, the ten most
/// recent episodic entries, the store's semantic keys and the formatted
/// conversation. Each entry in the model's JSON array response that has
/// non-empty `content` and a key not already in the store is written as an
/// `EpisodicSession` node. Afterwards the transcript (or segment) is recorded
/// as mined under its filename key, and — for unsegmented runs — under its
/// content-hash key as well. Progress is printed to stdout.
///
/// Returns the number of entries handed to the store (the result of each
/// individual upsert is not checked). Returns `Ok(0)` without calling the
/// model when the transcript's content hash is already recorded or when the
/// estimated prompt size exceeds 150k tokens.
///
/// # Errors
/// Fails if the transcript cannot be read, `segment` is out of range, the
/// prompt cannot be loaded, the model call or response parsing fails, the
/// response is not a JSON array, or the store cannot be saved.
pub fn experience_mine(
    store: &mut Store,
    jsonl_path: &str,
    segment: Option<usize>,
) -> Result<usize, String> {
    println!("Experience mining: {}", jsonl_path);

    // Transcript-level dedup: hash the file content and check if already mined
    let transcript_bytes = fs::read(jsonl_path)
        .map_err(|e| format!("reading transcript: {}", e))?;
    let mut hasher = DefaultHasher::new();
    transcript_bytes.hash(&mut hasher);
    let hash = hasher.finish();
    let dedup_key = format!("_mined-transcripts#h-{:016x}", hash);

    if store.nodes.contains_key(&dedup_key) {
        // Backfill filename key if missing (transcripts mined before this key existed)
        let fname_key = transcript_filename_key(jsonl_path);
        if !store.nodes.contains_key(&fname_key) {
            let mut node = new_node(&fname_key, &format!("Backfilled from {}", dedup_key));
            node.provenance = store::Provenance::AgentExperienceMine;
            let _ = store.upsert_node(node);
            store.save()?;
        }
        // Print the hash itself rather than slicing the key at a fixed byte
        // offset: an offset that does not match the key prefix length cuts
        // off leading hash digits.
        println!("  Already mined this transcript ({:016x}), skipping.", hash);
        return Ok(0);
    }

    let all_messages = extract_conversation(jsonl_path)?;

    // If segment is specified, extract just that segment; otherwise process all messages
    let messages = match segment {
        Some(idx) => {
            let segments = split_on_compaction(all_messages);
            segments.into_iter().nth(idx)
                .ok_or_else(|| format!("segment {} out of range", idx))?
        }
        None => all_messages,
    };

    let conversation = format_conversation(&messages);
    println!("  {} messages, {} chars", messages.len(), conversation.len());

    // Load core identity nodes for context
    let cfg = crate::config::get();
    let identity: String = cfg.core_nodes.iter()
        .filter_map(|k| store.nodes.get(k).map(|n| n.content.as_str()))
        .collect::<Vec<_>>()
        .join("\n\n");

    // Get recent episodic entries to avoid duplication
    let mut journal: Vec<_> = store.nodes.values()
        .filter(|node| matches!(node.node_type, store::NodeType::EpisodicSession))
        .collect();
    journal.sort_by_key(|n| n.timestamp);
    // Newest ten, newest first.
    let recent: String = journal.iter().rev().take(10)
        .map(|n| format!("---\n{}\n", n.content))
        .collect();

    let keys = semantic_keys(store);
    let keys_text: String = keys.iter()
        .map(|k| format!("  - {}", k))
        .collect::<Vec<_>>()
        .join("\n");

    let prompt = neuro::load_prompt("experience", &[
        ("{{IDENTITY}}", &identity),
        ("{{RECENT_JOURNAL}}", &recent),
        ("{{KEYS}}", &keys_text),
        ("{{CONVERSATION}}", &conversation),
    ])?;
    // Rough estimate: about four characters per token.
    let est_tokens = prompt.len() / 4;
    println!("  Prompt: {} chars (~{} tokens)", prompt.len(), est_tokens);

    // Nothing is recorded as mined on this path, so a later call will try again.
    if est_tokens > 150_000 {
        println!("  Skipping: prompt too large ({} tokens > 150k limit)", est_tokens);
        return Ok(0);
    }

    println!("  Calling Sonnet...");
    let response = call_sonnet("experience-mine", &prompt)?;

    let entries = parse_json_response(&response)?;
    let entries = match entries.as_array() {
        Some(arr) => arr.clone(),
        None => return Err("expected JSON array".to_string()),
    };

    if entries.is_empty() {
        println!("  No missed experiences found.");
    } else {
        println!("  Found {} experiential moments:", entries.len());
    }
    let mut count = 0;
    for entry in &entries {
        let ts = entry.get("timestamp").and_then(|v| v.as_str()).unwrap_or("");
        let content = entry.get("content").and_then(|v| v.as_str()).unwrap_or("");
        if content.is_empty() { continue; }

        // Format with timestamp header
        let full_content = if ts.is_empty() {
            content.to_string()
        } else {
            format!("## {}\n\n{}", ts, content)
        };

        // Generate key from the timestamp (when present) plus a slug made of
        // the first 50 alphanumeric/space characters of the content
        let key_slug: String = content.chars()
            .filter(|c| c.is_alphanumeric() || *c == ' ')
            .take(50)
            .collect::<String>()
            .trim()
            .to_lowercase()
            .replace(' ', "-");
        let key = if ts.is_empty() {
            format!("journal#j-mined-{}", key_slug)
        } else {
            format!("journal#j-{}-{}", ts.to_lowercase().replace(':', "-"), key_slug)
        };

        // Check for duplicate
        if store.nodes.contains_key(&key) {
            println!("  SKIP {} (duplicate)", key);
            continue;
        }

        // Write to store — use event timestamp, not mining time
        let mut node = new_node(&key, &full_content);
        node.node_type = store::NodeType::EpisodicSession;
        node.provenance = store::Provenance::AgentExperienceMine;
        if !ts.is_empty() {
            if let Some(epoch) = parse_timestamp_to_epoch(ts) {
                node.created_at = epoch;
            }
        }
        let _ = store.upsert_node(node);
        count += 1;

        let preview = crate::util::truncate(content, 77, "...");
        println!("  + [{}] {}", ts, preview);
    }

    // Record this transcript/segment as mined (even if count == 0, to prevent re-runs)
    let fname_key = match segment {
        Some(idx) => format!("{}.{}", transcript_filename_key(jsonl_path), idx),
        None => transcript_filename_key(jsonl_path),
    };
    let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count);
    let mut fname_node = new_node(&fname_key, &dedup_content);
    fname_node.provenance = store::Provenance::AgentExperienceMine;
    let _ = store.upsert_node(fname_node);

    // For unsegmented calls, also write the content-hash key for backwards compat
    if segment.is_none() {
        let mut dedup_node = new_node(&dedup_key, &dedup_content);
        dedup_node.provenance = store::Provenance::AgentExperienceMine;
        let _ = store.upsert_node(dedup_node);
    }

    if count > 0 {
        println!("  Saved {} new journal entries.", count);
    }
    store.save()?;
    println!("Done: {} new entries mined.", count);
    Ok(count)
}