forked from kent/consciousness
pub → pub(crate) for SseReader methods (used across child modules). pub → pub(super) for openai::stream_events, tool definitions, store helpers. pub → private for normalize_link and differentiate_hub_with_graph (only used within their own files). Co-Authored-By: Proof of Concept <poc@bcachefs.org>
173 lines
5.7 KiB
Rust
173 lines
5.7 KiB
Rust
// Markdown parsing for memory files
|
|
//
|
|
// Splits markdown files into MemoryUnit structs based on `<!-- mem: ... -->`
|
|
// markers. Each marker starts a new section; content before the first marker
|
|
// becomes the file-level unit. Links and causal edges are extracted from
|
|
// both marker attributes and inline markdown links.
|
|
|
|
use super::NodeType;
|
|
|
|
use regex::Regex;
|
|
|
|
use std::collections::HashMap;
|
|
use std::path::Path;
|
|
use std::sync::OnceLock;
|
|
|
|
/// One parsed section of a markdown memory file.
///
/// A file with no `<!-- mem: ... -->` markers yields a single unit for the
/// whole file; otherwise each marker starts a unit, plus one file-level unit
/// for any content before the first marker.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MemoryUnit {
    /// Graph key: bare filename, or `filename#id` for marker sections.
    pub key: String,
    /// The section's markdown text (markers themselves excluded, trimmed for
    /// marker sections and pre-marker content).
    pub content: String,
    /// Normalized targets from the marker's `links=` attribute.
    pub marker_links: Vec<String>,
    /// Normalized targets of inline markdown links found in `content`.
    pub md_links: Vec<String>,
    /// Normalized targets from the marker's `causes=` attribute (causal edges).
    pub causes: Vec<String>,
    /// Raw value of the marker's `state=` attribute, if present.
    pub state: Option<String>,
    /// Trimmed payload of an inline `<!-- source: ... -->` comment, if present.
    pub source_ref: Option<String>,
}
|
|
|
|
pub(super) fn classify_filename(filename: &str) -> NodeType {
|
|
let bare = filename.strip_suffix(".md").unwrap_or(filename);
|
|
if bare.starts_with("daily-") { NodeType::EpisodicDaily }
|
|
else if bare.starts_with("weekly-") { NodeType::EpisodicWeekly }
|
|
else if bare.starts_with("monthly-") { NodeType::EpisodicMonthly }
|
|
else if bare == "journal" { NodeType::EpisodicSession }
|
|
else { NodeType::Semantic }
|
|
}
|
|
|
|
pub fn parse_units(raw_filename: &str, content: &str) -> Vec<MemoryUnit> {
|
|
let filename = raw_filename.strip_suffix(".md").unwrap_or(raw_filename);
|
|
static MARKER_RE: OnceLock<Regex> = OnceLock::new();
|
|
static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
|
|
static MD_LINK_RE: OnceLock<Regex> = OnceLock::new();
|
|
|
|
let marker_re = MARKER_RE.get_or_init(||
|
|
Regex::new(r"<!--\s*mem:\s*((?:id|links|tags|causes|state)\s*=\s*[^\s].*?)-->").unwrap());
|
|
let source_re = SOURCE_RE.get_or_init(||
|
|
Regex::new(r"<!--\s*source:\s*(.+?)\s*-->").unwrap());
|
|
let md_link_re = MD_LINK_RE.get_or_init(||
|
|
Regex::new(r"\[[^\]]*\]\(([^):]+(?:#[^)]*)?)\)").unwrap());
|
|
|
|
let markers: Vec<_> = marker_re.captures_iter(content)
|
|
.map(|cap| {
|
|
let full_match = cap.get(0).unwrap();
|
|
let attrs_str = &cap[1];
|
|
(full_match.start(), full_match.end(), parse_marker_attrs(attrs_str))
|
|
})
|
|
.collect();
|
|
|
|
let find_source = |text: &str| -> Option<String> {
|
|
source_re.captures(text).map(|c| c[1].trim().to_string())
|
|
};
|
|
|
|
if markers.is_empty() {
|
|
let source_ref = find_source(content);
|
|
let md_links = extract_md_links(content, md_link_re, filename);
|
|
return vec![MemoryUnit {
|
|
key: filename.to_string(),
|
|
content: content.to_string(),
|
|
marker_links: Vec::new(),
|
|
md_links,
|
|
causes: Vec::new(),
|
|
state: None,
|
|
source_ref,
|
|
}];
|
|
}
|
|
|
|
let mut units = Vec::new();
|
|
|
|
let first_start = markers[0].0;
|
|
let pre_content = content[..first_start].trim();
|
|
if !pre_content.is_empty() {
|
|
let source_ref = find_source(pre_content);
|
|
let md_links = extract_md_links(pre_content, md_link_re, filename);
|
|
units.push(MemoryUnit {
|
|
key: filename.to_string(),
|
|
content: pre_content.to_string(),
|
|
marker_links: Vec::new(),
|
|
md_links,
|
|
causes: Vec::new(),
|
|
state: None,
|
|
source_ref,
|
|
});
|
|
}
|
|
|
|
for (i, (_, end, attrs)) in markers.iter().enumerate() {
|
|
let unit_end = if i + 1 < markers.len() {
|
|
markers[i + 1].0
|
|
} else {
|
|
content.len()
|
|
};
|
|
let unit_content = content[*end..unit_end].trim();
|
|
|
|
let id = attrs.get("id").cloned().unwrap_or_default();
|
|
let key = if id.is_empty() {
|
|
format!("{}#unnamed-{}", filename, i)
|
|
} else {
|
|
format!("{}#{}", filename, id)
|
|
};
|
|
|
|
let marker_links = attrs.get("links")
|
|
.map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect())
|
|
.unwrap_or_default();
|
|
|
|
let causes = attrs.get("causes")
|
|
.map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect())
|
|
.unwrap_or_default();
|
|
|
|
let state = attrs.get("state").cloned();
|
|
let source_ref = find_source(unit_content);
|
|
let md_links = extract_md_links(unit_content, md_link_re, filename);
|
|
|
|
units.push(MemoryUnit {
|
|
key,
|
|
content: unit_content.to_string(),
|
|
marker_links,
|
|
md_links,
|
|
causes,
|
|
state,
|
|
source_ref,
|
|
});
|
|
}
|
|
|
|
units
|
|
}
|
|
|
|
fn parse_marker_attrs(attrs_str: &str) -> HashMap<String, String> {
|
|
static ATTR_RE: OnceLock<Regex> = OnceLock::new();
|
|
let attr_re = ATTR_RE.get_or_init(|| Regex::new(r"(\w+)\s*=\s*(\S+)").unwrap());
|
|
let mut attrs = HashMap::new();
|
|
for cap in attr_re.captures_iter(attrs_str) {
|
|
attrs.insert(cap[1].to_string(), cap[2].to_string());
|
|
}
|
|
attrs
|
|
}
|
|
|
|
fn extract_md_links(content: &str, re: &Regex, source_file: &str) -> Vec<String> {
|
|
re.captures_iter(content)
|
|
.map(|cap| normalize_link(&cap[1], source_file))
|
|
.filter(|link| !link.starts_with(source_file) || link.contains('#'))
|
|
.collect()
|
|
}
|
|
|
|
/// Canonicalizes a link target relative to the file it appears in.
///
/// A bare fragment (`#section`) resolves against the source file's key.
/// Otherwise directory components and a trailing `.md` extension are
/// stripped, so every target becomes a bare file key with an optional
/// `#fragment` suffix.
fn normalize_link(target: &str, source_file: &str) -> String {
    let source_key = source_file.strip_suffix(".md").unwrap_or(source_file);

    // A fragment-only target points at a section of the current file.
    if let Some(section) = target.strip_prefix('#') {
        return format!("{}#{}", source_key, section);
    }

    // Split off the fragment (if any) at the first '#'.
    let (path_part, fragment) = match target.split_once('#') {
        Some((path, frag)) => (path, Some(frag)),
        None => (target, None),
    };

    // Reduce the path to its final component; fall back to the raw path when
    // there is no file-name component (e.g. "..").
    let last = Path::new(path_part)
        .file_name()
        .map(|f| f.to_string_lossy().into_owned())
        .unwrap_or_else(|| path_part.to_string());
    let key = last.strip_suffix(".md").unwrap_or(&last);

    match fragment {
        Some(frag) => format!("{}#{}", key, frag),
        None => key.to_string(),
    }
}
|