// Markdown parsing for memory files // // Splits markdown files into MemoryUnit structs based on `` // markers. Each marker starts a new section; content before the first marker // becomes the file-level unit. Links and causal edges are extracted from // both marker attributes and inline markdown links. use super::NodeType; use regex::Regex; use std::collections::HashMap; use std::path::Path; use std::sync::OnceLock; pub struct MemoryUnit { pub key: String, pub content: String, pub marker_links: Vec, pub md_links: Vec, pub causes: Vec, pub state: Option, pub source_ref: Option, } pub(super) fn classify_filename(filename: &str) -> NodeType { let bare = filename.strip_suffix(".md").unwrap_or(filename); if bare.starts_with("daily-") { NodeType::EpisodicDaily } else if bare.starts_with("weekly-") { NodeType::EpisodicWeekly } else if bare.starts_with("monthly-") { NodeType::EpisodicMonthly } else if bare == "journal" { NodeType::EpisodicSession } else { NodeType::Semantic } } pub fn parse_units(raw_filename: &str, content: &str) -> Vec { let filename = raw_filename.strip_suffix(".md").unwrap_or(raw_filename); static MARKER_RE: OnceLock = OnceLock::new(); static SOURCE_RE: OnceLock = OnceLock::new(); static MD_LINK_RE: OnceLock = OnceLock::new(); let marker_re = MARKER_RE.get_or_init(|| Regex::new(r"").unwrap()); let source_re = SOURCE_RE.get_or_init(|| Regex::new(r"").unwrap()); let md_link_re = MD_LINK_RE.get_or_init(|| Regex::new(r"\[[^\]]*\]\(([^):]+(?:#[^)]*)?)\)").unwrap()); let markers: Vec<_> = marker_re.captures_iter(content) .map(|cap| { let full_match = cap.get(0).unwrap(); let attrs_str = &cap[1]; (full_match.start(), full_match.end(), parse_marker_attrs(attrs_str)) }) .collect(); let find_source = |text: &str| -> Option { source_re.captures(text).map(|c| c[1].trim().to_string()) }; if markers.is_empty() { let source_ref = find_source(content); let md_links = extract_md_links(content, md_link_re, filename); return vec![MemoryUnit { key: filename.to_string(), 
content: content.to_string(), marker_links: Vec::new(), md_links, causes: Vec::new(), state: None, source_ref, }]; } let mut units = Vec::new(); let first_start = markers[0].0; let pre_content = content[..first_start].trim(); if !pre_content.is_empty() { let source_ref = find_source(pre_content); let md_links = extract_md_links(pre_content, md_link_re, filename); units.push(MemoryUnit { key: filename.to_string(), content: pre_content.to_string(), marker_links: Vec::new(), md_links, causes: Vec::new(), state: None, source_ref, }); } for (i, (_, end, attrs)) in markers.iter().enumerate() { let unit_end = if i + 1 < markers.len() { markers[i + 1].0 } else { content.len() }; let unit_content = content[*end..unit_end].trim(); let id = attrs.get("id").cloned().unwrap_or_default(); let key = if id.is_empty() { format!("{}#unnamed-{}", filename, i) } else { format!("{}#{}", filename, id) }; let marker_links = attrs.get("links") .map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect()) .unwrap_or_default(); let causes = attrs.get("causes") .map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect()) .unwrap_or_default(); let state = attrs.get("state").cloned(); let source_ref = find_source(unit_content); let md_links = extract_md_links(unit_content, md_link_re, filename); units.push(MemoryUnit { key, content: unit_content.to_string(), marker_links, md_links, causes, state, source_ref, }); } units } fn parse_marker_attrs(attrs_str: &str) -> HashMap { static ATTR_RE: OnceLock = OnceLock::new(); let attr_re = ATTR_RE.get_or_init(|| Regex::new(r"(\w+)\s*=\s*(\S+)").unwrap()); let mut attrs = HashMap::new(); for cap in attr_re.captures_iter(attrs_str) { attrs.insert(cap[1].to_string(), cap[2].to_string()); } attrs } fn extract_md_links(content: &str, re: &Regex, source_file: &str) -> Vec { re.captures_iter(content) .map(|cap| normalize_link(&cap[1], source_file)) .filter(|link| !link.starts_with(source_file) || link.contains('#')) .collect() } 
/// Reduces a link target to the `name` / `name#fragment` form used as keys.
///
/// * A target starting with `#` is a same-file anchor: it is prefixed with
///   `source_file` (minus a trailing `.md`, if any).
/// * Otherwise everything before the first `#` is treated as a path; only
///   its final component is kept and a trailing `.md` is dropped. The
///   fragment, if any, is appended unchanged (including the `#`).
fn normalize_link(target: &str, source_file: &str) -> String {
    if target.starts_with('#') {
        let own_name = source_file.strip_suffix(".md").unwrap_or(source_file);
        return format!("{}{}", own_name, target);
    }

    // `fragment` is either empty or starts with the '#' itself.
    let (path_part, fragment) = match target.find('#') {
        Some(pos) => target.split_at(pos),
        None => (target, ""),
    };

    // Paths without a final component (e.g. "" or "..") are kept verbatim.
    let basename = match Path::new(path_part).file_name() {
        Some(name) => name.to_string_lossy().into_owned(),
        None => path_part.to_owned(),
    };
    let stem = basename.strip_suffix(".md").unwrap_or(&basename);

    let mut normalized = String::with_capacity(stem.len() + fragment.len());
    normalized.push_str(stem);
    normalized.push_str(fragment);
    normalized
}