forked from kent/consciousness
pub → pub(crate) for SseReader methods (used across child modules). pub → pub(super) for openai::stream_events, tool definitions, store helpers. pub → private for normalize_link and differentiate_hub_with_graph (only used within their own files). Co-Authored-By: Proof of Concept <poc@bcachefs.org>
173 lines
5.7 KiB
Rust
173 lines
5.7 KiB
Rust
// Markdown parsing for memory files
|
|
//
|
|
// Splits markdown files into MemoryUnit structs based on `<!-- mem: ... -->`
|
|
// markers. Each marker starts a new section; content before the first marker
|
|
// becomes the file-level unit. Links and causal edges are extracted from
|
|
// both marker attributes and inline markdown links.
|
|
|
|
use super::NodeType;
|
|
|
|
use regex::Regex;
|
|
|
|
use std::collections::HashMap;
|
|
use std::path::Path;
|
|
use std::sync::OnceLock;
|
|
|
|
/// One parsed section of a markdown memory file.
///
/// A file with no `<!-- mem: ... -->` markers yields a single unit for the
/// whole file; otherwise each marker starts a unit, plus one file-level unit
/// for any content before the first marker.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MemoryUnit {
    /// Graph key: bare filename, or `filename#id` for marker sections.
    pub key: String,
    /// The section's markdown text (markers themselves excluded, trimmed for
    /// marker sections and pre-marker content).
    pub content: String,
    /// Normalized targets from the marker's `links=` attribute.
    pub marker_links: Vec<String>,
    /// Normalized targets of inline markdown links found in `content`.
    pub md_links: Vec<String>,
    /// Normalized targets from the marker's `causes=` attribute (causal edges).
    pub causes: Vec<String>,
    /// Raw value of the marker's `state=` attribute, if present.
    pub state: Option<String>,
    /// Trimmed payload of an inline `<!-- source: ... -->` comment, if present.
    pub source_ref: Option<String>,
}
|
|
|
|
pub(super) fn classify_filename(filename: &str) -> NodeType {
|
|
let bare = filename.strip_suffix(".md").unwrap_or(filename);
|
|
if bare.starts_with("daily-") { NodeType::EpisodicDaily }
|
|
else if bare.starts_with("weekly-") { NodeType::EpisodicWeekly }
|
|
else if bare.starts_with("monthly-") { NodeType::EpisodicMonthly }
|
|
else if bare == "journal" { NodeType::EpisodicSession }
|
|
else { NodeType::Semantic }
|
|
}
|
|
|
|
pub fn parse_units(raw_filename: &str, content: &str) -> Vec<MemoryUnit> {
|
|
let filename = raw_filename.strip_suffix(".md").unwrap_or(raw_filename);
|
|
static MARKER_RE: OnceLock<Regex> = OnceLock::new();
|
|
static SOURCE_RE: OnceLock<Regex> = OnceLock::new();
|
|
static MD_LINK_RE: OnceLock<Regex> = OnceLock::new();
|
|
|
|
let marker_re = MARKER_RE.get_or_init(||
|
|
Regex::new(r"<!--\s*mem:\s*((?:id|links|tags|causes|state)\s*=\s*[^\s].*?)-->").unwrap());
|
|
let source_re = SOURCE_RE.get_or_init(||
|
|
Regex::new(r"<!--\s*source:\s*(.+?)\s*-->").unwrap());
|
|
let md_link_re = MD_LINK_RE.get_or_init(||
|
|
Regex::new(r"\[[^\]]*\]\(([^):]+(?:#[^)]*)?)\)").unwrap());
|
|
|
|
let markers: Vec<_> = marker_re.captures_iter(content)
|
|
.map(|cap| {
|
|
let full_match = cap.get(0).unwrap();
|
|
let attrs_str = &cap[1];
|
|
(full_match.start(), full_match.end(), parse_marker_attrs(attrs_str))
|
|
})
|
|
.collect();
|
|
|
|
let find_source = |text: &str| -> Option<String> {
|
|
source_re.captures(text).map(|c| c[1].trim().to_string())
|
|
};
|
|
|
|
if markers.is_empty() {
|
|
let source_ref = find_source(content);
|
|
let md_links = extract_md_links(content, md_link_re, filename);
|
|
return vec![MemoryUnit {
|
|
key: filename.to_string(),
|
|
content: content.to_string(),
|
|
marker_links: Vec::new(),
|
|
md_links,
|
|
causes: Vec::new(),
|
|
state: None,
|
|
source_ref,
|
|
}];
|
|
}
|
|
|
|
let mut units = Vec::new();
|
|
|
|
let first_start = markers[0].0;
|
|
let pre_content = content[..first_start].trim();
|
|
if !pre_content.is_empty() {
|
|
let source_ref = find_source(pre_content);
|
|
let md_links = extract_md_links(pre_content, md_link_re, filename);
|
|
units.push(MemoryUnit {
|
|
key: filename.to_string(),
|
|
content: pre_content.to_string(),
|
|
marker_links: Vec::new(),
|
|
md_links,
|
|
causes: Vec::new(),
|
|
state: None,
|
|
source_ref,
|
|
});
|
|
}
|
|
|
|
for (i, (_, end, attrs)) in markers.iter().enumerate() {
|
|
let unit_end = if i + 1 < markers.len() {
|
|
markers[i + 1].0
|
|
} else {
|
|
content.len()
|
|
};
|
|
let unit_content = content[*end..unit_end].trim();
|
|
|
|
let id = attrs.get("id").cloned().unwrap_or_default();
|
|
let key = if id.is_empty() {
|
|
format!("{}#unnamed-{}", filename, i)
|
|
} else {
|
|
format!("{}#{}", filename, id)
|
|
};
|
|
|
|
let marker_links = attrs.get("links")
|
|
.map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect())
|
|
.unwrap_or_default();
|
|
|
|
let causes = attrs.get("causes")
|
|
.map(|l| l.split(',').map(|s| normalize_link(s.trim(), filename)).collect())
|
|
.unwrap_or_default();
|
|
|
|
let state = attrs.get("state").cloned();
|
|
let source_ref = find_source(unit_content);
|
|
let md_links = extract_md_links(unit_content, md_link_re, filename);
|
|
|
|
units.push(MemoryUnit {
|
|
key,
|
|
content: unit_content.to_string(),
|
|
marker_links,
|
|
md_links,
|
|
causes,
|
|
state,
|
|
source_ref,
|
|
});
|
|
}
|
|
|
|
units
|
|
}
|
|
|
|
fn parse_marker_attrs(attrs_str: &str) -> HashMap<String, String> {
|
|
static ATTR_RE: OnceLock<Regex> = OnceLock::new();
|
|
let attr_re = ATTR_RE.get_or_init(|| Regex::new(r"(\w+)\s*=\s*(\S+)").unwrap());
|
|
let mut attrs = HashMap::new();
|
|
for cap in attr_re.captures_iter(attrs_str) {
|
|
attrs.insert(cap[1].to_string(), cap[2].to_string());
|
|
}
|
|
attrs
|
|
}
|
|
|
|
fn extract_md_links(content: &str, re: &Regex, source_file: &str) -> Vec<String> {
|
|
re.captures_iter(content)
|
|
.map(|cap| normalize_link(&cap[1], source_file))
|
|
.filter(|link| !link.starts_with(source_file) || link.contains('#'))
|
|
.collect()
|
|
}
|
|
|
|
/// Canonicalizes a link target relative to the file it appears in.
///
/// A bare fragment (`#section`) resolves against the source file's key.
/// Otherwise directory components and a trailing `.md` extension are
/// stripped, so every target becomes a bare file key with an optional
/// `#fragment` suffix.
fn normalize_link(target: &str, source_file: &str) -> String {
    let source_key = source_file.strip_suffix(".md").unwrap_or(source_file);

    // A fragment-only target points at a section of the current file.
    if let Some(section) = target.strip_prefix('#') {
        return format!("{}#{}", source_key, section);
    }

    // Split off the fragment (if any) at the first '#'.
    let (path_part, fragment) = match target.split_once('#') {
        Some((path, frag)) => (path, Some(frag)),
        None => (target, None),
    };

    // Reduce the path to its final component; fall back to the raw path when
    // there is no file-name component (e.g. "..").
    let last = Path::new(path_part)
        .file_name()
        .map(|f| f.to_string_lossy().into_owned())
        .unwrap_or_else(|| path_part.to_string());
    let key = last.strip_suffix(".md").unwrap_or(&last);

    match fragment {
        Some(frag) => format!("{}#{}", key, frag),
        None => key.to_string(),
    }
}
|