Fix partial_cmp().unwrap() panics, deduplicate helper code, make relation dedup O(1)
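Replace every partial_cmp().unwrap() with total_cmp() in spectral.rs and knowledge.rs. This removes the panic that partial_cmp().unwrap() raises when a NaN is compared; the ordering of ordinary finite values is unchanged (total_cmp additionally orders -0.0 before +0.0 and gives NaN a fixed position). Use the existing weighted_distance() and eigenvalue_weights() helpers in nearest_neighbors() and nearest_to_seeds() instead of inlining the same distance computation. Move parse_timestamp_to_epoch() from enrich.rs to util.rs so the timestamp-parsing logic lives in one shared place instead of being duplicated. Replace the O(n²) relation-existence check in init_from_markdown() with a HashSet of (source, target) UUID pairs: with 26K relations, the old code scanned the whole relation list linearly for every link in every markdown unit. Note that the set is keyed only on the UUID pair, so the causal-relation check no longer compares rel_type the way the old linear scan did.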
This commit is contained in:
parent
2f2c84e1c0
commit
3dddc40841
5 changed files with 55 additions and 63 deletions
|
|
@ -18,20 +18,7 @@ use std::hash::{Hash, Hasher};
|
||||||
|
|
||||||
use crate::store::StoreView;
|
use crate::store::StoreView;
|
||||||
|
|
||||||
/// Parse a timestamp string like "2026-03-05T19:56" to unix epoch seconds.
|
use crate::util::parse_timestamp_to_epoch;
|
||||||
fn parse_timestamp_to_epoch(ts: &str) -> Option<i64> {
|
|
||||||
use chrono::{Local, NaiveDateTime, TimeZone};
|
|
||||||
// Try common formats
|
|
||||||
let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"];
|
|
||||||
for fmt in &formats {
|
|
||||||
if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) {
|
|
||||||
if let Some(dt) = Local.from_local_datetime(&ndt).earliest() {
|
|
||||||
return Some(dt.timestamp());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compute the store dedup key for a transcript file.
|
/// Compute the store dedup key for a transcript file.
|
||||||
/// This is the same key experience_mine uses to mark a transcript as mined.
|
/// This is the same key experience_mine uses to mark a transcript as mined.
|
||||||
|
|
|
||||||
|
|
@ -522,7 +522,7 @@ fn select_extractor_clusters(_store: &Store, n: usize) -> Vec<Vec<String>> {
|
||||||
.map(|k| (spectral_distance(&embedding, seed, k), **k))
|
.map(|k| (spectral_distance(&embedding, seed, k), **k))
|
||||||
.filter(|(d, _)| d.is_finite())
|
.filter(|(d, _)| d.is_finite())
|
||||||
.collect();
|
.collect();
|
||||||
distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
distances.sort_by(|a, b| a.0.total_cmp(&b.0));
|
||||||
|
|
||||||
let cluster: Vec<String> = std::iter::once((*seed).clone())
|
let cluster: Vec<String> = std::iter::once((*seed).clone())
|
||||||
.chain(distances.iter().take(cluster_size - 1).map(|(_, k)| (*k).clone()))
|
.chain(distances.iter().take(cluster_size - 1).map(|(_, k)| (*k).clone()))
|
||||||
|
|
@ -576,7 +576,7 @@ fn select_connector_pairs(store: &Store, graph: &Graph, n: usize) -> Vec<(Vec<St
|
||||||
.map(|k| (spectral_distance(&embedding, seed, k), *k))
|
.map(|k| (spectral_distance(&embedding, seed, k), *k))
|
||||||
.filter(|(d, _)| *d < 0.5 && d.is_finite())
|
.filter(|(d, _)| *d < 0.5 && d.is_finite())
|
||||||
.collect();
|
.collect();
|
||||||
near.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
near.sort_by(|a, b| a.0.total_cmp(&b.0));
|
||||||
|
|
||||||
for (_, target) in near.iter().take(5) {
|
for (_, target) in near.iter().take(5) {
|
||||||
if !has_edge(store, seed, target) {
|
if !has_edge(store, seed, target) {
|
||||||
|
|
|
||||||
|
|
@ -177,7 +177,7 @@ pub fn print_summary(result: &SpectralResult, graph: &Graph) {
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, &v)| (i, v))
|
.map(|(i, &v)| (i, v))
|
||||||
.collect();
|
.collect();
|
||||||
indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
indexed.sort_by(|a, b| a.1.total_cmp(&b.1));
|
||||||
|
|
||||||
// Compute the "spread" — how much this axis differentiates
|
// Compute the "spread" — how much this axis differentiates
|
||||||
let min_val = indexed.first().map(|x| x.1).unwrap_or(0.0);
|
let min_val = indexed.first().map(|x| x.1).unwrap_or(0.0);
|
||||||
|
|
@ -268,25 +268,14 @@ pub fn nearest_neighbors(
|
||||||
None => return vec![],
|
None => return vec![],
|
||||||
};
|
};
|
||||||
|
|
||||||
// Weight by inverse eigenvalue (coarser axes matter more)
|
let weights = eigenvalue_weights(&emb.eigenvalues);
|
||||||
let weights: Vec<f64> = emb.eigenvalues.iter()
|
|
||||||
.map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 })
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut distances: Vec<(String, f64)> = emb.coords.iter()
|
let mut distances: Vec<(String, f64)> = emb.coords.iter()
|
||||||
.filter(|(k, _)| k.as_str() != key)
|
.filter(|(k, _)| k.as_str() != key)
|
||||||
.map(|(k, coords)| {
|
.map(|(k, coords)| (k.clone(), weighted_distance(target, coords, &weights)))
|
||||||
let dist: f64 = target.iter()
|
|
||||||
.zip(coords.iter())
|
|
||||||
.zip(weights.iter())
|
|
||||||
.map(|((&a, &b), &w)| w * (a - b) * (a - b))
|
|
||||||
.sum::<f64>()
|
|
||||||
.sqrt();
|
|
||||||
(k.clone(), dist)
|
|
||||||
})
|
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
distances.sort_by(|a, b| a.1.total_cmp(&b.1));
|
||||||
distances.truncate(k);
|
distances.truncate(k);
|
||||||
distances
|
distances
|
||||||
}
|
}
|
||||||
|
|
@ -298,7 +287,7 @@ pub fn nearest_to_seeds(
|
||||||
seeds: &[&str],
|
seeds: &[&str],
|
||||||
k: usize,
|
k: usize,
|
||||||
) -> Vec<(String, f64)> {
|
) -> Vec<(String, f64)> {
|
||||||
let seed_set: std::collections::HashSet<&str> = seeds.iter().copied().collect();
|
let seed_set: HashSet<&str> = seeds.iter().copied().collect();
|
||||||
|
|
||||||
let seed_coords: Vec<&Vec<f64>> = seeds.iter()
|
let seed_coords: Vec<&Vec<f64>> = seeds.iter()
|
||||||
.filter_map(|s| emb.coords.get(*s))
|
.filter_map(|s| emb.coords.get(*s))
|
||||||
|
|
@ -307,29 +296,19 @@ pub fn nearest_to_seeds(
|
||||||
return vec![];
|
return vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
let weights: Vec<f64> = emb.eigenvalues.iter()
|
let weights = eigenvalue_weights(&emb.eigenvalues);
|
||||||
.map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 })
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut distances: Vec<(String, f64)> = emb.coords.iter()
|
let mut distances: Vec<(String, f64)> = emb.coords.iter()
|
||||||
.filter(|(k, _)| !seed_set.contains(k.as_str()))
|
.filter(|(k, _)| !seed_set.contains(k.as_str()))
|
||||||
.map(|(k, coords)| {
|
.map(|(k, coords)| {
|
||||||
// Distance to nearest seed
|
|
||||||
let min_dist = seed_coords.iter()
|
let min_dist = seed_coords.iter()
|
||||||
.map(|sc| {
|
.map(|sc| weighted_distance(coords, sc, &weights))
|
||||||
coords.iter()
|
|
||||||
.zip(sc.iter())
|
|
||||||
.zip(weights.iter())
|
|
||||||
.map(|((&a, &b), &w)| w * (a - b) * (a - b))
|
|
||||||
.sum::<f64>()
|
|
||||||
.sqrt()
|
|
||||||
})
|
|
||||||
.fold(f64::MAX, f64::min);
|
.fold(f64::MAX, f64::min);
|
||||||
(k.clone(), min_dist)
|
(k.clone(), min_dist)
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
|
distances.sort_by(|a, b| a.1.total_cmp(&b.1));
|
||||||
distances.truncate(k);
|
distances.truncate(k);
|
||||||
distances
|
distances
|
||||||
}
|
}
|
||||||
|
|
@ -423,7 +402,7 @@ pub fn analyze_positions(
|
||||||
// Median distance per community for outlier scoring
|
// Median distance per community for outlier scoring
|
||||||
let medians: HashMap<u32, f64> = by_community.into_iter()
|
let medians: HashMap<u32, f64> = by_community.into_iter()
|
||||||
.map(|(comm, mut dists)| {
|
.map(|(comm, mut dists)| {
|
||||||
dists.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
dists.sort_by(|a, b| a.total_cmp(b));
|
||||||
let median = if dists.is_empty() {
|
let median = if dists.is_empty() {
|
||||||
1.0
|
1.0
|
||||||
} else if dists.len() % 2 == 0 {
|
} else if dists.len() % 2 == 0 {
|
||||||
|
|
@ -442,7 +421,7 @@ pub fn analyze_positions(
|
||||||
let (nearest_community, dist_to_nearest) = centers.iter()
|
let (nearest_community, dist_to_nearest) = centers.iter()
|
||||||
.filter(|(&c, _)| c != comm)
|
.filter(|(&c, _)| c != comm)
|
||||||
.map(|(&c, center)| (c, weighted_distance(coords, center, &weights)))
|
.map(|(&c, center)| (c, weighted_distance(coords, center, &weights)))
|
||||||
.min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
|
.min_by(|a, b| a.1.total_cmp(&b.1))
|
||||||
.unwrap_or((comm, f64::MAX));
|
.unwrap_or((comm, f64::MAX));
|
||||||
|
|
||||||
let median = medians.get(&comm).copied().unwrap_or(1.0);
|
let median = medians.get(&comm).copied().unwrap_or(1.0);
|
||||||
|
|
@ -461,7 +440,7 @@ pub fn analyze_positions(
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
positions.sort_by(|a, b| b.outlier_score.partial_cmp(&a.outlier_score).unwrap());
|
positions.sort_by(|a, b| b.outlier_score.total_cmp(&a.outlier_score));
|
||||||
positions
|
positions
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -494,7 +473,7 @@ pub fn unlinked_neighbors(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pairs.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap());
|
pairs.sort_by(|a, b| a.2.total_cmp(&b.2));
|
||||||
pairs.truncate(max_pairs);
|
pairs.truncate(max_pairs);
|
||||||
pairs
|
pairs
|
||||||
}
|
}
|
||||||
|
|
@ -560,6 +539,6 @@ pub fn dominant_dimensions(emb: &SpectralEmbedding, keys: &[&str]) -> Vec<(usize
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
dim_loading.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
|
dim_loading.sort_by(|a, b| b.1.total_cmp(&a.1));
|
||||||
dim_loading
|
dim_loading
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -79,8 +79,6 @@ impl Store {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// Resolve a link target to (key, uuid).
|
/// Resolve a link target to (key, uuid).
|
||||||
fn resolve_node_uuid(&self, target: &str) -> Option<(String, [u8; 16])> {
|
fn resolve_node_uuid(&self, target: &str) -> Option<(String, [u8; 16])> {
|
||||||
let bare = strip_md_suffix(target);
|
let bare = strip_md_suffix(target);
|
||||||
|
|
@ -103,12 +101,28 @@ impl Store {
|
||||||
let dir = memory_dir();
|
let dir = memory_dir();
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
if dir.exists() {
|
if dir.exists() {
|
||||||
count = self.scan_dir_for_init(&dir)?;
|
// Build edge set for O(1) dedup during ingestion
|
||||||
|
let mut edge_set = self.build_edge_set();
|
||||||
|
count = self.scan_dir_for_init(&dir, &mut edge_set)?;
|
||||||
}
|
}
|
||||||
Ok(count)
|
Ok(count)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scan_dir_for_init(&mut self, dir: &Path) -> Result<usize, String> {
|
/// Build a HashSet of existing (source, target) UUID pairs for O(1) dedup.
|
||||||
|
fn build_edge_set(&self) -> std::collections::HashSet<([u8; 16], [u8; 16])> {
|
||||||
|
let mut set = std::collections::HashSet::with_capacity(self.relations.len() * 2);
|
||||||
|
for r in &self.relations {
|
||||||
|
set.insert((r.source, r.target));
|
||||||
|
set.insert((r.target, r.source));
|
||||||
|
}
|
||||||
|
set
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scan_dir_for_init(
|
||||||
|
&mut self,
|
||||||
|
dir: &Path,
|
||||||
|
edge_set: &mut std::collections::HashSet<([u8; 16], [u8; 16])>,
|
||||||
|
) -> Result<usize, String> {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let entries = fs::read_dir(dir)
|
let entries = fs::read_dir(dir)
|
||||||
.map_err(|e| format!("read dir {}: {}", dir.display(), e))?;
|
.map_err(|e| format!("read dir {}: {}", dir.display(), e))?;
|
||||||
|
|
@ -116,7 +130,7 @@ impl Store {
|
||||||
for entry in entries.flatten() {
|
for entry in entries.flatten() {
|
||||||
let path = entry.path();
|
let path = entry.path();
|
||||||
if path.is_dir() {
|
if path.is_dir() {
|
||||||
count += self.scan_dir_for_init(&path)?;
|
count += self.scan_dir_for_init(&path, edge_set)?;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let Some(ext) = path.extension() else { continue };
|
let Some(ext) = path.extension() else { continue };
|
||||||
|
|
@ -140,10 +154,9 @@ impl Store {
|
||||||
|
|
||||||
for link in unit.marker_links.iter().chain(unit.md_links.iter()) {
|
for link in unit.marker_links.iter().chain(unit.md_links.iter()) {
|
||||||
let Some((key, uuid)) = self.resolve_node_uuid(link) else { continue };
|
let Some((key, uuid)) = self.resolve_node_uuid(link) else { continue };
|
||||||
let exists = self.relations.iter().any(|r|
|
if !edge_set.contains(&(source_uuid, uuid)) {
|
||||||
(r.source == source_uuid && r.target == uuid) ||
|
edge_set.insert((source_uuid, uuid));
|
||||||
(r.source == uuid && r.target == source_uuid));
|
edge_set.insert((uuid, source_uuid));
|
||||||
if !exists {
|
|
||||||
new_relations.push(new_relation(
|
new_relations.push(new_relation(
|
||||||
source_uuid, uuid, RelationType::Link, 1.0,
|
source_uuid, uuid, RelationType::Link, 1.0,
|
||||||
&unit.key, &key,
|
&unit.key, &key,
|
||||||
|
|
@ -153,10 +166,8 @@ impl Store {
|
||||||
|
|
||||||
for cause in &unit.causes {
|
for cause in &unit.causes {
|
||||||
let Some((key, uuid)) = self.resolve_node_uuid(cause) else { continue };
|
let Some((key, uuid)) = self.resolve_node_uuid(cause) else { continue };
|
||||||
let exists = self.relations.iter().any(|r|
|
if !edge_set.contains(&(uuid, source_uuid)) {
|
||||||
r.source == uuid && r.target == source_uuid
|
edge_set.insert((uuid, source_uuid));
|
||||||
&& r.rel_type == RelationType::Causal);
|
|
||||||
if !exists {
|
|
||||||
new_relations.push(new_relation(
|
new_relations.push(new_relation(
|
||||||
uuid, source_uuid, RelationType::Causal, 1.0,
|
uuid, source_uuid, RelationType::Causal, 1.0,
|
||||||
&key, &unit.key,
|
&key, &unit.key,
|
||||||
|
|
|
||||||
|
|
@ -28,3 +28,18 @@ pub fn truncate(text: &str, max_len: usize, suffix: &str) -> String {
|
||||||
pub fn first_n_chars(s: &str, n: usize) -> String {
|
pub fn first_n_chars(s: &str, n: usize) -> String {
|
||||||
s.chars().take(n).collect()
|
s.chars().take(n).collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Parse a timestamp string to unix epoch seconds.
|
||||||
|
/// Handles: "2026-03-05T19:56:00", "2026-03-05T19:56", "2026-03-05 19:56:00", "2026-03-05 19:56"
|
||||||
|
pub fn parse_timestamp_to_epoch(ts: &str) -> Option<i64> {
|
||||||
|
use chrono::{Local, NaiveDateTime, TimeZone};
|
||||||
|
let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"];
|
||||||
|
for fmt in &formats {
|
||||||
|
if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) {
|
||||||
|
if let Some(dt) = Local.from_local_datetime(&ndt).earliest() {
|
||||||
|
return Some(dt.timestamp());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue