From 3dddc40841f45f363efb10c8a23c765e148f4e24 Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Sun, 8 Mar 2026 21:22:05 -0400 Subject: [PATCH] fix unwrap-on-partial_cmp, dedup helpers, O(1) relation dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace all partial_cmp().unwrap() with total_cmp() in spectral.rs and knowledge.rs — eliminates potential panics on NaN without changing behavior for normal floats. Use existing weighted_distance() and eigenvalue_weights() helpers in nearest_neighbors() and nearest_to_seeds() instead of inlining the same distance computation. Move parse_timestamp_to_epoch() from enrich.rs to util.rs — was duplicated logic, now shared. Replace O(n²) relation existence check in init_from_markdown() with a HashSet of (source, target) UUID pairs. With 26K relations this was scanning linearly for every link in every markdown unit. --- poc-memory/src/enrich.rs | 15 +----------- poc-memory/src/knowledge.rs | 4 ++-- poc-memory/src/spectral.rs | 47 ++++++++++--------------------------- poc-memory/src/store/mod.rs | 37 +++++++++++++++++++---------- poc-memory/src/util.rs | 15 ++++++++++++ 5 files changed, 55 insertions(+), 63 deletions(-) diff --git a/poc-memory/src/enrich.rs b/poc-memory/src/enrich.rs index e36c48c..559aea6 100644 --- a/poc-memory/src/enrich.rs +++ b/poc-memory/src/enrich.rs @@ -18,20 +18,7 @@ use std::hash::{Hash, Hasher}; use crate::store::StoreView; -/// Parse a timestamp string like "2026-03-05T19:56" to unix epoch seconds. -fn parse_timestamp_to_epoch(ts: &str) -> Option { - use chrono::{Local, NaiveDateTime, TimeZone}; - // Try common formats - let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"]; - for fmt in &formats { - if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) { - if let Some(dt) = Local.from_local_datetime(&ndt).earliest() { - return Some(dt.timestamp()); - } - } - } - None -} +use crate::util::parse_timestamp_to_epoch; /// Compute the store dedup key for a transcript file. /// This is the same key experience_mine uses to mark a transcript as mined. diff --git a/poc-memory/src/knowledge.rs b/poc-memory/src/knowledge.rs index db53961..a7471ad 100644 --- a/poc-memory/src/knowledge.rs +++ b/poc-memory/src/knowledge.rs @@ -522,7 +522,7 @@ fn select_extractor_clusters(_store: &Store, n: usize) -> Vec> { .map(|k| (spectral_distance(&embedding, seed, k), **k)) .filter(|(d, _)| d.is_finite()) .collect(); - distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap()); + distances.sort_by(|a, b| a.0.total_cmp(&b.0)); let cluster: Vec = std::iter::once((*seed).clone()) .chain(distances.iter().take(cluster_size - 1).map(|(_, k)| (*k).clone())) @@ -576,7 +576,7 @@ fn select_connector_pairs(store: &Store, graph: &Graph, n: usize) -> Vec<(Vec return vec![], }; - // Weight by inverse eigenvalue (coarser axes matter more) - let weights: Vec = emb.eigenvalues.iter() - .map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 }) - .collect(); + let weights = eigenvalue_weights(&emb.eigenvalues); let mut distances: Vec<(String, f64)> = emb.coords.iter() .filter(|(k, _)| k.as_str() != key) - .map(|(k, coords)| { - let dist: f64 = target.iter() - .zip(coords.iter()) - .zip(weights.iter()) - .map(|((&a, &b), &w)| w * (a - b) * (a - b)) - .sum::() - .sqrt(); - (k.clone(), dist) - }) + .map(|(k, coords)| (k.clone(), weighted_distance(target, coords, &weights))) .collect(); - distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + distances.sort_by(|a, b| a.1.total_cmp(&b.1)); distances.truncate(k); distances } @@ -298,7 +287,7 @@ pub fn nearest_to_seeds( seeds: &[&str], k: usize, ) -> Vec<(String, f64)> { - let seed_set: std::collections::HashSet<&str> = seeds.iter().copied().collect(); + let seed_set: HashSet<&str> = seeds.iter().copied().collect(); let seed_coords: Vec<&Vec> = seeds.iter() .filter_map(|s| emb.coords.get(*s)) @@ -307,29 +296,19 @@ pub fn nearest_to_seeds( return vec![]; } - let weights: Vec = emb.eigenvalues.iter() - .map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 }) - .collect(); + let weights = eigenvalue_weights(&emb.eigenvalues); let mut distances: Vec<(String, f64)> = emb.coords.iter() .filter(|(k, _)| !seed_set.contains(k.as_str())) .map(|(k, coords)| { - // Distance to nearest seed let min_dist = seed_coords.iter() - .map(|sc| { - coords.iter() - .zip(sc.iter()) - .zip(weights.iter()) - .map(|((&a, &b), &w)| w * (a - b) * (a - b)) - .sum::() - .sqrt() - }) + .map(|sc| weighted_distance(coords, sc, &weights)) .fold(f64::MAX, f64::min); (k.clone(), min_dist) }) .collect(); - distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap()); + distances.sort_by(|a, b| a.1.total_cmp(&b.1)); distances.truncate(k); distances } @@ -423,7 +402,7 @@ pub fn analyze_positions( // Median distance per community for outlier scoring let medians: HashMap = by_community.into_iter() .map(|(comm, mut dists)| { - dists.sort_by(|a, b| a.partial_cmp(b).unwrap()); + dists.sort_by(|a, b| a.total_cmp(b)); let median = if dists.is_empty() { 1.0 } else if dists.len() % 2 == 0 { @@ -442,7 +421,7 @@ pub fn analyze_positions( let (nearest_community, dist_to_nearest) = centers.iter() .filter(|(&c, _)| c != comm) .map(|(&c, center)| (c, weighted_distance(coords, center, &weights))) - .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap()) + .min_by(|a, b| a.1.total_cmp(&b.1)) .unwrap_or((comm, f64::MAX)); let median = medians.get(&comm).copied().unwrap_or(1.0); @@ -461,7 +440,7 @@ pub fn analyze_positions( }) .collect(); - positions.sort_by(|a, b| b.outlier_score.partial_cmp(&a.outlier_score).unwrap()); + positions.sort_by(|a, b| b.outlier_score.total_cmp(&a.outlier_score)); positions } @@ -494,7 +473,7 @@ pub fn unlinked_neighbors( } } - pairs.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap()); + pairs.sort_by(|a, b| a.2.total_cmp(&b.2)); pairs.truncate(max_pairs); pairs } @@ -560,6 +539,6 @@ pub fn dominant_dimensions(emb: &SpectralEmbedding, keys: &[&str]) -> Vec<(usize }) .collect(); - dim_loading.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + dim_loading.sort_by(|a, b| b.1.total_cmp(&a.1)); dim_loading } diff --git a/poc-memory/src/store/mod.rs b/poc-memory/src/store/mod.rs index a49a2f6..ee83ef5 100644 --- a/poc-memory/src/store/mod.rs +++ b/poc-memory/src/store/mod.rs @@ -79,8 +79,6 @@ impl Store { } } - - /// Resolve a link target to (key, uuid). fn resolve_node_uuid(&self, target: &str) -> Option<(String, [u8; 16])> { let bare = strip_md_suffix(target); @@ -103,12 +101,28 @@ impl Store { let dir = memory_dir(); let mut count = 0; if dir.exists() { - count = self.scan_dir_for_init(&dir)?; + // Build edge set for O(1) dedup during ingestion + let mut edge_set = self.build_edge_set(); + count = self.scan_dir_for_init(&dir, &mut edge_set)?; } Ok(count) } - fn scan_dir_for_init(&mut self, dir: &Path) -> Result { + /// Build a HashSet of existing (source, target) UUID pairs for O(1) dedup. + fn build_edge_set(&self) -> std::collections::HashSet<([u8; 16], [u8; 16])> { + let mut set = std::collections::HashSet::with_capacity(self.relations.len() * 2); + for r in &self.relations { + set.insert((r.source, r.target)); + set.insert((r.target, r.source)); + } + set + } + + fn scan_dir_for_init( + &mut self, + dir: &Path, + edge_set: &mut std::collections::HashSet<([u8; 16], [u8; 16])>, + ) -> Result { let mut count = 0; let entries = fs::read_dir(dir) .map_err(|e| format!("read dir {}: {}", dir.display(), e))?; @@ -116,7 +130,7 @@ impl Store { for entry in entries.flatten() { let path = entry.path(); if path.is_dir() { - count += self.scan_dir_for_init(&path)?; + count += self.scan_dir_for_init(&path, edge_set)?; continue; } let Some(ext) = path.extension() else { continue }; @@ -140,10 +154,9 @@ impl Store { for link in unit.marker_links.iter().chain(unit.md_links.iter()) { let Some((key, uuid)) = self.resolve_node_uuid(link) else { continue }; - let exists = self.relations.iter().any(|r| - (r.source == source_uuid && r.target == uuid) || - (r.source == uuid && r.target == source_uuid)); - if !exists { + if !edge_set.contains(&(source_uuid, uuid)) { + edge_set.insert((source_uuid, uuid)); + edge_set.insert((uuid, source_uuid)); new_relations.push(new_relation( source_uuid, uuid, RelationType::Link, 1.0, &unit.key, &key, @@ -153,10 +166,8 @@ impl Store { for cause in &unit.causes { let Some((key, uuid)) = self.resolve_node_uuid(cause) else { continue }; - let exists = self.relations.iter().any(|r| - r.source == uuid && r.target == source_uuid - && r.rel_type == RelationType::Causal); - if !exists { + if !edge_set.contains(&(uuid, source_uuid)) { + edge_set.insert((uuid, source_uuid)); new_relations.push(new_relation( uuid, source_uuid, RelationType::Causal, 1.0, &key, &unit.key, diff --git a/poc-memory/src/util.rs b/poc-memory/src/util.rs index 8799e12..8c475f7 100644 --- a/poc-memory/src/util.rs +++ b/poc-memory/src/util.rs @@ -28,3 +28,18 @@ pub fn truncate(text: &str, max_len: usize, suffix: &str) -> String { pub fn first_n_chars(s: &str, n: usize) -> String { s.chars().take(n).collect() } + +/// Parse a timestamp string to unix epoch seconds. +/// Handles: "2026-03-05T19:56:00", "2026-03-05T19:56", "2026-03-05 19:56:00", "2026-03-05 19:56" +pub fn parse_timestamp_to_epoch(ts: &str) -> Option { + use chrono::{Local, NaiveDateTime, TimeZone}; + let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"]; + for fmt in &formats { + if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) { + if let Some(dt) = Local.from_local_datetime(&ndt).earliest() { + return Some(dt.timestamp()); + } + } + } + None +}