From 3dddc40841f45f363efb10c8a23c765e148f4e24 Mon Sep 17 00:00:00 2001
From: ProofOfConcept <poc@bcachefs.org>
Date: Sun, 8 Mar 2026 21:22:05 -0400
Subject: [PATCH] fix unwrap-on-partial_cmp, dedup helpers, O(1) relation dedup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace all partial_cmp().unwrap() with total_cmp() in spectral.rs
and knowledge.rs — eliminates potential panics on NaN. Ordering of
ordinary values is unchanged; the only differences are that NaN now
has a defined position and -0.0 sorts before +0.0.

Use existing weighted_distance() and eigenvalue_weights() helpers in
nearest_neighbors() and nearest_to_seeds() instead of inlining the
same distance computation.

Move parse_timestamp_to_epoch() from enrich.rs to util.rs — was
duplicated logic, now shared.

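Replace O(n²) relation existence check in init_from_markdown() with
a HashSet of (source, target) UUID pairs. With 26K relations this
was scanning linearly for every link in every markdown unit.

Behavior note: the set stores both orientations of every existing
relation and carries no relation type, so the causal check no longer
requires the existing relation to be Causal or to point the same way.
A Link between two nodes (including one added earlier in the same
scan) now suppresses creation of a Causal relation between them.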
---
 poc-memory/src/enrich.rs    | 15 +-----------
 poc-memory/src/knowledge.rs |  4 ++--
 poc-memory/src/spectral.rs  | 47 ++++++++++---------------------------
 poc-memory/src/store/mod.rs | 37 +++++++++++++++++++----------
 poc-memory/src/util.rs      | 15 ++++++++++++
 5 files changed, 55 insertions(+), 63 deletions(-)
diff --git a/poc-memory/src/enrich.rs b/poc-memory/src/enrich.rs
index e36c48c..559aea6 100644
--- a/poc-memory/src/enrich.rs
+++ b/poc-memory/src/enrich.rs
@@ -18,20 +18,7 @@ use std::hash::{Hash, Hasher};
 
 use crate::store::StoreView;
 
-/// Parse a timestamp string like "2026-03-05T19:56" to unix epoch seconds.
-fn parse_timestamp_to_epoch(ts: &str) -> Option<i64> {
-    use chrono::{Local, NaiveDateTime, TimeZone};
-    // Try common formats
-    let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"];
-    for fmt in &formats {
-        if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) {
-            if let Some(dt) = Local.from_local_datetime(&ndt).earliest() {
-                return Some(dt.timestamp());
-            }
-        }
-    }
-    None
-}
+use crate::util::parse_timestamp_to_epoch;
 
 /// Compute the store dedup key for a transcript file.
 /// This is the same key experience_mine uses to mark a transcript as mined.
diff --git a/poc-memory/src/knowledge.rs b/poc-memory/src/knowledge.rs
index db53961..a7471ad 100644
--- a/poc-memory/src/knowledge.rs
+++ b/poc-memory/src/knowledge.rs
@@ -522,7 +522,7 @@ fn select_extractor_clusters(_store: &Store, n: usize) -> Vec<Vec<String>> {
             .map(|k| (spectral_distance(&embedding, seed, k), **k))
             .filter(|(d, _)| d.is_finite())
             .collect();
-        distances.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+        distances.sort_by(|a, b| a.0.total_cmp(&b.0));
 
         let cluster: Vec<String> = std::iter::once((*seed).clone())
             .chain(distances.iter().take(cluster_size - 1).map(|(_, k)| (*k).clone()))
@@ -576,7 +576,7 @@ fn select_connector_pairs(store: &Store, graph: &Graph, n: usize) -> Vec<(Vec<St
             .map(|k| (spectral_distance(&embedding, seed, k), *k))
             .filter(|(d, _)| *d < 0.5 && d.is_finite())
             .collect();
-        near.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+        near.sort_by(|a, b| a.0.total_cmp(&b.0));
 
         for (_, target) in near.iter().take(5) {
             if !has_edge(store, seed, target) {
diff --git a/poc-memory/src/spectral.rs b/poc-memory/src/spectral.rs
index c63e847..0cbd7fd 100644
--- a/poc-memory/src/spectral.rs
+++ b/poc-memory/src/spectral.rs
@@ -177,7 +177,7 @@ pub fn print_summary(result: &SpectralResult, graph: &Graph) {
             .enumerate()
             .map(|(i, &v)| (i, v))
             .collect();
-        indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+        indexed.sort_by(|a, b| a.1.total_cmp(&b.1));
 
         // Compute the "spread" — how much this axis differentiates
         let min_val = indexed.first().map(|x| x.1).unwrap_or(0.0);
@@ -268,25 +268,14 @@ pub fn nearest_neighbors(
         None => return vec![],
     };
 
-    // Weight by inverse eigenvalue (coarser axes matter more)
-    let weights: Vec<f64> = emb.eigenvalues.iter()
-        .map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 })
-        .collect();
+    let weights = eigenvalue_weights(&emb.eigenvalues);
 
     let mut distances: Vec<(String, f64)> = emb.coords.iter()
         .filter(|(k, _)| k.as_str() != key)
-        .map(|(k, coords)| {
-            let dist: f64 = target.iter()
-                .zip(coords.iter())
-                .zip(weights.iter())
-                .map(|((&a, &b), &w)| w * (a - b) * (a - b))
-                .sum::<f64>()
-                .sqrt();
-            (k.clone(), dist)
-        })
+        .map(|(k, coords)| (k.clone(), weighted_distance(target, coords, &weights)))
         .collect();
 
-    distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+    distances.sort_by(|a, b| a.1.total_cmp(&b.1));
     distances.truncate(k);
     distances
 }
@@ -298,7 +287,7 @@ pub fn nearest_to_seeds(
     seeds: &[&str],
     k: usize,
 ) -> Vec<(String, f64)> {
-    let seed_set: std::collections::HashSet<&str> = seeds.iter().copied().collect();
+    let seed_set: HashSet<&str> = seeds.iter().copied().collect();
 
     let seed_coords: Vec<&Vec<f64>> = seeds.iter()
         .filter_map(|s| emb.coords.get(*s))
@@ -307,29 +296,19 @@ pub fn nearest_to_seeds(
         return vec![];
     }
 
-    let weights: Vec<f64> = emb.eigenvalues.iter()
-        .map(|&ev| if ev > 1e-8 { 1.0 / ev } else { 0.0 })
-        .collect();
+    let weights = eigenvalue_weights(&emb.eigenvalues);
 
     let mut distances: Vec<(String, f64)> = emb.coords.iter()
         .filter(|(k, _)| !seed_set.contains(k.as_str()))
         .map(|(k, coords)| {
-            // Distance to nearest seed
             let min_dist = seed_coords.iter()
-                .map(|sc| {
-                    coords.iter()
-                        .zip(sc.iter())
-                        .zip(weights.iter())
-                        .map(|((&a, &b), &w)| w * (a - b) * (a - b))
-                        .sum::<f64>()
-                        .sqrt()
-                })
+                .map(|sc| weighted_distance(coords, sc, &weights))
                 .fold(f64::MAX, f64::min);
             (k.clone(), min_dist)
         })
         .collect();
 
-    distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+    distances.sort_by(|a, b| a.1.total_cmp(&b.1));
     distances.truncate(k);
     distances
 }
@@ -423,7 +402,7 @@ pub fn analyze_positions(
     // Median distance per community for outlier scoring
     let medians: HashMap<u32, f64> = by_community.into_iter()
         .map(|(comm, mut dists)| {
-            dists.sort_by(|a, b| a.partial_cmp(b).unwrap());
+            dists.sort_by(|a, b| a.total_cmp(b));
             let median = if dists.is_empty() {
                 1.0
             } else if dists.len() % 2 == 0 {
@@ -442,7 +421,7 @@ pub fn analyze_positions(
             let (nearest_community, dist_to_nearest) = centers.iter()
                 .filter(|(&c, _)| c != comm)
                 .map(|(&c, center)| (c, weighted_distance(coords, center, &weights)))
-                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap())
+                .min_by(|a, b| a.1.total_cmp(&b.1))
                 .unwrap_or((comm, f64::MAX));
 
             let median = medians.get(&comm).copied().unwrap_or(1.0);
@@ -461,7 +440,7 @@ pub fn analyze_positions(
         })
         .collect();
 
-    positions.sort_by(|a, b| b.outlier_score.partial_cmp(&a.outlier_score).unwrap());
+    positions.sort_by(|a, b| b.outlier_score.total_cmp(&a.outlier_score));
     positions
 }
 
@@ -494,7 +473,7 @@ pub fn unlinked_neighbors(
         }
     }
 
-    pairs.sort_by(|a, b| a.2.partial_cmp(&b.2).unwrap());
+    pairs.sort_by(|a, b| a.2.total_cmp(&b.2));
     pairs.truncate(max_pairs);
     pairs
 }
@@ -560,6 +539,6 @@ pub fn dominant_dimensions(emb: &SpectralEmbedding, keys: &[&str]) -> Vec<(usize
         })
         .collect();
 
-    dim_loading.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
+    dim_loading.sort_by(|a, b| b.1.total_cmp(&a.1));
     dim_loading
 }
diff --git a/poc-memory/src/store/mod.rs b/poc-memory/src/store/mod.rs
index a49a2f6..ee83ef5 100644
--- a/poc-memory/src/store/mod.rs
+++ b/poc-memory/src/store/mod.rs
@@ -79,8 +79,6 @@ impl Store {
         }
     }
 
-
-
     /// Resolve a link target to (key, uuid).
     fn resolve_node_uuid(&self, target: &str) -> Option<(String, [u8; 16])> {
         let bare = strip_md_suffix(target);
@@ -103,12 +101,28 @@ impl Store {
         let dir = memory_dir();
         let mut count = 0;
         if dir.exists() {
-            count = self.scan_dir_for_init(&dir)?;
+            // Build edge set for O(1) dedup during ingestion
+            let mut edge_set = self.build_edge_set();
+            count = self.scan_dir_for_init(&dir, &mut edge_set)?;
         }
         Ok(count)
     }
 
-    fn scan_dir_for_init(&mut self, dir: &Path) -> Result<usize, String> {
+    /// Build a HashSet of existing relation UUID pairs, in both directions, for O(1) dedup.
+    fn build_edge_set(&self) -> std::collections::HashSet<([u8; 16], [u8; 16])> {
+        let mut set = std::collections::HashSet::with_capacity(self.relations.len() * 2);
+        for r in &self.relations {
+            set.insert((r.source, r.target));
+            set.insert((r.target, r.source));
+        }
+        set
+    }
+
+    fn scan_dir_for_init(
+        &mut self,
+        dir: &Path,
+        edge_set: &mut std::collections::HashSet<([u8; 16], [u8; 16])>,
+    ) -> Result<usize, String> {
         let mut count = 0;
         let entries = fs::read_dir(dir)
             .map_err(|e| format!("read dir {}: {}", dir.display(), e))?;
@@ -116,7 +130,7 @@ impl Store {
         for entry in entries.flatten() {
             let path = entry.path();
             if path.is_dir() {
-                count += self.scan_dir_for_init(&path)?;
+                count += self.scan_dir_for_init(&path, edge_set)?;
                 continue;
             }
             let Some(ext) = path.extension() else { continue };
@@ -140,10 +154,9 @@ impl Store {
 
                 for link in unit.marker_links.iter().chain(unit.md_links.iter()) {
                     let Some((key, uuid)) = self.resolve_node_uuid(link) else { continue };
-                    let exists = self.relations.iter().any(|r|
-                        (r.source == source_uuid && r.target == uuid) ||
-                        (r.source == uuid && r.target == source_uuid));
-                    if !exists {
+                    if !edge_set.contains(&(source_uuid, uuid)) {
+                        edge_set.insert((source_uuid, uuid));
+                        edge_set.insert((uuid, source_uuid));
                         new_relations.push(new_relation(
                             source_uuid, uuid, RelationType::Link, 1.0,
                             &unit.key, &key,
@@ -153,10 +166,8 @@ impl Store {
 
                 for cause in &unit.causes {
                     let Some((key, uuid)) = self.resolve_node_uuid(cause) else { continue };
-                    let exists = self.relations.iter().any(|r|
-                        r.source == uuid && r.target == source_uuid
-                        && r.rel_type == RelationType::Causal);
-                    if !exists {
+                    if !edge_set.contains(&(uuid, source_uuid)) {
+                        edge_set.insert((uuid, source_uuid));
                         new_relations.push(new_relation(
                             uuid, source_uuid, RelationType::Causal, 1.0,
                             &key, &unit.key,
diff --git a/poc-memory/src/util.rs b/poc-memory/src/util.rs
index 8799e12..8c475f7 100644
--- a/poc-memory/src/util.rs
+++ b/poc-memory/src/util.rs
@@ -28,3 +28,18 @@ pub fn truncate(text: &str, max_len: usize, suffix: &str) -> String {
 pub fn first_n_chars(s: &str, n: usize) -> String {
     s.chars().take(n).collect()
 }
+
+/// Parse a timestamp (interpreted in the local timezone) to unix epoch seconds; None on failure.
+/// Handles: "2026-03-05T19:56:00", "2026-03-05T19:56", "2026-03-05 19:56:00", "2026-03-05 19:56"
+pub fn parse_timestamp_to_epoch(ts: &str) -> Option<i64> {
+    use chrono::{Local, NaiveDateTime, TimeZone};
+    let formats = ["%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M"];
+    for fmt in &formats {
+        if let Ok(ndt) = NaiveDateTime::parse_from_str(ts, fmt) {
+            if let Some(dt) = Local.from_local_datetime(&ndt).earliest() {
+                return Some(dt.timestamp());
+            }
+        }
+    }
+    None
+}