context: heal pre-refactor image logs with token_count=0

Recompute image token counts from the persisted dimensions when loading
old logs that stored token_count=0. (Before pads were expanded
client-side, the server-authoritative count was applied after
AppendImage, so the stored count was left at 0.)

graph: cache neighbor sets for clustering coefficient

Pre-compute neighbor HashSets so the O(deg^2) triangle-counting
inner loop doesn't re-allocate on every (i, j) pair.
avg_clustering_coefficient() now builds the cache once instead of
O(N*deg) times.
This commit is contained in:
Kent Overstreet 2026-04-25 15:15:21 -04:00
commit 5210f7dd66
2 changed files with 60 additions and 20 deletions

View file

@ -125,7 +125,19 @@ impl<'de> Deserialize<'de> for NodeLeaf {
body: NodeBody, body: NodeBody,
timestamp: DateTime<Utc>, timestamp: DateTime<Utc>,
} }
let raw = Raw::deserialize(deserializer)?; let mut raw = Raw::deserialize(deserializer)?;
// Heal pre-refactor logs: Image leaves used to be deserialized
// with token_count=0 (server-authoritative count was applied
// after AppendImage). With pads now expanded client-side at
// construction, recompute from the persisted dimensions if
// the stored count is 0.
if let NodeBody::Image { orig_height, orig_width, token_count, .. }
= &mut raw.body
{
if *token_count == 0 {
*token_count = qwen3_image_token_count(*orig_height, *orig_width);
}
}
let token_ids = raw.body.compute_token_ids(); let token_ids = raw.body.compute_token_ids();
Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp }) Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp })
} }

View file

@ -40,6 +40,31 @@ pub struct Graph {
communities: HashMap<String, u32>, communities: HashMap<String, u32>,
} }
/// Local clustering coefficient of a node whose neighbor set is `nbrs`.
///
/// Computes `2 * T / (deg * (deg - 1))`, where `deg = nbrs.len()` and `T` is
/// the number of unordered neighbor pairs `(a, b)` for which the cached
/// neighbor set of `a` contains `b`. `cache` maps a node key to its
/// precomputed neighbor set, so no set is rebuilt inside the pair loop.
///
/// Returns `0.0` when `deg < 2`. A neighbor with no entry in `cache`
/// contributes no triangles for the pairs in which it is the lookup side.
fn cc_cached<'a>(
    nbrs: &HashSet<&'a str>,
    cache: &HashMap<&'a str, HashSet<&'a str>>,
) -> f32 {
    let deg = nbrs.len();
    if deg < 2 {
        return 0.0;
    }
    // Snapshot the set into a Vec so each unordered pair is visited once.
    let ordered: Vec<&str> = nbrs.iter().copied().collect();
    let mut triangles = 0usize;
    for (i, a) in ordered.iter().copied().enumerate() {
        // The lookup depends only on `a`, so do it once per outer neighbor
        // rather than once per (a, b) pair.
        if let Some(a_nbrs) = cache.get(a) {
            triangles += ordered[i + 1..]
                .iter()
                .filter(|&&b| a_nbrs.contains(b))
                .count();
        }
    }
    (2.0 * triangles as f32) / (deg as f32 * (deg as f32 - 1.0))
}
impl Graph { impl Graph {
pub fn nodes(&self) -> &HashSet<String> { pub fn nodes(&self) -> &HashSet<String> {
&self.keys &self.keys
@ -207,35 +232,38 @@ impl Graph {
/// cc(v) = 2E / (deg * (deg - 1)) /// cc(v) = 2E / (deg * (deg - 1))
pub fn clustering_coefficient(&self, key: &str) -> f32 { pub fn clustering_coefficient(&self, key: &str) -> f32 {
let neighbors = self.neighbor_keys(key); let neighbors = self.neighbor_keys(key);
let deg = neighbors.len(); if neighbors.len() < 2 {
if deg < 2 {
return 0.0; return 0.0;
} }
// Cache each neighbor's neighbor-set so the O(deg^2) inner loop
let neighbor_vec: Vec<&str> = neighbors.iter().copied().collect(); // doesn't re-allocate a HashSet on every (i, j) pair.
let mut triangles = 0u32; let cache: HashMap<&str, HashSet<&str>> = neighbors
for i in 0..neighbor_vec.len() { .iter()
for j in (i + 1)..neighbor_vec.len() { .map(|&n| (n, self.neighbor_keys(n)))
let ni_neighbors = self.neighbor_keys(neighbor_vec[i]); .collect();
if ni_neighbors.contains(neighbor_vec[j]) { cc_cached(&neighbors, &cache)
triangles += 1;
}
}
}
(2.0 * triangles as f32) / (deg as f32 * (deg as f32 - 1.0))
} }
/// Average clustering coefficient across all nodes with deg >= 2 /// Average clustering coefficient across all nodes with deg >= 2
pub fn avg_clustering_coefficient(&self) -> f32 { pub fn avg_clustering_coefficient(&self) -> f32 {
// Pre-compute neighbor sets for the whole graph once so we don't
// rebuild O(N * deg) HashSets across the outer loop.
let cache: HashMap<&str, HashSet<&str>> = self
.keys
.iter()
.map(|k| (k.as_str(), self.neighbor_keys(k)))
.collect();
let mut sum = 0.0f32; let mut sum = 0.0f32;
let mut count = 0u32; let mut count = 0u32;
for key in &self.keys { for key in &self.keys {
if self.degree(key) >= 2 { let nbrs = match cache.get(key.as_str()) {
sum += self.clustering_coefficient(key); Some(s) if s.len() >= 2 => s,
_ => continue,
};
sum += cc_cached(nbrs, &cache);
count += 1; count += 1;
} }
}
if count == 0 { 0.0 } else { sum / count as f32 } if count == 0 { 0.0 } else { sum / count as f32 }
} }