extract shared transcript parser and similarity matching helpers
- New agents/transcript.rs: shared JSONL parsing for enrich, fact_mine, and knowledge (was 3 separate implementations, ~150 lines duplicated) - New best_match() and section_children() helpers in neuro/rewrite.rs (was duplicated find-best-by-similarity loop + section collection) - Net -153 lines
This commit is contained in:
parent
7c491e92eb
commit
92f3ba5acf
6 changed files with 166 additions and 225 deletions
|
|
@ -72,53 +72,11 @@ pub fn is_transcript_mined_with_keys(mined: &HashSet<String>, path: &str) -> boo
|
||||||
/// Extract user/assistant messages with line numbers from a JSONL transcript.
|
/// Extract user/assistant messages with line numbers from a JSONL transcript.
|
||||||
/// (line_number, role, text, timestamp)
|
/// (line_number, role, text, timestamp)
|
||||||
pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
|
pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
|
||||||
let content = fs::read_to_string(jsonl_path)
|
let path = std::path::Path::new(jsonl_path);
|
||||||
.map_err(|e| format!("read {}: {}", jsonl_path, e))?;
|
let messages = super::transcript::parse_transcript(path)?;
|
||||||
|
Ok(messages.into_iter()
|
||||||
let mut messages = Vec::new();
|
.map(|m| (m.line, m.role, m.text, m.timestamp))
|
||||||
for (i, line) in content.lines().enumerate() {
|
.collect())
|
||||||
let obj: serde_json::Value = match serde_json::from_str(line) {
|
|
||||||
Ok(v) => v,
|
|
||||||
Err(_) => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
|
||||||
if msg_type != "user" && msg_type != "assistant" { continue; }
|
|
||||||
|
|
||||||
let timestamp = obj.get("timestamp")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.unwrap_or("")
|
|
||||||
.to_string();
|
|
||||||
|
|
||||||
let msg = obj.get("message").unwrap_or(&obj);
|
|
||||||
let content = msg.get("content");
|
|
||||||
|
|
||||||
let text = match content {
|
|
||||||
Some(serde_json::Value::String(s)) => s.clone(),
|
|
||||||
Some(serde_json::Value::Array(arr)) => {
|
|
||||||
arr.iter()
|
|
||||||
.filter_map(|c| {
|
|
||||||
// Only extract text blocks; skip tool_use, tool_result, thinking, etc.
|
|
||||||
let is_text = c.get("type").and_then(|v| v.as_str()) == Some("text");
|
|
||||||
if is_text {
|
|
||||||
c.get("text").and_then(|v| v.as_str()).map(|s| s.to_string())
|
|
||||||
} else {
|
|
||||||
c.as_str().map(|s| s.to_string())
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.join("\n")
|
|
||||||
}
|
|
||||||
_ => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let text = text.trim().to_string();
|
|
||||||
if text.is_empty() { continue; }
|
|
||||||
|
|
||||||
messages.push((i + 1, msg_type.to_string(), text, timestamp));
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(messages)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
|
pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
|
||||||
|
|
|
||||||
|
|
@ -7,11 +7,11 @@
|
||||||
|
|
||||||
use crate::config;
|
use crate::config;
|
||||||
use super::llm;
|
use super::llm;
|
||||||
|
use super::transcript;
|
||||||
use crate::store::{self, Provenance};
|
use crate::store::{self, Provenance};
|
||||||
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::fs;
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
const CHARS_PER_TOKEN: usize = 4;
|
const CHARS_PER_TOKEN: usize = 4;
|
||||||
|
|
@ -75,81 +75,27 @@ pub struct Fact {
|
||||||
pub source_offset: Option<usize>,
|
pub source_offset: Option<usize>,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct Message {
|
|
||||||
role: String,
|
|
||||||
text: String,
|
|
||||||
timestamp: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract user/assistant text messages from a JSONL transcript.
|
/// Extract user/assistant text messages from a JSONL transcript.
|
||||||
fn extract_conversation(path: &Path) -> Vec<Message> {
|
fn extract_messages(path: &Path) -> Vec<transcript::TranscriptMessage> {
|
||||||
let cfg = config::get();
|
transcript::parse_transcript(path)
|
||||||
let Ok(content) = fs::read_to_string(path) else { return Vec::new() };
|
.unwrap_or_default()
|
||||||
let mut messages = Vec::new();
|
.into_iter()
|
||||||
|
.filter(|m| m.text.len() >= 20)
|
||||||
for line in content.lines() {
|
.collect()
|
||||||
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
|
||||||
|
|
||||||
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
|
||||||
if msg_type != "user" && msg_type != "assistant" {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let timestamp = obj.get("timestamp")
|
|
||||||
.and_then(|v| v.as_str())
|
|
||||||
.unwrap_or("")
|
|
||||||
.to_string();
|
|
||||||
|
|
||||||
let msg = obj.get("message").unwrap_or(&obj);
|
|
||||||
let content = msg.get("content");
|
|
||||||
|
|
||||||
let text = match content {
|
|
||||||
Some(serde_json::Value::String(s)) => s.clone(),
|
|
||||||
Some(serde_json::Value::Array(arr)) => {
|
|
||||||
let texts: Vec<&str> = arr.iter()
|
|
||||||
.filter_map(|block| {
|
|
||||||
let obj = block.as_object()?;
|
|
||||||
if obj.get("type")?.as_str()? != "text" {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
let t = obj.get("text")?.as_str()?;
|
|
||||||
if t.contains("<system-reminder>") {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
Some(t)
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
texts.join("\n")
|
|
||||||
}
|
|
||||||
_ => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let text = text.trim().to_string();
|
|
||||||
if text.len() < 20 {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let role = if msg_type == "user" {
|
|
||||||
cfg.user_name.clone()
|
|
||||||
} else {
|
|
||||||
cfg.assistant_name.clone()
|
|
||||||
};
|
|
||||||
messages.push(Message { role, text, timestamp });
|
|
||||||
}
|
|
||||||
|
|
||||||
messages
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Format messages into a single text for chunking.
|
/// Format messages into a single text for chunking.
|
||||||
fn format_for_extraction(messages: &[Message]) -> String {
|
fn format_for_extraction(messages: &[transcript::TranscriptMessage]) -> String {
|
||||||
|
let cfg = config::get();
|
||||||
messages.iter()
|
messages.iter()
|
||||||
.map(|msg| {
|
.map(|msg| {
|
||||||
|
let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
|
||||||
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
|
let text = crate::util::truncate(&msg.text, 2800, "\n[...truncated...]");
|
||||||
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
|
let ts = if msg.timestamp.len() >= 19 { &msg.timestamp[..19] } else { "" };
|
||||||
if ts.is_empty() {
|
if ts.is_empty() {
|
||||||
format!("[{}] {}", msg.role, text)
|
format!("[{}] {}", role, text)
|
||||||
} else {
|
} else {
|
||||||
format!("[{} {}] {}", msg.role, ts, text)
|
format!("[{} {}] {}", role, ts, text)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
|
|
@ -224,7 +170,7 @@ pub fn mine_transcript(
|
||||||
|
|
||||||
log(&format!("Mining: {}", filename));
|
log(&format!("Mining: {}", filename));
|
||||||
|
|
||||||
let messages = extract_conversation(path);
|
let messages = extract_messages(path);
|
||||||
if messages.is_empty() {
|
if messages.is_empty() {
|
||||||
log("No messages found");
|
log("No messages found");
|
||||||
return Ok(Vec::new());
|
return Ok(Vec::new());
|
||||||
|
|
@ -322,7 +268,7 @@ pub fn mine_batch(paths: &[&Path], min_messages: usize, dry_run: bool) -> Result
|
||||||
let mut all_facts = Vec::new();
|
let mut all_facts = Vec::new();
|
||||||
|
|
||||||
for path in paths {
|
for path in paths {
|
||||||
let messages = extract_conversation(path);
|
let messages = extract_messages(path);
|
||||||
if messages.len() < min_messages {
|
if messages.len() < min_messages {
|
||||||
eprintln!("Skipping {} ({} messages < {})",
|
eprintln!("Skipping {} ({} messages < {})",
|
||||||
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
|
path.file_name().map(|n| n.to_string_lossy()).unwrap_or_default(),
|
||||||
|
|
|
||||||
|
|
@ -333,84 +333,41 @@ fn get_graph_topology(store: &Store, graph: &Graph) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Strip <system-reminder> blocks from text
|
/// Strip <system-reminder> blocks from text
|
||||||
fn strip_system_tags(text: &str) -> String {
|
|
||||||
let re = Regex::new(r"(?s)<system-reminder>.*?</system-reminder>").unwrap();
|
|
||||||
re.replace_all(text, "").trim().to_string()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract human-readable dialogue from a conversation JSONL
|
/// Extract human-readable dialogue from a conversation JSONL
|
||||||
fn extract_conversation_text(path: &Path, max_chars: usize) -> String {
|
fn extract_conversation_text(path: &Path, max_chars: usize) -> String {
|
||||||
let Ok(content) = fs::read_to_string(path) else { return String::new() };
|
let cfg = crate::config::get();
|
||||||
|
let messages = super::transcript::parse_transcript(path).unwrap_or_default();
|
||||||
let mut fragments = Vec::new();
|
let mut fragments = Vec::new();
|
||||||
let mut total = 0;
|
let mut total = 0;
|
||||||
|
|
||||||
for line in content.lines() {
|
for msg in &messages {
|
||||||
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
let min_len = if msg.role == "user" { 5 } else { 10 };
|
||||||
|
if msg.text.len() <= min_len { continue; }
|
||||||
|
|
||||||
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
// Only include external user messages
|
||||||
|
if msg.role == "user" {
|
||||||
if msg_type == "user" && obj.get("userType").and_then(|v| v.as_str()) == Some("external") {
|
if msg.user_type.as_deref() != Some("external") { continue; }
|
||||||
if let Some(text) = extract_text_content(&obj) {
|
if msg.text.starts_with("[Request interrupted") { continue; }
|
||||||
let text = strip_system_tags(&text);
|
|
||||||
if text.starts_with("[Request interrupted") { continue; }
|
|
||||||
if text.len() > 5 {
|
|
||||||
fragments.push(format!("**{}:** {}", crate::config::get().user_name, text));
|
|
||||||
total += text.len();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if msg_type == "assistant" {
|
|
||||||
if let Some(text) = extract_text_content(&obj) {
|
|
||||||
let text = strip_system_tags(&text);
|
|
||||||
if text.len() > 10 {
|
|
||||||
fragments.push(format!("**{}:** {}", crate::config::get().assistant_name, text));
|
|
||||||
total += text.len();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let role = if msg.role == "user" { &cfg.user_name } else { &cfg.assistant_name };
|
||||||
|
fragments.push(format!("**{}:** {}", role, msg.text));
|
||||||
|
total += msg.text.len();
|
||||||
if total > max_chars { break; }
|
if total > max_chars { break; }
|
||||||
}
|
}
|
||||||
fragments.join("\n\n")
|
fragments.join("\n\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
|
|
||||||
let msg = obj.get("message")?;
|
|
||||||
let content = msg.get("content")?;
|
|
||||||
if let Some(s) = content.as_str() {
|
|
||||||
return Some(s.to_string());
|
|
||||||
}
|
|
||||||
if let Some(arr) = content.as_array() {
|
|
||||||
let texts: Vec<&str> = arr.iter()
|
|
||||||
.filter_map(|b| {
|
|
||||||
if b.get("type")?.as_str()? == "text" {
|
|
||||||
b.get("text")?.as_str()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
if !texts.is_empty() {
|
|
||||||
return Some(texts.join("\n"));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Count short user messages (dialogue turns) in a JSONL
|
/// Count short user messages (dialogue turns) in a JSONL
|
||||||
fn count_dialogue_turns(path: &Path) -> usize {
|
fn count_dialogue_turns(path: &Path) -> usize {
|
||||||
let Ok(content) = fs::read_to_string(path) else { return 0 };
|
let messages = super::transcript::parse_transcript(path).unwrap_or_default();
|
||||||
content.lines()
|
messages.iter()
|
||||||
.filter_map(|line| serde_json::from_str::<serde_json::Value>(line).ok())
|
.filter(|m| m.role == "user"
|
||||||
.filter(|obj| {
|
&& m.user_type.as_deref() == Some("external")
|
||||||
obj.get("type").and_then(|v| v.as_str()) == Some("user")
|
&& m.text.len() > 5
|
||||||
&& obj.get("userType").and_then(|v| v.as_str()) == Some("external")
|
&& m.text.len() < 500
|
||||||
})
|
&& !m.text.starts_with("[Request interrupted")
|
||||||
.filter(|obj| {
|
&& !m.text.starts_with("Implement the following"))
|
||||||
let text = extract_text_content(obj).unwrap_or_default();
|
|
||||||
text.len() > 5 && text.len() < 500
|
|
||||||
&& !text.starts_with("[Request interrupted")
|
|
||||||
&& !text.starts_with("Implement the following")
|
|
||||||
})
|
|
||||||
.count()
|
.count()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,9 @@
|
||||||
// fact_mine — fact extraction from transcripts
|
// fact_mine — fact extraction from transcripts
|
||||||
// digest — episodic digest generation (daily/weekly/monthly)
|
// digest — episodic digest generation (daily/weekly/monthly)
|
||||||
// daemon — background job scheduler
|
// daemon — background job scheduler
|
||||||
|
// transcript — shared JSONL transcript parsing
|
||||||
|
|
||||||
|
pub mod transcript;
|
||||||
pub mod llm;
|
pub mod llm;
|
||||||
pub mod prompts;
|
pub mod prompts;
|
||||||
pub mod audit;
|
pub mod audit;
|
||||||
|
|
|
||||||
94
poc-memory/src/agents/transcript.rs
Normal file
94
poc-memory/src/agents/transcript.rs
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
// Shared JSONL transcript parsing
|
||||||
|
//
|
||||||
|
// Three agents (enrich, fact_mine, knowledge) all parse Claude Code JSONL
|
||||||
|
// transcripts. This module provides the shared core: parse each line, extract
|
||||||
|
// message type, text content from string-or-array blocks, timestamp, and
|
||||||
|
// user type. Callers filter and transform as needed.
|
||||||
|
|
||||||
|
use std::fs;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
/// A single message extracted from a JSONL transcript.
|
||||||
|
pub struct TranscriptMessage {
|
||||||
|
/// 1-based line number in the JSONL file.
|
||||||
|
pub line: usize,
|
||||||
|
/// Raw role: "user" or "assistant".
|
||||||
|
pub role: String,
|
||||||
|
/// Extracted text content (trimmed, blocks joined with newlines).
|
||||||
|
pub text: String,
|
||||||
|
/// ISO timestamp from the message, or empty string.
|
||||||
|
pub timestamp: String,
|
||||||
|
/// For user messages: "external", "internal", etc. None for assistant.
|
||||||
|
pub user_type: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a JSONL transcript into structured messages.
|
||||||
|
///
|
||||||
|
/// Extracts all user and assistant messages. Content blocks of type "text"
|
||||||
|
/// are joined; tool_use, tool_result, thinking blocks are skipped.
|
||||||
|
/// System-reminder blocks are filtered out.
|
||||||
|
pub fn parse_transcript(path: &Path) -> Result<Vec<TranscriptMessage>, String> {
|
||||||
|
let content = fs::read_to_string(path)
|
||||||
|
.map_err(|e| format!("read {}: {}", path.display(), e))?;
|
||||||
|
|
||||||
|
let mut messages = Vec::new();
|
||||||
|
for (i, line) in content.lines().enumerate() {
|
||||||
|
let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) else { continue };
|
||||||
|
|
||||||
|
let msg_type = obj.get("type").and_then(|v| v.as_str()).unwrap_or("");
|
||||||
|
if msg_type != "user" && msg_type != "assistant" { continue; }
|
||||||
|
|
||||||
|
let timestamp = obj.get("timestamp")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("")
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let user_type = obj.get("userType")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.map(|s| s.to_string());
|
||||||
|
|
||||||
|
let Some(text) = extract_text_content(&obj) else { continue };
|
||||||
|
let text = text.trim().to_string();
|
||||||
|
if text.is_empty() { continue; }
|
||||||
|
|
||||||
|
messages.push(TranscriptMessage {
|
||||||
|
line: i + 1,
|
||||||
|
role: msg_type.to_string(),
|
||||||
|
text,
|
||||||
|
timestamp,
|
||||||
|
user_type,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(messages)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Extract text content from a JSONL message object.
|
||||||
|
///
|
||||||
|
/// Handles both string content and array-of-blocks content (filtering to
|
||||||
|
/// type="text" blocks only). Strips `<system-reminder>` tags.
|
||||||
|
fn extract_text_content(obj: &serde_json::Value) -> Option<String> {
|
||||||
|
let msg = obj.get("message").unwrap_or(obj);
|
||||||
|
let content = msg.get("content")?;
|
||||||
|
|
||||||
|
let text = match content {
|
||||||
|
serde_json::Value::String(s) => s.clone(),
|
||||||
|
serde_json::Value::Array(arr) => {
|
||||||
|
let texts: Vec<&str> = arr.iter()
|
||||||
|
.filter_map(|block| {
|
||||||
|
let block_type = block.get("type").and_then(|v| v.as_str())?;
|
||||||
|
if block_type != "text" { return None; }
|
||||||
|
let t = block.get("text").and_then(|v| v.as_str())?;
|
||||||
|
// Skip system-reminder blocks entirely
|
||||||
|
if t.contains("<system-reminder>") { return None; }
|
||||||
|
Some(t)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
if texts.is_empty() { return None; }
|
||||||
|
texts.join("\n")
|
||||||
|
}
|
||||||
|
_ => return None,
|
||||||
|
};
|
||||||
|
|
||||||
|
Some(text)
|
||||||
|
}
|
||||||
|
|
@ -5,6 +5,28 @@ use crate::store::{Store, new_relation};
|
||||||
use crate::graph::Graph;
|
use crate::graph::Graph;
|
||||||
use crate::similarity;
|
use crate::similarity;
|
||||||
|
|
||||||
|
/// Collect (key, content) pairs for all section children of a file-level node.
|
||||||
|
fn section_children<'a>(store: &'a Store, file_key: &str) -> Vec<(&'a str, &'a str)> {
|
||||||
|
let prefix = format!("{}#", file_key);
|
||||||
|
store.nodes.iter()
|
||||||
|
.filter(|(k, _)| k.starts_with(&prefix))
|
||||||
|
.map(|(k, n)| (k.as_str(), n.content.as_str()))
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Find the best matching candidate by cosine similarity against content.
|
||||||
|
/// Returns (key, similarity) if any candidate exceeds threshold.
|
||||||
|
fn best_match(candidates: &[(&str, &str)], content: &str, threshold: f32) -> Option<(String, f32)> {
|
||||||
|
let (best_key, best_sim) = candidates.iter()
|
||||||
|
.map(|(key, text)| (*key, similarity::cosine_similarity(content, text)))
|
||||||
|
.max_by(|a, b| a.1.total_cmp(&b.1))?;
|
||||||
|
if best_sim > threshold {
|
||||||
|
Some((best_key.to_string(), best_sim))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Refine a link target: if the target is a file-level node with section
|
/// Refine a link target: if the target is a file-level node with section
|
||||||
/// children, find the best-matching section by cosine similarity against
|
/// children, find the best-matching section by cosine similarity against
|
||||||
/// the source content. Returns the original key if no sections exist or
|
/// the source content. Returns the original key if no sections exist or
|
||||||
|
|
@ -16,31 +38,13 @@ pub fn refine_target(store: &Store, source_content: &str, target_key: &str) -> S
|
||||||
// Only refine file-level nodes (no # in key)
|
// Only refine file-level nodes (no # in key)
|
||||||
if target_key.contains('#') { return target_key.to_string(); }
|
if target_key.contains('#') { return target_key.to_string(); }
|
||||||
|
|
||||||
let prefix = format!("{}#", target_key);
|
let sections = section_children(store, target_key);
|
||||||
let sections: Vec<(&str, &str)> = store.nodes.iter()
|
|
||||||
.filter(|(k, _)| k.starts_with(&prefix))
|
|
||||||
.map(|(k, n)| (k.as_str(), n.content.as_str()))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if sections.is_empty() { return target_key.to_string(); }
|
if sections.is_empty() { return target_key.to_string(); }
|
||||||
|
|
||||||
let mut best_section = "";
|
best_match(§ions, source_content, 0.05)
|
||||||
let mut best_sim = 0.0f32;
|
.map(|(key, _)| key)
|
||||||
|
.unwrap_or_else(|| target_key.to_string())
|
||||||
for (section_key, section_content) in §ions {
|
|
||||||
let sim = similarity::cosine_similarity(source_content, section_content);
|
|
||||||
if sim > best_sim {
|
|
||||||
best_sim = sim;
|
|
||||||
best_section = section_key;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Threshold: only refine if there's a meaningful match
|
|
||||||
if best_sim > 0.05 && !best_section.is_empty() {
|
|
||||||
best_section.to_string()
|
|
||||||
} else {
|
|
||||||
target_key.to_string()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A proposed link move: from hub→neighbor to section→neighbor
|
/// A proposed link move: from hub→neighbor to section→neighbor
|
||||||
|
|
@ -70,16 +74,12 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
|
||||||
// Only works on file-level nodes that have section children
|
// Only works on file-level nodes that have section children
|
||||||
if hub_key.contains('#') { return None; }
|
if hub_key.contains('#') { return None; }
|
||||||
|
|
||||||
let prefix = format!("{}#", hub_key);
|
let sections = section_children(store, hub_key);
|
||||||
let sections: Vec<(&str, &str)> = store.nodes.iter()
|
|
||||||
.filter(|(k, _)| k.starts_with(&prefix))
|
|
||||||
.map(|(k, n)| (k.as_str(), n.content.as_str()))
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if sections.is_empty() { return None; }
|
if sections.is_empty() { return None; }
|
||||||
|
|
||||||
// Get all neighbors of the hub
|
// Get all neighbors of the hub
|
||||||
let neighbors = graph.neighbors(hub_key);
|
let neighbors = graph.neighbors(hub_key);
|
||||||
|
let prefix = format!("{}#", hub_key);
|
||||||
|
|
||||||
let mut moves = Vec::new();
|
let mut moves = Vec::new();
|
||||||
|
|
||||||
|
|
@ -93,19 +93,7 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Find best-matching section by content similarity
|
// Find best-matching section by content similarity
|
||||||
let mut best_section = "";
|
if let Some((best_section, best_sim)) = best_match(§ions, neighbor_content, 0.05) {
|
||||||
let mut best_sim = 0.0f32;
|
|
||||||
|
|
||||||
for (section_key, section_content) in §ions {
|
|
||||||
let sim = similarity::cosine_similarity(neighbor_content, section_content);
|
|
||||||
if sim > best_sim {
|
|
||||||
best_sim = sim;
|
|
||||||
best_section = section_key;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only propose move if there's a reasonable match
|
|
||||||
if best_sim > 0.05 && !best_section.is_empty() {
|
|
||||||
let snippet = crate::util::first_n_chars(
|
let snippet = crate::util::first_n_chars(
|
||||||
neighbor_content.lines()
|
neighbor_content.lines()
|
||||||
.find(|l| !l.is_empty() && !l.starts_with("<!--") && !l.starts_with("##"))
|
.find(|l| !l.is_empty() && !l.starts_with("<!--") && !l.starts_with("##"))
|
||||||
|
|
@ -115,7 +103,7 @@ pub fn differentiate_hub_with_graph(store: &Store, hub_key: &str, graph: &Graph)
|
||||||
moves.push(LinkMove {
|
moves.push(LinkMove {
|
||||||
neighbor_key: neighbor_key.to_string(),
|
neighbor_key: neighbor_key.to_string(),
|
||||||
from_hub: hub_key.to_string(),
|
from_hub: hub_key.to_string(),
|
||||||
to_section: best_section.to_string(),
|
to_section: best_section,
|
||||||
similarity: best_sim,
|
similarity: best_sim,
|
||||||
neighbor_snippet: snippet,
|
neighbor_snippet: snippet,
|
||||||
});
|
});
|
||||||
|
|
@ -188,11 +176,7 @@ pub fn find_differentiable_hubs(store: &Store) -> Vec<(String, usize, usize)> {
|
||||||
if deg < threshold { continue; }
|
if deg < threshold { continue; }
|
||||||
if key.contains('#') { continue; }
|
if key.contains('#') { continue; }
|
||||||
|
|
||||||
let prefix = format!("{}#", key);
|
let section_count = section_children(store, key).len();
|
||||||
let section_count = store.nodes.keys()
|
|
||||||
.filter(|k| k.starts_with(&prefix))
|
|
||||||
.count();
|
|
||||||
|
|
||||||
if section_count > 0 {
|
if section_count > 0 {
|
||||||
hubs.push((key.clone(), deg, section_count));
|
hubs.push((key.clone(), deg, section_count));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue