Custom Deserialize for NodeLeaf: recompute tokens on deserialization

token_ids are not serialized (`#[serde(skip)]`), so deserialized nodes
came back with 0 tokens. The custom Deserialize impl recomputes the
tokens from the rendered body text (prompt-visible bodies only; other
bodies keep an empty token list), restoring the invariant at the
reconstruction boundary. No separate recompute step is needed.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-08 21:14:54 -04:00
parent a09567849f
commit 5c9590ada7

View file

@ -63,7 +63,8 @@ pub enum NodeBody {
} }
/// A leaf node: typed content with cached token IDs. /// A leaf node: typed content with cached token IDs.
#[derive(Debug, Clone, Serialize, Deserialize)] /// Token IDs are not serialized — they're recomputed on deserialization.
#[derive(Debug, Clone, Serialize)]
pub struct NodeLeaf { pub struct NodeLeaf {
body: NodeBody, body: NodeBody,
#[serde(skip)] #[serde(skip)]
@ -71,6 +72,23 @@ pub struct NodeLeaf {
timestamp: Option<DateTime<Utc>>, timestamp: Option<DateTime<Utc>>,
} }
/// Hand-written `Deserialize` for [`NodeLeaf`].
///
/// Only `body` and `timestamp` are read from the input; `token_ids` is
/// rebuilt here from the body so a freshly deserialized leaf already
/// carries its tokens.
impl<'de> Deserialize<'de> for NodeLeaf {
    /// Reads `body` and `timestamp` from `deserializer`, then derives
    /// `token_ids`: the encoding of the rendered body when the body is
    /// prompt-visible, otherwise an empty list.
    ///
    /// # Errors
    ///
    /// Propagates any error produced while deserializing the fields.
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        // Mirror of the fields that are read from the input; the derived
        // impl on this helper does the actual field parsing.
        #[derive(Deserialize)]
        struct Raw {
            body: NodeBody,
            timestamp: Option<DateTime<Utc>>,
        }

        let Raw { body, timestamp } = Raw::deserialize(deserializer)?;

        // The closure keeps render + encode lazy: bodies that are not
        // prompt-visible are never rendered and end up with no tokens.
        let token_ids = body
            .is_prompt_visible()
            .then(|| tokenizer::encode(&body.render()))
            .unwrap_or_default();

        Ok(NodeLeaf { body, token_ids, timestamp })
    }
}
/// A node in the context AST. /// A node in the context AST.
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AstNode { pub enum AstNode {