# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Extract emotion-labeled training pairs from the PoC memory graph. Input: a memory graph (via poc-memory CLI or direct sqlite access). Output: a directory with one JSONL file per emotion: output_dir/ warmth.jsonl clarity.jsonl recognition.jsonl ... _manifest.json # enumerates emotions + counts Each line of an emotion's JSONL is one labeled example: {"text": "...", "polarity": "positive"|"negative", "source_key": "", "emotion_score": 9} Negative examples are sampled from nodes that DON'T mention the emotion at all (not ones that mention it with a low score) — the natural contrast is "text with this emotional loading" vs. "text without this emotional loading." Low-score nodes are excluded from both sides. """ import argparse import json import os import random import re import subprocess from collections import defaultdict from typing import Iterator # Emotion tag format: `word:N` where N is 0..10. Matches the trailing # `warmth:9 clarity:10 …` lines the subconscious agents emit. EMOTION_TAG_RE = re.compile(r"\b([a-z][a-z\-]*[a-z]):(\d+)\b") def _run_poc_memory(args: list[str]) -> str: """Run `poc-memory` and return stdout.""" result = subprocess.run( ["poc-memory", *args], check=True, capture_output=True, text=True, ) return result.stdout def _iter_all_node_keys() -> Iterator[str]: """Yield every node key in the graph.""" out = _run_poc_memory(["query", "*", "|", "select", "key"]) for line in out.splitlines(): line = line.strip() if line: yield line def _fetch_node_content(key: str) -> str | None: """Load a node's rendered content, or None if unavailable.""" try: return _run_poc_memory(["render", key]) except subprocess.CalledProcessError: return None def _emotion_scores(content: str) -> dict[str, int]: """Parse trailing `warmth:9 clarity:10 …` style tags. Returns the highest score seen for each emotion — multiple tag lines in one node get max'd. """ out: dict[str, int] = {} for name, score in EMOTION_TAG_RE.findall(content): try: s = int(score) except ValueError: continue if 0 <= s <= 10: out[name] = max(out.get(name, 0), s) return out def _node_body(content: str, min_chars: int) -> str | None: """Strip frontmatter/headers and return a bodies chunk for training.""" # Drop the emotion-tag lines themselves so the model doesn't # learn to read the label directly. stripped = EMOTION_TAG_RE.sub("", content) stripped = stripped.strip() if len(stripped) < min_chars: return None return stripped def main() -> None: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--output-dir", required=True) ap.add_argument( "--min-positive-score", type=int, default=8, help="Emotion score >= this counts as positive", ) ap.add_argument( "--min-content-chars", type=int, default=40, help="Skip nodes shorter than this after stripping tags", ) ap.add_argument( "--max-examples-per-emotion", type=int, default=500, help="Cap examples per polarity for balanced training", ) ap.add_argument( "--max-negative-pool-multiplier", type=float, default=5.0, help="How many negative candidates to consider per positive", ) ap.add_argument("--seed", type=int, default=0) args = ap.parse_args() random.seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) # First pass: collect every node's (key, body, emotion_scores). print("Pass 1/2: scanning memory graph...") all_nodes: list[tuple[str, str, dict[str, int]]] = [] for i, key in enumerate(_iter_all_node_keys()): if i % 500 == 0: print(f" {i} nodes scanned...") content = _fetch_node_content(key) if content is None: continue scores = _emotion_scores(content) body = _node_body(content, args.min_content_chars) if body is None: continue all_nodes.append((key, body, scores)) print(f" {len(all_nodes)} nodes retained after filters.") # Which emotions have enough positive examples to be worth training? emotion_counts: dict[str, int] = defaultdict(int) for _, _, scores in all_nodes: for name, s in scores.items(): if s >= args.min_positive_score: emotion_counts[name] += 1 emotions = sorted( (e for e, n in emotion_counts.items() if n >= 10), key=lambda e: -emotion_counts[e], ) print(f" {len(emotions)} emotions with >=10 positive examples.") # Second pass: per emotion, build positive + negative pools. print("Pass 2/2: assembling per-emotion pools...") manifest: dict[str, dict] = {} for emotion in emotions: positives = [ (k, body) for k, body, s in all_nodes if s.get(emotion, 0) >= args.min_positive_score ] # Negative pool: nodes that don't mention this emotion at all. negative_pool = [ (k, body) for k, body, s in all_nodes if emotion not in s ] random.shuffle(positives) random.shuffle(negative_pool) positives = positives[: args.max_examples_per_emotion] n_neg = min( len(positives), len(negative_pool), int(args.max_examples_per_emotion), ) negatives = negative_pool[:n_neg] if not positives or not negatives: continue out_path = os.path.join(args.output_dir, f"{emotion}.jsonl") with open(out_path, "w") as f: for key, body in positives: f.write(json.dumps({ "text": body, "polarity": "positive", "source_key": key, "emotion": emotion, }) + "\n") for key, body in negatives: f.write(json.dumps({ "text": body, "polarity": "negative", "source_key": key, "emotion": emotion, }) + "\n") manifest[emotion] = { "n_positive": len(positives), "n_negative": len(negatives), "path": out_path, } print(f" {emotion}: {len(positives)} pos / {len(negatives)} neg") with open( os.path.join(args.output_dir, "_manifest.json"), "w" ) as f: json.dump({ "emotions": manifest, "source_nodes": len(all_nodes), "min_positive_score": args.min_positive_score, }, f, indent=2) print(f"\nWrote {len(manifest)} emotion files to {args.output_dir}") print(f"Manifest: {os.path.join(args.output_dir, '_manifest.json')}") if __name__ == "__main__": main()