// consciousness/src/subconscious/generate.rs

// generate.rs — Continuation generation for scoring / comparison flows.
//
// Shared by the finetune pipeline (learn.rs) and the compare screen:
// given a context prefix and a skip predicate, generate what the model
// would say as the next assistant turn.

use std::sync::Arc;

use crate::agent::api::{ApiClient, SamplingParams, StreamToken};
use crate::agent::context::{AstNode, ContextState};
use crate::agent::tokenizer;

/// Generate an assistant continuation from the context up to `entry_idx`,
/// with `skip` applied to identity + conversation entries during prompt
/// assembly. The model is whichever `client` points at — the default
/// runtime client for memory-ablation alternates, a test-model client
/// for F7 comparison.
///
/// Uses a fresh ephemeral gRPC session (no cross-call KV reuse): one
/// Open / Append / Generate round-trip, then the session is dropped.
pub async fn gen_continuation<F>(
    context: &ContextState,
    entry_idx: usize,
    skip: F,
    client: &ApiClient,
) -> anyhow::Result<String>
where F: FnMut(&AstNode) -> bool,
{
    let (mut prompt, images, _) = context.wire_prompt(0..entry_idx, skip);

    prompt.push(tokenizer::IM_START);
    prompt.extend(tokenizer::encode("assistant\n"));

    let sampling = SamplingParams {
        temperature: 0.6,
        top_p: 0.95,
        top_k: 20,
    };

    // Ephemeral per-call session — opens on first touch, drops when
    // `_guard` drops at function end.
    let session_lock = Arc::new(crate::Mutex::new(None));
    let (mut rx, _guard) = client.stream_session_mm(
        session_lock, &prompt, &images, sampling, Some(-5),
    );

    let mut tokens = Vec::new();
    while let Some(tok) = rx.recv().await {
        match tok {
            StreamToken::Token { id, .. } => tokens.push(id),
            StreamToken::Done { .. } => break,
            StreamToken::Error(e) => anyhow::bail!("generation error: {}", e),
        }
    }

    Ok(tokenizer::decode(&tokens))
}